amaye15 commited on
Commit
03c0888
·
1 Parent(s): 1214696
This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .env.txt +4 -0
  2. .gitattributes +12 -35
  3. .gitignore +232 -0
  4. CHANGELOG.md +1089 -0
  5. CODE_OF_CONDUCT.md +131 -0
  6. CONTRIBUTORS.md +42 -0
  7. Dockerfile +136 -0
  8. LICENSE +51 -0
  9. MANIFEST.in +2 -0
  10. MISSION.md +46 -0
  11. README.md +560 -1
  12. ROADMAP.md +503 -0
  13. crawl4ai/__init__.py +46 -0
  14. crawl4ai/__version__.py +2 -0
  15. crawl4ai/async_configs.py +603 -0
  16. crawl4ai/async_crawler_strategy.py +2191 -0
  17. crawl4ai/async_database.py +495 -0
  18. crawl4ai/async_logger.py +231 -0
  19. crawl4ai/async_webcrawler.py +833 -0
  20. crawl4ai/cache_context.py +115 -0
  21. crawl4ai/chunking_strategy.py +231 -0
  22. crawl4ai/cli.py +105 -0
  23. crawl4ai/config.py +64 -0
  24. crawl4ai/content_filter_strategy.py +627 -0
  25. crawl4ai/content_scraping_strategy.py +723 -0
  26. crawl4ai/crawler_strategy.py +360 -0
  27. crawl4ai/database.py +135 -0
  28. crawl4ai/docs_manager.py +67 -0
  29. crawl4ai/extraction_strategy.bak.py +1440 -0
  30. crawl4ai/extraction_strategy.py +1052 -0
  31. crawl4ai/html2text/__init__.py +1141 -0
  32. crawl4ai/html2text/__main__.py +3 -0
  33. crawl4ai/html2text/_typing.py +2 -0
  34. crawl4ai/html2text/cli.py +330 -0
  35. crawl4ai/html2text/config.py +172 -0
  36. crawl4ai/html2text/elements.py +18 -0
  37. crawl4ai/html2text/utils.py +303 -0
  38. crawl4ai/install.py +83 -0
  39. crawl4ai/js_snippet/__init__.py +15 -0
  40. crawl4ai/js_snippet/navigator_overrider.js +25 -0
  41. crawl4ai/js_snippet/remove_overlay_elements.js +119 -0
  42. crawl4ai/js_snippet/update_image_dimensions.js +54 -0
  43. crawl4ai/llmtxt.py +498 -0
  44. crawl4ai/markdown_generation_strategy.py +225 -0
  45. crawl4ai/migrations.py +168 -0
  46. crawl4ai/model_loader.py +256 -0
  47. crawl4ai/models.py +61 -0
  48. crawl4ai/prompts.py +204 -0
  49. crawl4ai/ssl_certificate.py +181 -0
  50. crawl4ai/user_agent_generator.py +305 -0
.env.txt ADDED
@@ -0,0 +1,4 @@
+ GROQ_API_KEY = "YOUR_GROQ_API"
+ OPENAI_API_KEY = "YOUR_OPENAI_API"
+ ANTHROPIC_API_KEY = "YOUR_ANTHROPIC_API"
+ # You can add more API keys here
.gitattributes CHANGED
@@ -1,35 +1,12 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Documentation
+ *.html linguist-documentation
+ docs/* linguist-documentation
+ docs/examples/* linguist-documentation
+ docs/md_v2/* linguist-documentation
+
+ # Explicitly mark Python as the main language
+ *.py linguist-detectable=true
+ *.py linguist-language=Python
+
+ # Exclude HTML from language statistics
+ *.html linguist-detectable=false
.gitignore ADDED
@@ -0,0 +1,232 @@
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
163
+
164
+ Crawl4AI.egg-info/
165
+ Crawl4AI.egg-info/*
166
+ crawler_data.db
167
+ .vscode/
168
+ .tests/
169
+ .test_pads/
170
+ test_pad.py
171
+ test_pad*.py
172
+ .data/
173
+ Crawl4AI.egg-info/
174
+
175
+ requirements0.txt
176
+ a.txt
177
+
178
+ *.sh
179
+ .idea
180
+ docs/examples/.chainlit/
181
+ docs/examples/.chainlit/*
182
+ .chainlit/config.toml
183
+ .chainlit/translations/en-US.json
184
+
185
+ local/
186
+ .files/
187
+
188
+ a.txt
189
+ .lambda_function.py
190
+ ec2*
191
+
192
+ update_changelog.sh
193
+
194
+ .DS_Store
195
+ docs/.DS_Store
196
+ tmp/
197
+ test_env/
198
+ **/.DS_Store
199
+ **/.DS_Store
200
+
201
+ todo.md
202
+ todo_executor.md
203
+ git_changes.py
204
+ git_changes.md
205
+ pypi_build.sh
206
+ git_issues.py
207
+ git_issues.md
208
+
209
+ .next/
210
+ .tests/
211
+ # .issues/
212
+ .docs/
213
+ .issues/
214
+ .gitboss/
215
+ todo_executor.md
216
+ protect-all-except-feature.sh
217
+ manage-collab.sh
218
+ publish.sh
219
+ combine.sh
220
+ combined_output.txt
221
+ .local
222
+ .scripts
223
+ tree.md
224
+ tree.md
225
+ .scripts
226
+ .local
227
+ .do
228
+ /plans
229
+ plans/
230
+
231
+ # Codeium
232
+ .codeiumignore
CHANGELOG.md ADDED
@@ -0,0 +1,1089 @@
+ # Changelog
2
+
3
+ All notable changes to Crawl4AI will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ---
9
+
10
+ ## [0.4.267] - 2025-01-06
11
+
12
+ ### Added
13
+ - **Windows Event Loop Configuration**: Introduced a utility function `configure_windows_event_loop` to resolve `NotImplementedError` for asyncio subprocesses on Windows. ([#utils.py](crawl4ai/utils.py), [#tutorials/async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md)) A minimal usage sketch follows this list.
14
+ - **`page_need_scroll` Method**: Added a method to determine if a page requires scrolling before taking actions in `AsyncPlaywrightCrawlerStrategy`. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py))
15
+
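A minimal, hedged sketch of the Windows event loop setup referenced in the list above. The import path `crawl4ai.utils` follows the file reference in this entry; treat it and the call pattern as assumptions rather than confirmed API.

```python
# Hedged sketch: configure_windows_event_loop is placed in crawl4ai/utils.py per
# this changelog entry; the exact import path and call site are assumptions.
import asyncio
import sys

from crawl4ai import AsyncWebCrawler
from crawl4ai.utils import configure_windows_event_loop

if sys.platform == "win32":
    # Select an event loop policy that supports asyncio subprocesses on Windows.
    configure_windows_event_loop()

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown[:200])

asyncio.run(main())
```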
16
+ ### Changed
17
+ - **Version Bump**: Updated the version from `0.4.246` to `0.4.247`. ([#__version__.py](crawl4ai/__version__.py))
18
+ - **Improved Scrolling Logic**: Enhanced scrolling methods in `AsyncPlaywrightCrawlerStrategy` by adding a `scroll_delay` parameter for better control. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py))
19
+ - **Markdown Generation Example**: Updated the `hello_world.py` example to reflect the latest API changes and better illustrate features. ([#examples/hello_world.py](docs/examples/hello_world.py))
20
+ - **Documentation Update**:
21
+ - Added Windows-specific instructions for handling asyncio event loops. ([#async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md))
22
+
23
+ ### Removed
24
+ - **Legacy Markdown Generation Code**: Removed outdated and unused code for markdown generation in `content_scraping_strategy.py`. ([#content_scraping_strategy.py](crawl4ai/content_scraping_strategy.py))
25
+
26
+ ### Fixed
27
+ - **Page Closing to Prevent Memory Leaks**:
28
+ - **Description**: Added a `finally` block to ensure pages are closed when no `session_id` is provided.
29
+ - **Impact**: Prevents memory leaks caused by lingering pages after a crawl.
30
+ - **File**: [`async_crawler_strategy.py`](crawl4ai/async_crawler_strategy.py)
31
+ - **Code**:
32
+ ```python
33
+ finally:
34
+ # If no session_id is given we should close the page
35
+ if not config.session_id:
36
+ await page.close()
37
+ ```
38
+ - **Multiple Element Selection**: Modified `_get_elements` in `JsonCssExtractionStrategy` to return all matching elements instead of just the first one, ensuring comprehensive extraction. ([#extraction_strategy.py](crawl4ai/extraction_strategy.py))
39
+ - **Error Handling in Scrolling**: Added robust error handling to ensure scrolling proceeds safely even if a configuration is missing. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py))
40
+
41
+ ### Other
42
+ - **Git Ignore Update**: Added `/plans` to `.gitignore` for better development environment consistency. ([#.gitignore](.gitignore))
43
+
44
+
45
+ ## [0.4.24] - 2024-12-31
46
+
47
+ ### Added
48
+ - **Browser and SSL Handling**
49
+ - SSL certificate validation options in extraction strategies
50
+ - Custom certificate paths support
51
+ - Configurable certificate validation skipping
52
+ - Enhanced response status code handling with retry logic
53
+
54
+ - **Content Processing**
55
+ - New content filtering system with regex support
56
+ - Advanced chunking strategies for large content
57
+ - Memory-efficient parallel processing
58
+ - Configurable chunk size optimization
59
+
60
+ - **JSON Extraction**
61
+ - Complex JSONPath expression support
62
+ - JSON-CSS and Microdata extraction
63
+ - RDFa parsing capabilities
64
+ - Advanced data transformation pipeline
65
+
66
+ - **Field Types**
67
+ - New field types: `computed`, `conditional`, `aggregate`, `template`
68
+ - Field inheritance system
69
+ - Reusable field definitions
70
+ - Custom validation rules
71
+
72
+ ### Changed
73
+ - **Performance**
74
+ - Optimized selector compilation with caching
75
+ - Improved HTML parsing efficiency
76
+ - Enhanced memory management for large documents
77
+ - Batch processing optimizations
78
+
79
+ - **Error Handling**
80
+ - More detailed error messages and categorization
81
+ - Enhanced debugging capabilities
82
+ - Improved performance metrics tracking
83
+ - Better error recovery mechanisms
84
+
85
+ ### Deprecated
86
+ - Old field computation method using `eval`
87
+ - Direct browser manipulation without proper SSL handling
88
+ - Simple text-based content filtering
89
+
90
+ ### Removed
91
+ - Legacy extraction patterns without proper error handling
92
+ - Unsafe eval-based field computation
93
+ - Direct DOM manipulation without sanitization
94
+
95
+ ### Fixed
96
+ - Memory leaks in large document processing
97
+ - SSL certificate validation issues
98
+ - Incorrect handling of nested JSON structures
99
+ - Performance bottlenecks in parallel processing
100
+
101
+ ### Security
102
+ - Improved input validation and sanitization
103
+ - Safe expression evaluation system
104
+ - Enhanced resource protection
105
+ - Rate limiting implementation
106
+
107
+ ## [0.4.1] - 2024-12-08
108
+
109
+ ### **File: `crawl4ai/async_crawler_strategy.py`**
110
+
111
+ #### **New Parameters and Attributes Added**
112
+ - **`text_mode` (boolean)**: Enables text-only mode and disables images, JavaScript, and GPU-related features for faster, minimal rendering.
+ - **`light_mode` (boolean)**: Optimizes the browser by disabling unnecessary background processes and features for efficiency.
+ - **`viewport_width` and `viewport_height`**: Dynamically adjusted based on `text_mode` (defaults: 800x600 when `text_mode` is enabled, 1920x1080 otherwise).
+ - **`extra_args`**: Adds browser-specific flags for `text_mode`.
116
+ - **`adjust_viewport_to_content`**: Dynamically adjusts the viewport to the content size for accurate rendering.
117
+
118
+ #### **Browser Context Adjustments**
119
+ - Added **`viewport` adjustments**: Dynamically computed based on `text_mode` or custom configuration.
120
+ - Enhanced support for `light_mode` and `text_mode` by adding specific browser arguments to reduce resource consumption.
121
+
122
+ #### **Dynamic Content Handling**
123
+ - **Full Page Scan Feature**:
124
+ - Scrolls through the entire page while dynamically detecting content changes.
125
+ - Ensures scrolling stops when no new dynamic content is loaded.
126
+
127
+ #### **Session Management**
128
+ - Added **`create_session`** method:
129
+ - Creates a new browser session and assigns a unique ID.
130
+ - Supports persistent and non-persistent contexts with full compatibility for cookies, headers, and proxies.
131
+
132
+ #### **Improved Content Loading and Adjustment**
133
+ - **`adjust_viewport_to_content`**:
134
+ - Automatically adjusts viewport to match content dimensions.
135
+ - Includes scaling via Chrome DevTools Protocol (CDP).
136
+ - Enhanced content loading:
137
+ - Waits for images to load and ensures network activity is idle before proceeding.
138
+
139
+ #### **Error Handling and Logging**
140
+ - Improved error handling and detailed logging for:
141
+ - Viewport adjustment (`adjust_viewport_to_content`).
142
+ - Full page scanning (`scan_full_page`).
143
+ - Dynamic content loading.
144
+
145
+ #### **Refactoring and Cleanup**
146
+ - Removed hardcoded viewport dimensions in multiple places, replaced with dynamic values (`self.viewport_width`, `self.viewport_height`).
147
+ - Removed commented-out and unused code for better readability.
148
+ - Added default value for `delay_before_return_html` parameter.
149
+
150
+ #### **Optimizations**
151
+ - Reduced resource usage in `light_mode` by disabling unnecessary browser features such as extensions, background timers, and sync.
152
+ - Improved compatibility for different browser types (`chrome`, `firefox`, `webkit`).
153
+
154
+ ---
155
+
156
+ ### **File: `docs/examples/quickstart_async.py`**
157
+
158
+ #### **Schema Adjustment**
159
+ - Changed schema reference for `LLMExtractionStrategy`:
160
+ - **Old**: `OpenAIModelFee.schema()`
161
+ - **New**: `OpenAIModelFee.model_json_schema()`
162
+ - This aligns with Pydantic v2, where `.schema()` is deprecated in favor of `.model_json_schema()`, keeping schema generation for the `OpenAIModelFee` class compatible.
163
+
164
+ #### **Documentation Comments Updated**
165
+ - Improved extraction instruction for schema-based LLM strategies.
166
+
167
+ ---
168
+
169
+ ### **New Features Added**
170
+ 1. **Text-Only Mode**:
171
+ - Focuses on minimal resource usage by disabling non-essential browser features.
172
+ 2. **Light Mode**:
173
+ - Optimizes browser for performance by disabling background tasks and unnecessary services.
174
+ 3. **Full Page Scanning**:
175
+ - Ensures the entire content of a page is crawled, including dynamic elements loaded during scrolling.
176
+ 4. **Dynamic Viewport Adjustment**:
177
+ - Automatically resizes the viewport to match content dimensions, improving compatibility and rendering accuracy.
178
+ 5. **Session Management**:
179
+ - Simplifies session handling with better support for persistent and non-persistent contexts.
180
+
181
+ ---
182
+
183
+ ### **Bug Fixes**
184
+ - Fixed potential viewport mismatches by ensuring consistent use of `self.viewport_width` and `self.viewport_height` throughout the code.
185
+ - Improved robustness of dynamic content loading to avoid timeouts and failed evaluations.
186
+
187
+
188
+
189
+
190
+
191
+
192
+
193
+ ## [0.3.75] December 1, 2024
194
+
195
+ ### PruningContentFilter
196
+
197
+ #### 1. Introduced PruningContentFilter (Dec 01, 2024)
+ A new content filtering strategy that removes less relevant nodes based on metrics like text and link density. A usage sketch follows the affected-files list below.
199
+
200
+ **Affected Files:**
201
+ - `crawl4ai/content_filter_strategy.py`: Enhancement of content filtering capabilities.
202
+ ```diff
203
+ Implemented effective pruning algorithm with comprehensive scoring.
204
+ ```
205
+ - `README.md`: Improved documentation regarding new features.
206
+ ```diff
207
+ Updated to include usage and explanation for the PruningContentFilter.
208
+ ```
209
+ - `docs/md_v2/basic/content_filtering.md`: Expanded documentation for users.
210
+ ```diff
211
+ Added detailed section explaining the PruningContentFilter.
212
+ ```
213
+
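A hedged usage sketch for orientation, wired the same way as the `BM25ContentFilter` example that appears later in this changelog; the import location and the `extraction_strategy`/`fit_markdown` plumbing are assumptions carried over from that example, not details confirmed by this diff.

```python
# Hedged sketch modeled on the BM25ContentFilter example further down in this
# changelog; PruningContentFilter is assumed to live in content_filter_strategy.
import asyncio

from crawl4ai import AsyncWebCrawler
from crawl4ai.content_filter_strategy import PruningContentFilter

async def prune_example():
    content_filter = PruningContentFilter()  # default thresholds; tune as needed
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            extraction_strategy=content_filter,  # same wiring as the BM25 example
            fit_markdown=True,
        )
        print(result.fit_markdown)  # markdown built from the retained, higher-density nodes

asyncio.run(prune_example())
```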
214
+ #### 2. Added Unit Tests for PruningContentFilter (Dec 01, 2024)
+ Comprehensive tests added to ensure correct functionality of PruningContentFilter.
216
+
217
+ **Affected Files:**
218
+ - `tests/async/test_content_filter_prune.py`: Increased test coverage for content filtering strategies.
219
+ ```diff
220
+ Created test cases for various scenarios using the PruningContentFilter.
221
+ ```
222
+
223
+ ### Development Updates
224
+
225
+ #### 3. Enhanced BM25ContentFilter tests (Dec 01, 2024)
226
+ Extended testing to cover additional edge cases and performance metrics.
227
+
228
+ **Affected Files:**
229
+ - `tests/async/test_content_filter_bm25.py`: Improved reliability and performance assurance.
230
+ ```diff
231
+ Added tests for new extraction scenarios including malformed HTML.
232
+ ```
233
+
234
+ ### Infrastructure & Documentation
235
+
236
+ #### 4. Updated Examples (Dec 01, 2024)
237
+ Altered examples in documentation to promote the use of PruningContentFilter alongside existing strategies.
238
+
239
+ **Affected Files:**
240
+ - `docs/examples/quickstart_async.py`: Enhanced usability and clarity for new users.
241
+ - Revised example to illustrate usage of PruningContentFilter.
242
+
243
+ ## [0.3.746] November 29, 2024
244
+
245
+ ### Major Features
246
+ 1. Enhanced Docker Support (Nov 29, 2024)
247
+ - Improved GPU support in Docker images.
248
+ - Dockerfile refactored for better platform-specific installations.
249
+ - Introduced new Docker commands for different platforms:
250
+ - `basic-amd64`, `all-amd64`, `gpu-amd64` for AMD64.
251
+ - `basic-arm64`, `all-arm64`, `gpu-arm64` for ARM64.
252
+
253
+ ### Infrastructure & Documentation
254
+ - Enhanced README.md to improve user guidance and installation instructions.
255
+ - Added installation instructions for Playwright setup in README.
256
+ - Created and updated examples in `docs/examples/quickstart_async.py` to be more useful and user-friendly.
257
+ - Updated `requirements.txt` with a new `pydantic` dependency.
258
+ - Bumped version number in `crawl4ai/__version__.py` to 0.3.746.
259
+
260
+ ### Breaking Changes
261
+ - Streamlined application structure:
262
+ - Removed static pages and related code from `main.py` which might affect existing deployments relying on static content.
263
+
264
+ ### Development Updates
265
+ - Developed `post_install` method in `crawl4ai/install.py` to streamline post-installation setup tasks.
266
+ - Refined migration processes in `crawl4ai/migrations.py` with enhanced logging for better error visibility.
267
+ - Updated `docker-compose.yml` to support local and hub services for different architectures, enhancing build and deploy capabilities.
268
+ - Refactored example test cases in `docs/examples/docker_example.py` to facilitate comprehensive testing.
269
+
270
+ ### README.md
271
+ Updated README with new docker commands and setup instructions.
272
+ Enhanced installation instructions and guidance.
273
+
274
+ ### crawl4ai/install.py
275
+ Added post-install script functionality.
276
+ Introduced `post_install` method for automation of post-installation tasks.
277
+
278
+ ### crawl4ai/migrations.py
279
+ Improved migration logging.
280
+ Refined migration processes and added better logging.
281
+
282
+ ### docker-compose.yml
283
+ Refactored docker-compose for better service management.
284
+ Updated to define services for different platforms and versions.
285
+
286
+ ### requirements.txt
287
+ Updated dependencies.
288
+ Added `pydantic` to requirements file.
289
+
290
+ ### crawl4ai/__version__.py
291
+ Updated version number.
292
+ Bumped version number to 0.3.746.
293
+
294
+ ### docs/examples/quickstart_async.py
295
+ Enhanced example scripts.
296
+ Uncommented example usage in async guide for user functionality.
297
+
298
+ ### main.py
299
+ Refactored code to improve maintainability.
300
+ Streamlined app structure by removing static pages code.
301
+
302
+ ## [0.3.743] November 27, 2024
303
+
304
+ Enhance features and documentation
305
+ - Updated version to 0.3.743
306
+ - Improved ManagedBrowser configuration with dynamic host/port
307
+ - Implemented fast HTML formatting in web crawler
308
+ - Enhanced markdown generation with a new generator class
309
+ - Improved sanitization and utility functions
310
+ - Added contributor details and pull request acknowledgments
311
+ - Updated documentation for clearer usage scenarios
312
+ - Adjusted tests to reflect class name changes
313
+
314
+ ### CONTRIBUTORS.md
315
+ Added new contributors and pull request details.
316
+ Updated community contributions and acknowledged pull requests.
317
+
318
+ ### crawl4ai/__version__.py
319
+ Version update.
320
+ Bumped version to 0.3.743.
321
+
322
+ ### crawl4ai/async_crawler_strategy.py
323
+ Improved ManagedBrowser configuration.
324
+ Enhanced browser initialization with configurable host and debugging port; improved hook execution.
325
+
326
+ ### crawl4ai/async_webcrawler.py
327
+ Optimized HTML processing.
328
+ Implemented 'fast_format_html' for optimized HTML formatting; applied it when 'prettiify' is enabled.
329
+
330
+ ### crawl4ai/content_scraping_strategy.py
331
+ Enhanced markdown generation strategy.
332
+ Updated to use DefaultMarkdownGenerator and improved markdown generation with filters option.
333
+
334
+ ### crawl4ai/markdown_generation_strategy.py
335
+ Refactored markdown generation class.
336
+ Renamed DefaultMarkdownGenerationStrategy to DefaultMarkdownGenerator; added content filter handling.
337
+
338
+ ### crawl4ai/utils.py
339
+ Enhanced utility functions.
340
+ Improved input sanitization and enhanced HTML formatting method.
341
+
342
+ ### docs/md_v2/advanced/hooks-auth.md
343
+ Improved documentation for hooks.
344
+ Updated code examples to include cookies in crawler strategy initialization.
345
+
346
+ ### tests/async/test_markdown_genertor.py
347
+ Refactored tests to match class renaming.
348
+ Updated tests to use renamed DefaultMarkdownGenerator class.
349
+
350
+ ## [0.3.74] November 17, 2024
351
+
352
+ This changelog details the updates and changes introduced in Crawl4AI version 0.3.74. It's designed to inform developers about new features, modifications to existing components, removals, and other important information.
353
+
354
+ ### 1. File Download Processing
355
+
356
+ - Users can now specify download folders using the `downloads_path` parameter in the `AsyncWebCrawler` constructor or the `arun` method. If not specified, downloads are saved to a "downloads" folder within the `.crawl4ai` directory.
357
+ - File download tracking is integrated into the `CrawlResult` object. Successfully downloaded files are listed in the `downloaded_files` attribute, providing their paths.
358
+ - Added `accept_downloads` parameter to the crawler strategies (defaults to `False`). If set to `True`, you can add JS code and use the `wait_for` parameter to trigger and wait for file downloads.
359
+
360
+ **Example:**
361
+
362
+ ```python
363
+ import asyncio
364
+ import os
365
+ from pathlib import Path
366
+ from crawl4ai import AsyncWebCrawler
367
+
368
+ async def download_example():
369
+ downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
370
+ os.makedirs(downloads_path, exist_ok=True)
371
+
372
+ async with AsyncWebCrawler(
373
+ accept_downloads=True,
374
+ downloads_path=downloads_path,
375
+ verbose=True
376
+ ) as crawler:
377
+ result = await crawler.arun(
378
+ url="https://www.python.org/downloads/",
379
+ js_code="""
380
+ const downloadLink = document.querySelector('a[href$=".exe"]');
381
+ if (downloadLink) { downloadLink.click(); }
382
+ """,
383
+ wait_for=5 # To ensure download has started
384
+ )
385
+
386
+ if result.downloaded_files:
387
+ print("Downloaded files:")
388
+ for file in result.downloaded_files:
389
+ print(f"- {file}")
390
+
391
+ asyncio.run(download_example())
392
+
393
+ ```
394
+
395
+ ### 2. Refined Content Filtering
396
+
397
+ - Introduced the `RelevanceContentFilter` strategy (and its implementation `BM25ContentFilter`) for extracting relevant content from web pages, replacing Fit Markdown and other content cleaning strategies. This new strategy leverages the BM25 algorithm to identify chunks of text relevant to the page's title, description, keywords, or a user-provided query.
398
+ - The `fit_markdown` flag in the content scraper is used to filter content based on title, meta description, and keywords.
399
+
400
+ **Example:**
401
+
402
+ ```python
403
+ import asyncio
+ from crawl4ai import AsyncWebCrawler
404
+ from crawl4ai.content_filter_strategy import BM25ContentFilter
405
+
406
+ async def filter_content(url, query):
407
+ async with AsyncWebCrawler() as crawler:
408
+ content_filter = BM25ContentFilter(user_query=query)
409
+ result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True)
410
+ print(result.extracted_content) # Or result.fit_markdown for the markdown version
411
+ print(result.fit_html) # HTML reduced to only the filtered content
412
+
413
+ asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health"))
414
+ ```
415
+
416
+ ### 3. Raw HTML and Local File Support
417
+
418
+ - Added support for crawling local files and raw HTML content directly.
419
+ - Use the `file://` prefix for local file paths.
420
+ - Use the `raw:` prefix for raw HTML strings.
421
+
422
+ **Example:**
423
+
424
+ ```python
425
+ import asyncio
+ import os
+ from crawl4ai import AsyncWebCrawler
+
+ async def crawl_local_or_raw(crawler, content, content_type):
426
+ prefix = "file://" if content_type == "local" else "raw:"
427
+ url = f"{prefix}{content}"
428
+ result = await crawler.arun(url=url)
429
+ if result.success:
430
+ print(f"Markdown Content from {content_type.title()} Source:")
431
+ print(result.markdown)
432
+
433
+ # Example usage with local file and raw HTML
434
+ async def main():
435
+ async with AsyncWebCrawler() as crawler:
436
+ # Local File
437
+ await crawl_local_or_raw(
438
+ crawler, os.path.abspath('tests/async/sample_wikipedia.html'), "local"
439
+ )
440
+ # Raw HTML
441
+ await crawl_local_or_raw(crawler, "<h1>Raw Test</h1><p>This is raw HTML.</p>", "raw")
442
+
443
+
444
+ asyncio.run(main())
445
+ ```
446
+
447
+ ### 4. Browser Management
448
+
449
+ - New asynchronous crawler strategy implemented using Playwright.
450
+ - `ManagedBrowser` class introduced for improved browser session handling, offering features like persistent browser sessions between requests (using `session_id` parameter) and browser process monitoring.
451
+ - Updated to tf-playwright-stealth for enhanced stealth capabilities.
452
+ - Added `use_managed_browser`, `use_persistent_context`, and `chrome_channel` parameters to AsyncPlaywrightCrawlerStrategy.
453
+
454
+
455
+ **Example:**
456
+ ```python
457
+ async def browser_management_demo():
458
+ user_data_dir = os.path.join(Path.home(), ".crawl4ai", "user-data-dir")
459
+ os.makedirs(user_data_dir, exist_ok=True) # Ensure directory exists
460
+ async with AsyncWebCrawler(
461
+ use_managed_browser=True,
462
+ user_data_dir=user_data_dir,
463
+ use_persistent_context=True,
464
+ verbose=True
465
+ ) as crawler:
466
+ result1 = await crawler.arun(
467
+ url="https://example.com", session_id="my_session"
468
+ )
469
+ result2 = await crawler.arun(
470
+ url="https://example.com/anotherpage", session_id="my_session"
471
+ )
472
+
473
+ asyncio.run(browser_management_demo())
474
+ ```
475
+
476
+
477
+ ### 5. API Server & Cache Improvements
478
+
479
+ - Added CORS support to API server.
480
+ - Implemented static file serving.
481
+ - Enhanced root redirect functionality.
482
+ - Cache database updated to store response headers and downloaded files information. It utilizes a file system approach to manage large content efficiently.
483
+ - New, more efficient caching database built using xxhash and file system approach.
484
+ - Introduced `CacheMode` enum (`ENABLED`, `DISABLED`, `READ_ONLY`, `WRITE_ONLY`, `BYPASS`) and `always_bypass_cache` parameter in AsyncWebCrawler for fine-grained cache control. This replaces `bypass_cache`, `no_cache_read`, `no_cache_write`, and `always_by_pass_cache`.
485
+
486
+
487
+ ### 🗑️ Removals
488
+
489
+ - Removed deprecated: `crawl4ai/content_cleaning_strategy.py`.
490
+ - Removed internal class ContentCleaningStrategy
491
+ - Removed legacy cache control flags: `bypass_cache`, `disable_cache`, `no_cache_read`, `no_cache_write`, and `always_by_pass_cache`. These have been superseded by `cache_mode`.
492
+
493
+
494
+ ### ⚙️ Other Changes
495
+
496
+ - Moved version file to `crawl4ai/__version__.py`.
497
+ - Added `crawl4ai/cache_context.py`.
498
+ - Added `crawl4ai/version_manager.py`.
499
+ - Added `crawl4ai/migrations.py`.
500
+ - Added `crawl4ai-migrate` entry point.
501
+ - Added config `NEED_MIGRATION` and `SHOW_DEPRECATION_WARNINGS`.
502
+ - API server now requires an API token for authentication, configurable with the `CRAWL4AI_API_TOKEN` environment variable. This enhances API security.
503
+ - Added synchronous crawl endpoint `/crawl_sync` for immediate result retrieval, and direct crawl endpoint `/crawl_direct` bypassing the task queue.
504
+
505
+
506
+ ### ⚠️ Deprecation Notices
507
+
508
+ - The synchronous version of `WebCrawler` is being phased out. While still available via `crawl4ai[sync]`, it will eventually be removed. Transition to `AsyncWebCrawler` is strongly recommended. Boolean cache control flags in `arun` are also deprecated, migrate to using the `cache_mode` parameter. See examples in the "New Features" section above for correct usage.
509
+
510
+
511
+ ### 🐛 Bug Fixes
512
+
513
+ - Resolved issue with browser context closing unexpectedly in Docker. This significantly improves stability, particularly within containerized environments.
514
+ - Fixed memory leaks associated with incorrect asynchronous cleanup by removing the `__del__` method and ensuring the browser context is closed explicitly using context managers.
515
+ - Improved error handling in `WebScrapingStrategy`. More detailed error messages and suggestions for debugging will minimize frustration when running into unexpected issues.
516
+ - Fixed issue with incorrect text parsing in specific HTML structures.
517
+
518
+
519
+ ### Example of migrating to the new CacheMode:
520
+
521
+ **Old way:**
522
+
523
+ ```python
524
+ crawler = AsyncWebCrawler(always_by_pass_cache=True)
525
+ result = await crawler.arun(url="https://example.com", bypass_cache=True)
526
+ ```
527
+
528
+ **New way:**
529
+
530
+ ```python
531
+ from crawl4ai import CacheMode
532
+
533
+ crawler = AsyncWebCrawler(always_bypass_cache=True)
534
+ result = await crawler.arun(url="https://example.com", cache_mode=CacheMode.BYPASS)
535
+ ```
536
+
537
+
538
+ ## [0.3.74] - November 13, 2024
539
+
540
+ 1. **File Download Processing** (Nov 14, 2024)
541
+ - Added capability for users to specify download folders
542
+ - Implemented file download tracking in the CrawlResult object
543
+ - Created new file: `tests/async/test_async_doanloader.py`
544
+
545
+ 2. **Content Filtering Improvements** (Nov 14, 2024)
546
+ - Introduced Relevance Content Filter as an improvement over Fit Markdown
547
+ - Implemented BM25 algorithm for content relevance matching
548
+ - Added new file: `crawl4ai/content_filter_strategy.py`
549
+ - Removed deprecated: `crawl4ai/content_cleaning_strategy.py`
550
+
551
+ 3. **Local File and Raw HTML Support** (Nov 13, 2024)
552
+ - Added support for processing local files
553
+ - Implemented raw HTML input handling in AsyncWebCrawler
554
+ - Enhanced `crawl4ai/async_webcrawler.py` with significant performance improvements
555
+
556
+ 4. **Browser Management Enhancements** (Nov 12, 2024)
557
+ - Implemented new async crawler strategy using Playwright
558
+ - Introduced ManagedBrowser for better browser session handling
559
+ - Added support for persistent browser sessions
560
+ - Updated from playwright_stealth to tf-playwright-stealth
561
+
562
+ 5. **API Server Component**
563
+ - Added CORS support
564
+ - Implemented static file serving
565
+ - Enhanced root redirect functionality
566
+
567
+
568
+
569
+ ## [0.3.731] - November 13, 2024
570
+
571
+ ### Added
572
+ - Support for raw HTML and local file crawling via URL prefixes ('raw:', 'file://')
573
+ - Browser process monitoring for managed browser instances
574
+ - Screenshot capability for raw HTML and local file content
575
+ - Response headers storage in cache database
576
+ - New `fit_markdown` flag for optional markdown generation
577
+
578
+ ### Changed
579
+ - Switched HTML parser from 'html.parser' to 'lxml' for ~4x performance improvement
580
+ - Optimized BeautifulSoup text conversion and element selection
581
+ - Pre-compiled regular expressions for better performance
582
+ - Improved metadata extraction efficiency
583
+ - Response headers now stored alongside HTML in cache
584
+
585
+ ### Removed
586
+ - `__del__` method from AsyncPlaywrightCrawlerStrategy to prevent async cleanup issues
587
+
588
+ ### Fixed
589
+ - Issue #256: Added support for crawling raw HTML content
590
+ - Issue #253: Implemented file:// protocol handling
591
+ - Missing response headers in cached results
592
+ - Memory leaks from improper async cleanup
593
+
594
+ ## [v0.3.731] - 2024-11-13 Changelog for Issue 256 Fix
595
+ - Fixed: Browser context unexpectedly closing in Docker environment during crawl operations.
596
+ - Removed: `__del__` method from AsyncPlaywrightCrawlerStrategy to prevent unreliable asynchronous cleanup, ensuring the browser context is closed explicitly within context managers.
597
+ - Added: Monitoring for ManagedBrowser subprocess to detect and log unexpected terminations.
598
+ - Updated: Dockerfile configurations to expose debugging port (9222) and allocate additional shared memory for improved browser stability.
599
+ - Improved: Error handling and resource cleanup processes for browser lifecycle management within the Docker environment.
600
+
601
+ ## [v0.3.73] - 2024-11-05
602
+
603
+ ### Major Features
604
+ - **New Doctor Feature**
605
+ - Added comprehensive system diagnostics tool
606
+ - Available through package hub and CLI
607
+ - Provides automated troubleshooting and system health checks
608
+ - Includes detailed reporting of configuration issues
609
+
610
+ - **Dockerized API Server**
611
+ - Released complete Docker implementation for API server
612
+ - Added comprehensive documentation for Docker deployment
613
+ - Implemented container communication protocols
614
+ - Added environment configuration guides
615
+
616
+ - **Managed Browser Integration**
617
+ - Added support for user-controlled browser instances
618
+ - Implemented `ManagedBrowser` class for better browser lifecycle management
619
+ - Added ability to connect to existing Chrome DevTools Protocol (CDP) endpoints
620
+ - Introduced user data directory support for persistent browser profiles
621
+
622
+ - **Enhanced HTML Processing**
623
+ - Added HTML tag preservation feature during markdown conversion
624
+ - Introduced configurable tag preservation system
625
+ - Improved pre-tag and code block handling
626
+ - Added support for nested preserved tags with attribute retention
627
+
628
+ ### Improvements
629
+ - **Browser Handling**
630
+ - Added flag to ignore body visibility for problematic pages
631
+ - Improved browser process cleanup and management
632
+ - Enhanced temporary directory handling for browser profiles
633
+ - Added configurable browser launch arguments
634
+
635
+ - **Database Management**
636
+ - Implemented connection pooling for better performance
637
+ - Added retry logic for database operations
638
+ - Improved error handling and logging
639
+ - Enhanced cleanup procedures for database connections
640
+
641
+ - **Resource Management**
642
+ - Added memory and CPU monitoring
643
+ - Implemented dynamic task slot allocation based on system resources
644
+ - Added configurable cleanup intervals
645
+
646
+ ### Technical Improvements
647
+ - **Code Structure**
648
+ - Moved version management to dedicated _version.py file
649
+ - Improved error handling throughout the codebase
650
+ - Enhanced logging system with better error reporting
651
+ - Reorganized core components for better maintainability
652
+
653
+ ### Bug Fixes
654
+ - Fixed issues with browser process termination
655
+ - Improved handling of connection timeouts
656
+ - Enhanced error recovery in database operations
657
+ - Fixed memory leaks in long-running processes
658
+
659
+ ### Dependencies
660
+ - Updated Playwright to v1.47
661
+ - Updated core dependencies with more flexible version constraints
662
+ - Added new development dependencies for testing
663
+
664
+ ### Breaking Changes
665
+ - Changed default browser handling behavior
666
+ - Modified database connection management approach
667
+ - Updated API response structure for better consistency
668
+
669
+ ### Migration Guide
670
+ When upgrading to v0.3.73, be aware of the following changes:
671
+
672
+ 1. Docker Deployment:
673
+ - Review Docker documentation for new deployment options
674
+ - Update environment configurations as needed
675
+ - Check container communication settings
676
+
677
+ 2. If using custom browser management:
678
+ - Update browser initialization code to use new ManagedBrowser class
679
+ - Review browser cleanup procedures
680
+
681
+ 3. For database operations:
682
+ - Check custom database queries for compatibility with new connection pooling
683
+ - Update error handling to work with new retry logic
684
+
685
+ 4. Using the Doctor:
686
+ - Run doctor command for system diagnostics: `crawl4ai doctor`
687
+ - Review generated reports for potential issues
688
+ - Follow recommended fixes for any identified problems
689
+
690
+
691
+ ## [v0.3.73] - 2024-11-04
692
+ This commit introduces several key enhancements, including improved error handling and robust database operations in `async_database.py`, which now features a connection pool and retry logic for better reliability. Updates to the README.md provide clearer instructions and a better user experience with links to documentation sections. The `.gitignore` file has been refined to include additional directories, while the async web crawler now utilizes a managed browser for more efficient crawling. Furthermore, multiple dependency updates and introduction of the `CustomHTML2Text` class enhance text extraction capabilities.
693
+
694
+ ## [v0.3.73] - 2024-10-24
695
+
696
+ ### Added
697
+ - preserve_tags: Added support for preserving specific HTML tags during markdown conversion.
698
+ - Smart overlay removal system in AsyncPlaywrightCrawlerStrategy:
699
+ - Automatic removal of popups, modals, and cookie notices
700
+ - Detection and removal of fixed/sticky position elements
701
+ - Cleaning of empty block elements
702
+ - Configurable via `remove_overlay_elements` parameter (see the sketch after this list)
703
+ - Enhanced screenshot capabilities:
704
+ - Added `screenshot_wait_for` parameter to control timing
705
+ - Improved screenshot handling with existing page context
706
+ - Better error handling with fallback error images
707
+ - New URL normalization utilities:
708
+ - `normalize_url` function for consistent URL formatting
709
+ - `is_external_url` function for better link classification
710
+ - Custom base directory support for cache storage:
711
+ - New `base_directory` parameter in AsyncWebCrawler
712
+ - Allows specifying alternative locations for `.crawl4ai` folder
713
+
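A hedged sketch of the overlay-removal and screenshot options above; passing `remove_overlay_elements` and `screenshot_wait_for` as `arun()` keyword arguments is an assumption based on the parameter names listed in this entry.

```python
# Hedged sketch: remove_overlay_elements and screenshot_wait_for are assumed to
# be accepted as keyword arguments on crawler.arun().
import asyncio

from crawl4ai import AsyncWebCrawler

async def clean_screenshot():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com",
            remove_overlay_elements=True,  # drop popups, modals, and cookie notices first
            screenshot=True,
            screenshot_wait_for=2,         # delay the capture, per this entry
        )
        print(result.success)

asyncio.run(clean_screenshot())
```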
714
+ ### Enhanced
715
+ - Link handling improvements:
716
+ - Better duplicate link detection
717
+ - Enhanced internal/external link classification
718
+ - Improved handling of special URL protocols
719
+ - Support for anchor links and protocol-relative URLs
720
+ - Configuration refinements:
721
+ - Streamlined social media domain list
722
+ - More focused external content filtering
723
+ - LLM extraction strategy:
724
+ - Added support for separate API base URL via `api_base` parameter
725
+ - Better handling of base URLs in configuration
726
+
727
+ ### Fixed
728
+ - Screenshot functionality:
729
+ - Resolved issues with screenshot timing and context
730
+ - Improved error handling and recovery
731
+ - Link processing:
732
+ - Fixed URL normalization edge cases
733
+ - Better handling of invalid URLs
734
+ - Improved error messages for link processing failures
735
+
736
+ ### Developer Notes
737
+ - The overlay removal system uses advanced JavaScript injection for better compatibility
738
+ - URL normalization handles special cases like mailto:, tel:, and protocol-relative URLs
739
+ - Screenshot system now reuses existing page context for better performance
740
+ - Link processing maintains separate dictionaries for internal and external links to ensure uniqueness
741
+
742
+ ## [v0.3.72] - 2024-10-22
743
+
744
+ ### Added
745
+ - New `ContentCleaningStrategy` class:
746
+ - Smart content extraction based on text density and element scoring
747
+ - Automatic removal of boilerplate content
748
+ - DOM tree analysis for better content identification
749
+ - Configurable thresholds for content detection
750
+ - Advanced proxy support:
751
+ - Added `proxy_config` option for authenticated proxy connections (see the sketch after this list)
752
+ - Support for username/password in proxy configuration
753
+ - New content output formats:
754
+ - `fit_markdown`: Optimized markdown output with main content focus
755
+ - `fit_html`: Clean HTML with only essential content
756
+
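A hedged sketch of the `proxy_config` option; the dictionary keys (`server`, `username`, `password`) follow common Playwright-style proxy settings and are assumptions, not values confirmed by this diff.

```python
# Hedged sketch: the proxy_config keys mirror typical Playwright proxy settings
# (server/username/password) and are assumptions.
import asyncio

from crawl4ai import AsyncWebCrawler

async def proxied_crawl():
    proxy_config = {
        "server": "http://proxy.example.com:8080",
        "username": "proxy_user",
        "password": "proxy_pass",
    }
    async with AsyncWebCrawler(proxy_config=proxy_config) as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown[:200])

asyncio.run(proxied_crawl())
```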
757
+ ### Enhanced
758
+ - Image source detection:
759
+ - Support for multiple image source attributes (`src`, `data-src`, `srcset`, etc.)
760
+ - Automatic fallback through potential source attributes
761
+ - Smart handling of srcset attribute
762
+ - External content handling:
763
+ - Made external link exclusion optional (disabled by default)
764
+ - Improved detection and handling of social media links
765
+ - Better control over external image filtering
766
+
767
+ ### Fixed
768
+ - Image extraction reliability with multiple source attribute checks
769
+ - External link and image handling logic for better accuracy
770
+
771
+ ### Developer Notes
772
+ - The new `ContentCleaningStrategy` uses configurable thresholds for customization
773
+ - Proxy configuration now supports more complex authentication scenarios
774
+ - Content extraction process now provides both regular and optimized outputs
775
+
776
+ ## [v0.3.72] - 2024-10-20
777
+
778
+ ### Fixed
779
+ - Added support for parsing Base64 encoded images in WebScrapingStrategy
780
+
781
+ ### Added
782
+ - Forked and integrated a customized version of the html2text library for more control over Markdown generation
783
+ - New configuration options for controlling external content:
784
+ - Ability to exclude all external links
785
+ - Option to specify domains to exclude (default includes major social media platforms)
786
+ - Control over excluding external images
787
+
788
+ ### Changed
789
+ - Improved Markdown generation process:
790
+ - Added fine-grained control over character escaping in Markdown output
791
+ - Enhanced handling of code blocks and pre-formatted text
792
+ - Updated `AsyncPlaywrightCrawlerStrategy.close()` method to use a shorter sleep time (0.5 seconds instead of 500)
793
+ - Enhanced flexibility in `CosineStrategy` with a more generic `load_HF_embedding_model` function
794
+
795
+ ### Improved
796
+ - Optimized content scraping and processing for better efficiency
797
+ - Enhanced error handling and logging in various components
798
+
799
+ ### Developer Notes
800
+ - The customized html2text library is now located within the crawl4ai package
801
+ - New configuration options are available in the `config.py` file for external content handling
802
+ - The `WebScrapingStrategy` class has been updated to accommodate new external content exclusion options
803
+
804
+ ## [v0.3.71] - 2024-10-19
805
+
806
+ ### Added
807
+ - New chunking strategies:
808
+ - `OverlappingWindowChunking`: Allows for overlapping chunks of text, useful for maintaining context between chunks. See the sketch after this list.
809
+ - Enhanced `SlidingWindowChunking`: Improved to handle edge cases and last chunks more effectively.
810
+
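A hedged sketch of the new chunking strategy; the constructor parameters (`window_size`, `overlap`) and the `chunk()` method name are assumptions about the API in `crawl4ai/chunking_strategy.py`.

```python
# Hedged sketch: the window_size/overlap parameters and the chunk() method are
# assumptions about the OverlappingWindowChunking API.
from crawl4ai.chunking_strategy import OverlappingWindowChunking

chunker = OverlappingWindowChunking(window_size=100, overlap=20)
text = "lorem ipsum " * 300  # stand-in for markdown pulled from a crawl result
chunks = chunker.chunk(text)

# Neighboring chunks share `overlap` words, preserving context across chunk
# boundaries when the pieces are fed to an LLM.
print(len(chunks), chunks[0][:60])
```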
811
+ ### Changed
812
+ - Updated `CHUNK_TOKEN_THRESHOLD` in config to 2048 tokens (2^11) for better compatibility with most LLM models.
813
+ - Improved `AsyncPlaywrightCrawlerStrategy.close()` method to use a shorter sleep time (0.5 seconds instead of 500), significantly reducing wait time when closing the crawler.
814
+ - Enhanced flexibility in `CosineStrategy`:
815
+ - Now uses a more generic `load_HF_embedding_model` function, allowing for easier swapping of embedding models.
816
+ - Updated `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy` for better JSON-based extraction.
817
+
818
+ ### Fixed
819
+ - Addressed potential issues with the sliding window chunking strategy to ensure all text is properly chunked.
820
+
821
+ ### Developer Notes
822
+ - Added more comprehensive docstrings to chunking strategies for better code documentation.
823
+ - Removed hardcoded device setting in `CosineStrategy`, now using the automatically detected device.
824
+ - Added a new example in `quickstart_async.py` for generating a knowledge graph from crawled content.
825
+
826
+ These updates aim to provide more flexibility in text processing, improve performance, and enhance the overall capabilities of the crawl4ai library. The new chunking strategies, in particular, offer more options for handling large texts in various scenarios.
827
+
828
+ ## [v0.3.71] - 2024-10-18
829
+
830
+ ### Changes
831
+ 1. **Version Update**:
832
+ - Updated version number from 0.3.7 to 0.3.71.
833
+
834
+ 2. **Crawler Enhancements**:
835
+ - Added `sleep_on_close` option to AsyncPlaywrightCrawlerStrategy for delayed browser closure.
836
+ - Improved context creation with additional options:
837
+ - Enabled `accept_downloads` and `java_script_enabled`.
838
+ - Added a cookie to enable cookies by default.
839
+
840
+ 3. **Error Handling Improvements**:
841
+ - Enhanced error messages in AsyncWebCrawler's `arun` method.
842
+ - Updated error reporting format for better visibility and consistency.
843
+
844
+ 4. **Performance Optimization**:
845
+ - Commented out automatic page and context closure in `crawl` method to potentially improve performance in certain scenarios.
846
+
847
+ ### Documentation
848
+ - Updated quickstart notebook:
849
+ - Changed installation command to use the released package instead of GitHub repository.
850
+ - Updated kernel display name.
851
+
852
+ ### Developer Notes
853
+ - Minor code refactoring and cleanup.
854
+
855
+ ## [v0.3.7] - 2024-10-17
856
+
857
+ ### New Features
858
+ 1. **Enhanced Browser Stealth**:
859
+ - Implemented `playwright_stealth` for improved bot detection avoidance.
860
+ - Added `StealthConfig` for fine-tuned control over stealth parameters.
861
+
862
+ 2. **User Simulation**:
863
+ - New `simulate_user` option to mimic human-like interactions (mouse movements, clicks, keyboard presses).
864
+
865
+ 3. **Navigator Override**:
866
+ - Added `override_navigator` option to modify navigator properties, further improving bot detection evasion.
867
+
868
+ 4. **Improved iframe Handling**:
869
+ - New `process_iframes` parameter to extract and integrate iframe content into the main page.
870
+
871
+ 5. **Flexible Browser Selection**:
872
+ - Support for choosing between Chromium, Firefox, and WebKit browsers.
873
+
874
+ 6. **Include Links in Markdown**:
875
+ - Added support for including links in Markdown content by defining a new flag `include_links_on_markdown` in the `crawl` method.
876
+
877
+ ### Improvements
878
+ 1. **Better Error Handling**:
879
+ - Enhanced error reporting in WebScrapingStrategy with detailed error messages and suggestions.
880
+ - Added console message and error logging for better debugging.
881
+
882
+ 2. **Image Processing Enhancements**:
883
+ - Improved image dimension updating and filtering logic.
884
+
885
+ 3. **Crawling Flexibility**:
886
+ - Added support for custom viewport sizes.
887
+ - Implemented delayed content retrieval with `delay_before_return_html` parameter.
888
+
889
+ 4. **Performance Optimization**:
890
+ - Adjusted default semaphore count for parallel crawling.
891
+
892
+ ### Bug Fixes
893
+ - Fixed an issue where the HTML content could be empty after processing.
894
+
895
+ ### Examples
896
+ - Added new example `crawl_with_user_simulation()` demonstrating the use of user simulation and navigator override features.
897
+
898
+ ### Developer Notes
899
+ - Refactored code for better maintainability and readability.
900
+ - Updated browser launch arguments for improved compatibility and performance.
901
+
902
+ ## [v0.3.6] - 2024-10-12
903
+
904
+ ### 1. Improved Crawling Control
905
+ - **New Hook**: Added `before_retrieve_html` hook in `AsyncPlaywrightCrawlerStrategy`.
906
+ - **Delayed HTML Retrieval**: Introduced `delay_before_return_html` parameter to allow waiting before retrieving HTML content.
907
+ - Useful for pages with delayed content loading.
908
+ - **Flexible Timeout**: `smart_wait` function now uses `page_timeout` (default 60 seconds) instead of a fixed 30-second timeout.
909
+ - Provides better handling for slow-loading pages.
910
+ - **How to use**: Set `page_timeout=your_desired_timeout` (in milliseconds) when calling `crawler.arun()`.
911
+
912
+ ### 2. Browser Type Selection
913
+ - Added support for different browser types (Chromium, Firefox, WebKit).
914
+ - Users can now specify the browser type when initializing AsyncWebCrawler.
915
+ - **How to use**: Set `browser_type="firefox"` or `browser_type="webkit"` when initializing AsyncWebCrawler.
916
+
917
+ ### 3. Screenshot Capture
918
+ - Added ability to capture screenshots during crawling.
919
+ - Useful for debugging and content verification.
920
+ - **How to use**: Set `screenshot=True` when calling `crawler.arun()`. A combined browser-type and screenshot sketch follows below.
921
+
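A hedged sketch combining the two "How to use" notes above (browser selection at construction time, `screenshot=True` per crawl); treating `result.screenshot` as base64-encoded text is an assumption.

```python
# Hedged sketch: browser_type and screenshot follow the "How to use" notes above;
# decoding result.screenshot from base64 is an assumption.
import asyncio
import base64

from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(browser_type="firefox", verbose=True) as crawler:
        result = await crawler.arun(url="https://example.com", screenshot=True)
        if result.screenshot:
            with open("example.png", "wb") as f:
                f.write(base64.b64decode(result.screenshot))

asyncio.run(main())
```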
922
+ ### 4. Enhanced LLM Extraction Strategy
923
+ - Added support for multiple LLM providers (OpenAI, Hugging Face, Ollama).
924
+ - **Custom Arguments**: Added support for passing extra arguments to LLM providers via `extra_args` parameter.
925
+ - **Custom Headers**: Users can now pass custom headers to the extraction strategy.
926
+ - **How to use**: Specify the desired provider and custom arguments when using `LLMExtractionStrategy`.
927
+
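+ A hedged sketch of provider selection with `extra_args`; the provider string and the extra argument shown are illustrative:
+
+ ```python
+ import asyncio
+ import os
+ from crawl4ai import AsyncWebCrawler
+ from crawl4ai.extraction_strategy import LLMExtractionStrategy
+
+ async def main():
+     strategy = LLMExtractionStrategy(
+         provider="openai/gpt-4o",               # or e.g. "ollama/qwen2", a Hugging Face model, etc.
+         api_token=os.getenv("OPENAI_API_KEY"),
+         instruction="Summarize the page's key points as JSON.",
+         extra_args={"temperature": 0.0},        # forwarded to the LLM provider
+     )
+     async with AsyncWebCrawler() as crawler:
+         result = await crawler.arun(url="https://example.com", extraction_strategy=strategy)
+         print(result.extracted_content)
+
+ asyncio.run(main())
+ ```
+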
928
+ ### 5. iframe Content Extraction
929
+ - New feature to process and extract content from iframes.
930
+ - **How to use**: Set `process_iframes=True` in the crawl method.
931
+
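+ For instance (a sketch):
+
+ ```python
+ import asyncio
+ from crawl4ai import AsyncWebCrawler
+
+ async def main():
+     async with AsyncWebCrawler() as crawler:
+         result = await crawler.arun(
+             url="https://example.com",
+             process_iframes=True,  # pull iframe content into the main page result
+         )
+         print(len(result.markdown))
+
+ asyncio.run(main())
+ ```
+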
932
+ ### 6. Delayed Content Retrieval
933
+ - Introduced `get_delayed_content` method in `AsyncCrawlResponse`.
934
+ - Allows retrieval of content after a specified delay, useful for dynamically loaded content.
935
+ - **How to use**: Access `result.get_delayed_content(delay_in_seconds)` after crawling.
936
+
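+ Following the usage note above, a call might look like this sketch; whether the method must be awaited, and whether it is exposed on the returned result or only on the underlying `AsyncCrawlResponse`, may vary by release, so treat this as illustrative:
+
+ ```python
+ import asyncio
+ from crawl4ai import AsyncWebCrawler
+
+ async def main():
+     async with AsyncWebCrawler() as crawler:
+         result = await crawler.arun(url="https://example.com")
+         # Illustrative only: re-fetch the page content after a 5-second delay
+         delayed_html = await result.get_delayed_content(5)
+         print(len(delayed_html))
+
+ asyncio.run(main())
+ ```
+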
937
+ ### Improvements and Optimizations
938
+
939
+ #### 1. AsyncWebCrawler Enhancements
940
+ - **Flexible Initialization**: Now accepts arbitrary keyword arguments, passed directly to the crawler strategy.
941
+ - Allows for more customized setups.
942
+
943
+ #### 2. Image Processing Optimization
944
+ - Enhanced image handling in WebScrapingStrategy.
945
+ - Added filtering for small, invisible, or irrelevant images.
946
+ - Improved image scoring system for better content relevance.
947
+ - Implemented JavaScript-based image dimension updating for more accurate representation.
948
+
949
+ #### 3. Database Schema Auto-updates
950
+ - Automatic database schema updates ensure compatibility with the latest version.
951
+
952
+ #### 4. Enhanced Error Handling and Logging
953
+ - Improved error messages and logging for easier debugging.
954
+
955
+ #### 5. Content Extraction Refinements
956
+ - Refined HTML sanitization process.
957
+ - Improved handling of base64 encoded images.
958
+ - Enhanced Markdown conversion process.
959
+ - Optimized content extraction algorithms.
960
+
961
+ #### 6. Utility Function Enhancements
962
+ - `perform_completion_with_backoff` function now supports additional arguments for more customized API calls to LLM providers.
963
+
964
+ ### Bug Fixes
965
+ - Fixed an issue where image tags were being prematurely removed during content extraction.
966
+
967
+ ### Examples and Documentation
968
+ - Updated `quickstart_async.py` with examples of:
969
+ - Using custom headers in LLM extraction.
970
+ - Different LLM provider usage (OpenAI, Hugging Face, Ollama).
971
+ - Custom browser type usage.
972
+
973
+ ### Developer Notes
974
+ - Refactored code for better maintainability, flexibility, and performance.
975
+ - Enhanced type hinting throughout the codebase for improved development experience.
976
+ - Expanded error handling for more robust operation.
977
+
978
+ These updates significantly enhance the flexibility, accuracy, and robustness of crawl4ai, providing users with more control and options for their web crawling and content extraction tasks.
979
+
980
+ ## [v0.3.5] - 2024-09-02
981
+
982
+ Enhance AsyncWebCrawler with smart waiting and screenshot capabilities
983
+
984
+ - Implement smart_wait function in AsyncPlaywrightCrawlerStrategy
985
+ - Add screenshot support to AsyncCrawlResponse and AsyncWebCrawler
986
+ - Improve error handling and timeout management in crawling process
987
+ - Fix typo in CrawlResult model (responser_headers -> response_headers)
988
+
989
+ ## [v0.2.77] - 2024-08-04
990
+
991
+ Significant improvements in text processing and performance:
992
+
993
+ - 🚀 **Dependency reduction**: Removed dependency on spaCy model for text chunk labeling in cosine extraction strategy.
994
+ - 🤖 **Transformer upgrade**: Implemented text sequence classification using a transformer model for labeling text chunks.
995
+ - ⚡ **Performance enhancement**: Improved model loading speed due to removal of spaCy dependency.
996
+ - 🔧 **Future-proofing**: Laid groundwork for potential complete removal of spaCy dependency in future versions.
997
+
998
+ These changes address issue #68 and provide a foundation for faster, more efficient text processing in Crawl4AI.
999
+
1000
+ ## [v0.2.76] - 2024-08-02
1001
+
1002
+ Major improvements in functionality, performance, and cross-platform compatibility! 🚀
1003
+
1004
+ - 🐳 **Docker enhancements**: Significantly improved Dockerfile for easy installation on Linux, Mac, and Windows.
1005
+ - 🌐 **Official Docker Hub image**: Launched our first official image on Docker Hub for streamlined deployment.
1006
+ - 🔧 **Selenium upgrade**: Removed dependency on ChromeDriver, now using Selenium's built-in capabilities for better compatibility.
1007
+ - 🖼️ **Image description**: Implemented ability to generate textual descriptions for extracted images from web pages.
1008
+ - ⚡ **Performance boost**: Various improvements to enhance overall speed and performance.
1009
+
1010
+ A big shoutout to our amazing community contributors:
1011
+ - [@aravindkarnam](https://github.com/aravindkarnam) for developing the textual description extraction feature.
1012
+ - [@FractalMind](https://github.com/FractalMind) for creating the first official Docker Hub image and fixing Dockerfile errors.
1013
+ - [@ketonkss4](https://github.com/ketonkss4) for identifying Selenium's new capabilities, helping us reduce dependencies.
1014
+
1015
+ Your contributions are driving Crawl4AI forward! 🙌
1016
+
1017
+ ## [v0.2.75] - 2024-07-19
1018
+
1019
+ Minor improvements for a more maintainable codebase:
1020
+
1021
+ - 🔄 Fixed typos in `chunking_strategy.py` and `crawler_strategy.py` to improve code readability
1022
+ - 🔄 Removed `.test_pads/` directory from `.gitignore` to keep our repository clean and organized
1023
+
1024
+ These changes may seem small, but they contribute to a more stable and sustainable codebase. By fixing typos and updating our `.gitignore` settings, we're ensuring that our code is easier to maintain and scale in the long run.
1025
+
1026
+ ## [v0.2.74] - 2024-07-08
1027
+ A slew of exciting updates to improve the crawler's stability and robustness! 🎉
1028
+
1029
+ - 💻 **UTF encoding fix**: Resolved the Windows "charmap" error by adding UTF encoding.
1030
+ - 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
1031
+ - 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
1032
+ - 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
1033
+
1034
+
1035
+ ## [v0.2.73] - 2024-07-03
1036
+
1037
+ 💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
1038
+
1039
+ * Added support for crawling websites that need "with-head" (non-headless) mode.
1040
+ * Fixed installation issues with setup.py and the Dockerfile.
1041
+ * Resolved multiple issues.
1042
+
1043
+ ## [v0.2.72] - 2024-06-30
1044
+
1045
+ This release brings exciting updates and improvements to our project! 🎉
1046
+
1047
+ * 📚 **Documentation Updates**: Our documentation has been revamped to reflect the latest changes and additions.
1048
+ * 🚀 **New Modes in setup.py**: We've added support for three new modes in setup.py: default, torch, and transformers. This enhances the project's flexibility and usability.
1049
+ * 🐳 **Docker File Updates**: The Docker file has been updated to ensure seamless compatibility with the new modes and improvements.
1050
+ * 🕷️ **Temporary Solution for Headless Crawling**: We've implemented a temporary solution to overcome issues with crawling websites in headless mode.
1051
+
1052
+ These changes aim to improve the overall user experience, provide more flexibility, and enhance the project's performance. We're thrilled to share these updates with you and look forward to continuing to evolve and improve our project!
1053
+
1054
+ ## [0.2.71] - 2024-06-26
1055
+
1056
+ **Improved Error Handling and Performance** 🚧
1057
+
1058
+ * 🚫 Refactored `crawler_strategy.py` to handle exceptions and provide better error messages, making it more robust and reliable.
1059
+ * 💻 Optimized the `get_content_of_website_optimized` function in `utils.py` for improved performance, reducing potential bottlenecks.
1060
+ * 💻 Updated `utils.py` with the latest changes, ensuring consistency and accuracy.
1061
+ * 🚫 Migrated to `ChromeDriverManager` to resolve Chrome driver download issues, providing a smoother user experience.
1062
+
1063
+ These changes focus on refining the existing codebase, resulting in a more stable, efficient, and user-friendly experience. With these improvements, you can expect fewer errors and better performance in the crawler strategy and utility functions.
1064
+
1065
+ ## [0.2.71] - 2024-06-25
1066
+ ### Fixed
1067
+ - Sped up the extraction function by 2x.
1068
+
1069
+
1070
+ ## [0.2.6] - 2024-06-22
1071
+ ### Fixed
1072
+ - Fix issue #19: Update Dockerfile to ensure compatibility across multiple platforms.
1073
+
1074
+ ## [0.2.5] - 2024-06-18
1075
+ ### Added
1076
+ - Added five important hooks to the crawler (a registration sketch follows this list):
1077
+ - on_driver_created: Called when the driver is ready for initializations.
1078
+ - before_get_url: Called right before Selenium fetches the URL.
1079
+ - after_get_url: Called after Selenium fetches the URL.
1080
+ - before_return_html: Called when the data is parsed and ready.
1081
+ - on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize.
1082
+ - Added an example in `quickstart.py` in the example folder under the docs.
1083
+ - Enhancement issue #24: Replaced inline HTML tags (e.g., DEL, INS, SUB, ABBR) with textual format for better context handling in LLMs.
1084
+ - Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness.
1085
+ - Updated Dockerfile to ensure compatibility across multiple platforms (Hopefully!).
1086
+
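+ A rough sketch of registering one of these hooks with the Selenium-based crawler of that era; the `set_hook` registration call and the `WebCrawler`/`LocalSeleniumCrawlerStrategy` names follow the old synchronous API and are assumptions here (see `quickstart.py` for the maintained example):
+
+ ```python
+ from crawl4ai import WebCrawler
+ from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy
+
+ def before_get_url(driver):
+     # Runs right before Selenium fetches the URL
+     print("User agent in use:", driver.execute_script("return navigator.userAgent"))
+     return driver
+
+ strategy = LocalSeleniumCrawlerStrategy()
+ strategy.set_hook("before_get_url", before_get_url)
+
+ crawler = WebCrawler(crawler_strategy=strategy)
+ crawler.warmup()
+ result = crawler.run(url="https://example.com")
+ print(result.markdown[:200])
+ ```
+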
1087
+ ## [v0.2.4] - 2024-06-17
1088
+ ### Fixed
1089
+ - Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,131 @@
1
+ # Crawl4AI Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ We as members, contributors, and leaders pledge to make participation in our
6
+ community a harassment-free experience for everyone, regardless of age, body
7
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
8
+ identity and expression, level of experience, education, socio-economic status,
9
+ nationality, personal appearance, race, caste, color, religion, or sexual
10
+ identity and orientation.
11
+
12
+ We pledge to act and interact in ways that contribute to an open, welcoming,
13
+ diverse, inclusive, and healthy community.
14
+
15
+ ## Our Standards
16
+
17
+ Examples of behavior that contributes to a positive environment for our
18
+ community include:
19
+
20
+ * Demonstrating empathy and kindness toward other people
21
+ * Being respectful of differing opinions, viewpoints, and experiences
22
+ * Giving and gracefully accepting constructive feedback
23
+ * Accepting responsibility and apologizing to those affected by our mistakes,
24
+ and learning from the experience
25
+ * Focusing on what is best not just for us as individuals, but for the overall
26
+ community
27
+
28
+ Examples of unacceptable behavior include:
29
+
30
+ * The use of sexualized language or imagery, and sexual attention or advances of
31
+ any kind
32
+ * Trolling, insulting or derogatory comments, and personal or political attacks
33
+ * Public or private harassment
34
+ * Publishing others' private information, such as a physical or email address,
35
+ without their explicit permission
36
+ * Other conduct which could reasonably be considered inappropriate in a
37
+ professional setting
38
+
39
+ ## Enforcement Responsibilities
40
+
41
+ Community leaders are responsible for clarifying and enforcing our standards of
42
+ acceptable behavior and will take appropriate and fair corrective action in
43
+ response to any behavior that they deem inappropriate, threatening, offensive,
44
+ or harmful.
45
+
46
+ Community leaders have the right and responsibility to remove, edit, or reject
47
+ comments, commits, code, wiki edits, issues, and other contributions that are
48
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
49
+ decisions when appropriate.
50
+
51
+ ## Scope
52
+
53
+ This Code of Conduct applies within all community spaces, and also applies when
54
+ an individual is officially representing the community in public spaces.
55
+ Examples of representing our community include using an official email address,
56
+ posting via an official social media account, or acting as an appointed
57
+ representative at an online or offline event.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported to the community leaders responsible for enforcement at
63
+ [email protected]. All complaints will be reviewed and investigated promptly and fairly.
64
+
65
+ All community leaders are obligated to respect the privacy and security of the
66
+ reporter of any incident.
67
+
68
+ ## Enforcement Guidelines
69
+
70
+ Community leaders will follow these Community Impact Guidelines in determining
71
+ the consequences for any action they deem in violation of this Code of Conduct:
72
+
73
+ ### 1. Correction
74
+
75
+ **Community Impact**: Use of inappropriate language or other behavior deemed
76
+ unprofessional or unwelcome in the community.
77
+
78
+ **Consequence**: A private, written warning from community leaders, providing
79
+ clarity around the nature of the violation and an explanation of why the
80
+ behavior was inappropriate. A public apology may be requested.
81
+
82
+ ### 2. Warning
83
+
84
+ **Community Impact**: A violation through a single incident or series of
85
+ actions.
86
+
87
+ **Consequence**: A warning with consequences for continued behavior. No
88
+ interaction with the people involved, including unsolicited interaction with
89
+ those enforcing the Code of Conduct, for a specified period of time. This
90
+ includes avoiding interactions in community spaces as well as external channels
91
+ like social media. Violating these terms may lead to a temporary or permanent
92
+ ban.
93
+
94
+ ### 3. Temporary Ban
95
+
96
+ **Community Impact**: A serious violation of community standards, including
97
+ sustained inappropriate behavior.
98
+
99
+ **Consequence**: A temporary ban from any sort of interaction or public
100
+ communication with the community for a specified period of time. No public or
101
+ private interaction with the people involved, including unsolicited interaction
102
+ with those enforcing the Code of Conduct, is allowed during this period.
103
+ Violating these terms may lead to a permanent ban.
104
+
105
+ ### 4. Permanent Ban
106
+
107
+ **Community Impact**: Demonstrating a pattern of violation of community
108
+ standards, including sustained inappropriate behavior, harassment of an
109
+ individual, or aggression toward or disparagement of classes of individuals.
110
+
111
+ **Consequence**: A permanent ban from any sort of public interaction within the
112
+ community.
113
+
114
+ ## Attribution
115
+
116
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
117
+ version 2.1, available at
118
+ [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
119
+
120
+ Community Impact Guidelines were inspired by
121
+ [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
122
+
123
+ For answers to common questions about this code of conduct, see the FAQ at
124
+ [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
125
+ [https://www.contributor-covenant.org/translations][translations].
126
+
127
+ [homepage]: https://www.contributor-covenant.org
128
+ [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
129
+ [Mozilla CoC]: https://github.com/mozilla/diversity
130
+ [FAQ]: https://www.contributor-covenant.org/faq
131
+ [translations]: https://www.contributor-covenant.org/translations
CONTRIBUTORS.md ADDED
@@ -0,0 +1,42 @@
1
+ # Contributors to Crawl4AI
2
+
3
+ We would like to thank the following people for their contributions to Crawl4AI:
4
+
5
+ ## Core Team
6
+
7
+ - [Unclecode](https://github.com/unclecode) - Project Creator and Main Developer
8
+ - [Nasrin](https://github.com/ntohidi) - Project Manager and Developer
9
+ - [Aravind Karnam](https://github.com/aravindkarnam) - Developer
10
+
11
+ ## Community Contributors
12
+
13
+ - [aadityakanjolia4](https://github.com/aadityakanjolia4) - Fixed the "`CustomHTML2Text` is not defined" error.
14
+ - [FractalMind](https://github.com/FractalMind) - Created the first official Docker Hub image and fixed Dockerfile errors
15
+ - [ketonkss4](https://github.com/ketonkss4) - Identified Selenium's new capabilities, helping reduce dependencies
16
+ - [jonymusky](https://github.com/jonymusky) - JavaScript execution documentation and `wait_for` support
17
+ - [datehoer](https://github.com/datehoer) - Added browser proxy support
18
+
19
+ ## Pull Requests
20
+
21
+ - [dvschuyl](https://github.com/dvschuyl) - AsyncPlaywrightCrawlerStrategy page-evaluate context destroyed by navigation [#304](https://github.com/unclecode/crawl4ai/pull/304)
22
+ - [nelzomal](https://github.com/nelzomal) - Enhance development installation instructions [#286](https://github.com/unclecode/crawl4ai/pull/286)
23
+ - [HamzaFarhan](https://github.com/HamzaFarhan) - Handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined [#293](https://github.com/unclecode/crawl4ai/pull/293)
24
+ - [NanmiCoder](https://github.com/NanmiCoder) - fix: crawler strategy exception handling and fixes [#271](https://github.com/unclecode/crawl4ai/pull/271)
25
+ - [paulokuong](https://github.com/paulokuong) - fix: CRAWL4_AI_BASE_DIRECTORY should be Path object instead of string [#298](https://github.com/unclecode/crawl4ai/pull/298)
26
+
27
+
28
+ ## Other Contributors
29
+
30
+ - [Gokhan](https://github.com/gkhngyk)
31
+ - [Shiv Kumar](https://github.com/shivkumar0757)
32
+ - [QIN2DIM](https://github.com/QIN2DIM)
33
+
34
+ ## Acknowledgements
35
+
36
+ We also want to thank all the users who have reported bugs, suggested features, or helped in any other way to make Crawl4AI better.
37
+
38
+ ---
39
+
40
+ If you've contributed to Crawl4AI and your name isn't on this list, please [open a pull request](https://github.com/unclecode/crawl4ai/pulls) with your name, link, and contribution, and we'll review it promptly.
41
+
42
+ Thank you all for your contributions!
Dockerfile ADDED
@@ -0,0 +1,136 @@
1
+ # syntax=docker/dockerfile:1.4
2
+
3
+ ARG TARGETPLATFORM
4
+ ARG BUILDPLATFORM
5
+
6
+ # Other build arguments
7
+ ARG PYTHON_VERSION=3.10
8
+
9
+ # Base stage with system dependencies
10
+ FROM python:${PYTHON_VERSION}-slim as base
11
+
12
+ # Declare ARG variables again within the build stage
13
+ ARG INSTALL_TYPE=basic
14
+ ARG ENABLE_GPU=false
15
+
16
+ # Platform-specific labels
17
+ LABEL maintainer="unclecode"
18
+ LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
19
+ LABEL version="1.0"
20
+
21
+ # Environment setup
22
+ ENV PYTHONUNBUFFERED=1 \
23
+ PYTHONDONTWRITEBYTECODE=1 \
24
+ PIP_NO_CACHE_DIR=1 \
25
+ PIP_DISABLE_PIP_VERSION_CHECK=1 \
26
+ PIP_DEFAULT_TIMEOUT=100 \
27
+ DEBIAN_FRONTEND=noninteractive
28
+
29
+ # Install system dependencies
30
+ RUN apt-get update && apt-get install -y --no-install-recommends \
31
+ build-essential \
32
+ curl \
33
+ wget \
34
+ gnupg \
35
+ git \
36
+ cmake \
37
+ pkg-config \
38
+ python3-dev \
39
+ libjpeg-dev \
40
+ libpng-dev \
41
+ && rm -rf /var/lib/apt/lists/*
42
+
43
+ # Playwright system dependencies for Linux
44
+ RUN apt-get update && apt-get install -y --no-install-recommends \
45
+ libglib2.0-0 \
46
+ libnss3 \
47
+ libnspr4 \
48
+ libatk1.0-0 \
49
+ libatk-bridge2.0-0 \
50
+ libcups2 \
51
+ libdrm2 \
52
+ libdbus-1-3 \
53
+ libxcb1 \
54
+ libxkbcommon0 \
55
+ libx11-6 \
56
+ libxcomposite1 \
57
+ libxdamage1 \
58
+ libxext6 \
59
+ libxfixes3 \
60
+ libxrandr2 \
61
+ libgbm1 \
62
+ libpango-1.0-0 \
63
+ libcairo2 \
64
+ libasound2 \
65
+ libatspi2.0-0 \
66
+ && rm -rf /var/lib/apt/lists/*
67
+
68
+ # GPU support if enabled and architecture is supported
69
+ RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \
70
+ apt-get update && apt-get install -y --no-install-recommends \
71
+ nvidia-cuda-toolkit \
72
+ && rm -rf /var/lib/apt/lists/* ; \
73
+ else \
74
+ echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
75
+ fi
76
+
77
+ # Create and set working directory
78
+ WORKDIR /app
79
+
80
+ # Copy the entire project
81
+ COPY . .
82
+
83
+ # Install base requirements
84
+ RUN pip install --no-cache-dir -r requirements.txt
85
+
86
+ # Install required library for FastAPI
87
+ RUN pip install fastapi uvicorn psutil
88
+
89
+ # Install ML dependencies first for better layer caching
90
+ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
91
+ pip install --no-cache-dir \
92
+ torch \
93
+ torchvision \
94
+ torchaudio \
95
+ scikit-learn \
96
+ nltk \
97
+ transformers \
98
+ tokenizers && \
99
+ python -m nltk.downloader punkt stopwords ; \
100
+ fi
101
+
102
+ # Install the package
103
+ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
104
+ pip install ".[all]" && \
105
+ python -m crawl4ai.model_loader ; \
106
+ elif [ "$INSTALL_TYPE" = "torch" ] ; then \
107
+ pip install ".[torch]" ; \
108
+ elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
109
+ pip install ".[transformer]" && \
110
+ python -m crawl4ai.model_loader ; \
111
+ else \
112
+ pip install "." ; \
113
+ fi
114
+
115
+ # Install MkDocs and required plugins
116
+ RUN pip install --no-cache-dir \
117
+ mkdocs \
118
+ mkdocs-material \
119
+ mkdocs-terminal \
120
+ pymdown-extensions
121
+
122
+ # Build MkDocs documentation
123
+ RUN mkdocs build
124
+
125
+ # Install Playwright and browsers
126
+ RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
127
+ playwright install chromium; \
128
+ elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
129
+ playwright install chromium; \
130
+ fi
131
+
132
+ # Expose port
133
+ EXPOSE 8000 11235 9222 8080
134
+
135
+ # Start the FastAPI server
136
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"]
LICENSE ADDED
@@ -0,0 +1,51 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
10
+
11
+ "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
12
+
13
+ "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
14
+
15
+ "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
16
+
17
+ "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
18
+
19
+ "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
20
+
21
+ "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
22
+
23
+ "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
24
+
25
+ "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
26
+
27
+ "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
28
+
29
+ 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
30
+
31
+ 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
32
+
33
+ 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
34
+
35
+ You must give any other recipients of the Work or Derivative Works a copy of this License; and
36
+ You must cause any modified files to carry prominent notices stating that You changed the files; and
37
+ You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
38
+ If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
39
+ You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
40
+
41
+ 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
42
+
43
+ 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
44
+
45
+ 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
46
+
47
+ 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
48
+
49
+ 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
50
+
51
+ END OF TERMS AND CONDITIONS
MANIFEST.in ADDED
@@ -0,0 +1,2 @@
1
+ include requirements.txt
2
+ recursive-include crawl4ai/js_snippet *.js
MISSION.md ADDED
@@ -0,0 +1,46 @@
1
+ # Mission
2
+
3
+ ![Mission Diagram](./docs/assets/pitch-dark.svg)
4
+
5
+ ### 1. The Data Capitalization Opportunity
6
+
7
+ We live in an unprecedented era of digital wealth creation. Every day, individuals and enterprises generate massive amounts of valuable digital footprints across various platforms, social media channels, messenger apps, and cloud services. While people can interact with their data within these platforms, there's an immense untapped opportunity to transform this data into true capital assets. Just as physical property became a foundational element of wealth creation, personal and enterprise data has the potential to become a new form of capital on balance sheets.
8
+
9
+ For individuals, this represents an opportunity to transform their digital activities into valuable assets. For enterprises, their internal communications, team discussions, and collaborative documents contain rich insights that could be structured and valued as intellectual capital. This wealth of information represents an unprecedented opportunity for value creation in the digital age.
10
+
11
+ ### 2. The Potential of Authentic Data
12
+
13
+ While synthetic data has played a crucial role in AI development, there's an enormous untapped potential in the authentic data generated by individuals and organizations. Every message, document, and interaction contains unique insights and patterns that could enhance AI development. The challenge isn't a lack of data - it's that most authentic human-generated data remains inaccessible for productive use.
14
+
15
+ By enabling willing participation in data sharing, we can unlock this vast reservoir of authentic human knowledge. This represents an opportunity to enhance AI development with diverse, real-world data that reflects the full spectrum of human experience and knowledge.
16
+
17
+ ## Our Pathway to Data Democracy
18
+
19
+ ### 1. Open-Source Foundation
20
+
21
+ Our first step is creating an open-source data extraction engine that empowers developers and innovators to build tools for data structuring and organization. This foundation ensures transparency, security, and community-driven development. By making these tools openly available, we enable the technical infrastructure needed for true data ownership and capitalization.
22
+
23
+ ### 2. Data Capitalization Platform
24
+
25
+ Building on this open-source foundation, we're developing a platform that helps individuals and enterprises transform their digital footprints into structured, valuable assets. This platform will provide the tools and frameworks needed to organize, understand, and value personal and organizational data as true capital assets.
26
+
27
+ ### 3. Creating a Data Marketplace
28
+
29
+ The final piece is establishing a marketplace where individuals and organizations can willingly share their data assets. This creates opportunities for:
30
+ - Individuals to earn equity, revenue, or other forms of value from their data
31
+ - Enterprises to access diverse, high-quality data for AI development
32
+ - Researchers to work with authentic human-generated data
33
+ - Startups to build innovative solutions using real-world data
34
+
35
+ ## Economic Vision: A Shared Data Economy
36
+
37
+ We envision a future where data becomes a fundamental asset class in a thriving shared economy. This transformation will democratize AI development by enabling willing participation in data sharing, ensuring that the benefits of AI advancement flow back to data creators. Just as property rights revolutionized economic systems, establishing data as a capital asset will create new opportunities for wealth creation and economic participation.
38
+
39
+ This shared data economy will:
40
+ - Enable individuals to capitalize on their digital footprints
41
+ - Create new revenue streams for data creators
42
+ - Provide AI developers with access to diverse, authentic data
43
+ - Foster innovation through broader access to real-world data
44
+ - Ensure more equitable distribution of AI's economic benefits
45
+
46
+ Our vision is to facilitate this transformation from the ground up - starting with open-source tools, progressing to data capitalization platforms, and ultimately creating a thriving marketplace where data becomes a true asset class in a shared economy. This approach ensures that the future of AI is built on a foundation of authentic human knowledge, with benefits flowing back to the individuals and organizations who create and share their valuable data.
README.md CHANGED
@@ -6,6 +6,565 @@ colorTo: pink
6
  sdk: docker
7
  pinned: false
8
  license: mit
 
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
+ port: 11235
10
  ---
11
 
12
+
13
+
14
+ # 🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper.
15
+
16
+ <div align="center">
17
+
18
+ <a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
19
+
20
+ [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
21
+ [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)
22
+
23
+ [![PyPI version](https://badge.fury.io/py/crawl4ai.svg)](https://badge.fury.io/py/crawl4ai)
24
+ [![Python Version](https://img.shields.io/pypi/pyversions/crawl4ai)](https://pypi.org/project/crawl4ai/)
25
+ [![Downloads](https://static.pepy.tech/badge/crawl4ai/month)](https://pepy.tech/project/crawl4ai)
26
+
27
+ <!-- [![Documentation Status](https://readthedocs.org/projects/crawl4ai/badge/?version=latest)](https://crawl4ai.readthedocs.io/) -->
28
+ [![License](https://img.shields.io/github/license/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/blob/main/LICENSE)
29
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
30
+ [![Security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit)
31
+ [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)](code_of_conduct.md)
32
+
33
+ </div>
34
+
35
+ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease.
36
+
37
+ [✨ Check out latest update v0.4.24x](#-recent-updates)
38
+
39
+ 🎉 **Version 0.4.24x is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog)
40
+
41
+ ## 🧐 Why Crawl4AI?
42
+
43
+ 1. **Built for LLMs**: Creates smart, concise Markdown optimized for RAG and fine-tuning applications.
44
+ 2. **Lightning Fast**: Delivers results 6x faster with real-time, cost-efficient performance.
45
+ 3. **Flexible Browser Control**: Offers session management, proxies, and custom hooks for seamless data access.
46
+ 4. **Heuristic Intelligence**: Uses advanced algorithms for efficient extraction, reducing reliance on costly models.
47
+ 5. **Open Source & Deployable**: Fully open-source with no API keys—ready for Docker and cloud integration.
48
+ 6. **Thriving Community**: Actively maintained by a vibrant community and the #1 trending GitHub repository.
49
+
50
+ ## 🚀 Quick Start
51
+
52
+ 1. Install Crawl4AI:
53
+ ```bash
54
+ # Install the package
55
+ pip install -U crawl4ai
56
+
57
+ # Run post-installation setup
58
+ crawl4ai-setup
59
+
60
+ # Verify your installation
61
+ crawl4ai-doctor
62
+ ```
63
+
64
+ If you encounter any browser-related issues, you can install the browsers manually:
65
+ ```bash
66
+ python -m playwright install --with-deps chromium
67
+ ```
68
+
69
+ 2. Run a simple web crawl:
70
+ ```python
71
+ import asyncio
72
+ from crawl4ai import *
73
+
74
+ async def main():
75
+ async with AsyncWebCrawler() as crawler:
76
+ result = await crawler.arun(
77
+ url="https://www.nbcnews.com/business",
78
+ )
79
+ print(result.markdown)
80
+
81
+ if __name__ == "__main__":
82
+ asyncio.run(main())
83
+ ```
84
+
85
+ ## ✨ Features
86
+
87
+ <details>
88
+ <summary>📝 <strong>Markdown Generation</strong></summary>
89
+
90
+ - 🧹 **Clean Markdown**: Generates clean, structured Markdown with accurate formatting.
91
+ - 🎯 **Fit Markdown**: Heuristic-based filtering to remove noise and irrelevant parts for AI-friendly processing.
92
+ - 🔗 **Citations and References**: Converts page links into a numbered reference list with clean citations.
93
+ - 🛠️ **Custom Strategies**: Users can create their own Markdown generation strategies tailored to specific needs.
94
+ - 📚 **BM25 Algorithm**: Employs BM25-based filtering for extracting core information and removing irrelevant content.
95
+ </details>
96
+
97
+ <details>
98
+ <summary>📊 <strong>Structured Data Extraction</strong></summary>
99
+
100
+ - 🤖 **LLM-Driven Extraction**: Supports all LLMs (open-source and proprietary) for structured data extraction.
101
+ - 🧱 **Chunking Strategies**: Implements chunking (topic-based, regex, sentence-level) for targeted content processing.
102
+ - 🌌 **Cosine Similarity**: Find relevant content chunks based on user queries for semantic extraction.
103
+ - 🔎 **CSS-Based Extraction**: Fast schema-based data extraction using XPath and CSS selectors.
104
+ - 🔧 **Schema Definition**: Define custom schemas for extracting structured JSON from repetitive patterns.
105
+
106
+ </details>
107
+
108
+ <details>
109
+ <summary>🌐 <strong>Browser Integration</strong></summary>
110
+
111
+ - 🖥️ **Managed Browser**: Use user-owned browsers with full control, avoiding bot detection.
112
+ - 🔄 **Remote Browser Control**: Connect to Chrome Developer Tools Protocol for remote, large-scale data extraction.
113
+ - 🔒 **Session Management**: Preserve browser states and reuse them for multi-step crawling.
114
+ - 🧩 **Proxy Support**: Seamlessly connect to proxies with authentication for secure access.
115
+ - ⚙️ **Full Browser Control**: Modify headers, cookies, user agents, and more for tailored crawling setups.
116
+ - 🌍 **Multi-Browser Support**: Compatible with Chromium, Firefox, and WebKit.
117
+ - 📐 **Dynamic Viewport Adjustment**: Automatically adjusts the browser viewport to match page content, ensuring complete rendering and capturing of all elements.
118
+
119
+ </details>
120
+
121
+ <details>
122
+ <summary>🔎 <strong>Crawling & Scraping</strong></summary>
123
+
124
+ - 🖼️ **Media Support**: Extract images, audio, videos, and responsive image formats like `srcset` and `picture`.
125
+ - 🚀 **Dynamic Crawling**: Execute JS and use async or sync waits for dynamic content extraction.
126
+ - 📸 **Screenshots**: Capture page screenshots during crawling for debugging or analysis.
127
+ - 📂 **Raw Data Crawling**: Directly process raw HTML (`raw:`) or local files (`file://`); see the sketch after this feature list.
128
+ - 🔗 **Comprehensive Link Extraction**: Extracts internal, external links, and embedded iframe content.
129
+ - 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior.
130
+ - 💾 **Caching**: Cache data for improved speed and to avoid redundant fetches.
131
+ - 📄 **Metadata Extraction**: Retrieve structured metadata from web pages.
132
+ - 📡 **IFrame Content Extraction**: Seamless extraction from embedded iframe content.
133
+ - 🕵️ **Lazy Load Handling**: Waits for images to fully load, ensuring no content is missed due to lazy loading.
134
+ - 🔄 **Full-Page Scanning**: Simulates scrolling to load and capture all dynamic content, perfect for infinite scroll pages.
135
+
136
+ </details>
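+
+ Building on the raw HTML (`raw:`) and local file (`file://`) inputs listed above, a minimal sketch:
+
+ ```python
+ import asyncio
+ from crawl4ai import AsyncWebCrawler
+
+ async def main():
+     raw_html = "<html><body><h1>Hello</h1><p>Inline content, no network needed.</p></body></html>"
+     async with AsyncWebCrawler() as crawler:
+         # Crawl an in-memory HTML string via the raw: prefix
+         result = await crawler.arun(url="raw:" + raw_html)
+         print(result.markdown)
+         # A local file can be crawled the same way:
+         # result = await crawler.arun(url="file:///absolute/path/to/page.html")
+
+ asyncio.run(main())
+ ```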
137
+
138
+ <details>
139
+ <summary>🚀 <strong>Deployment</strong></summary>
140
+
141
+ - 🐳 **Dockerized Setup**: Optimized Docker image with API server for easy deployment.
142
+ - 🔄 **API Gateway**: One-click deployment with secure token authentication for API-based workflows.
143
+ - 🌐 **Scalable Architecture**: Designed for mass-scale production and optimized server performance.
144
+ - ⚙️ **DigitalOcean Deployment**: Ready-to-deploy configurations for DigitalOcean and similar platforms.
145
+
146
+ </details>
147
+
148
+ <details>
149
+ <summary>🎯 <strong>Additional Features</strong></summary>
150
+
151
+ - 🕶️ **Stealth Mode**: Avoid bot detection by mimicking real users.
152
+ - 🏷️ **Tag-Based Content Extraction**: Refine crawling based on custom tags, headers, or metadata.
153
+ - 🔗 **Link Analysis**: Extract and analyze all links for detailed data exploration.
154
+ - 🛡️ **Error Handling**: Robust error management for seamless execution.
155
+ - 🔐 **CORS & Static Serving**: Supports filesystem-based caching and cross-origin requests.
156
+ - 📖 **Clear Documentation**: Simplified and updated guides for onboarding and advanced usage.
157
+ - 🙌 **Community Recognition**: Acknowledges contributors and pull requests for transparency.
158
+
159
+ </details>
160
+
161
+ ## Try it Now!
162
+
163
+ ✨ Play around with this [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing)
164
+
165
+ ✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/)
166
+
167
+ ## Installation 🛠️
168
+
169
+ Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker.
170
+
171
+ <details>
172
+ <summary>🐍 <strong>Using pip</strong></summary>
173
+
174
+ Choose the installation option that best fits your needs:
175
+
176
+ ### Basic Installation
177
+
178
+ For basic web crawling and scraping tasks:
179
+
180
+ ```bash
181
+ pip install crawl4ai
182
+ crawl4ai-setup # Setup the browser
183
+ ```
184
+
185
+ By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling.
186
+
187
+ 👉 **Note**: When you install Crawl4AI, the `crawl4ai-setup` command should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods:
188
+
189
+ 1. Through the command line:
190
+
191
+ ```bash
192
+ playwright install
193
+ ```
194
+
195
+ 2. If the above doesn't work, try this more specific command:
196
+
197
+ ```bash
198
+ python -m playwright install chromium
199
+ ```
200
+
201
+ This second method has proven to be more reliable in some cases.
202
+
203
+ ---
204
+
205
+ ### Installation with Synchronous Version
206
+
207
+ The sync version is deprecated and will be removed in future versions. If you need the synchronous version using Selenium:
208
+
209
+ ```bash
210
+ pip install crawl4ai[sync]
211
+ ```
212
+
213
+ ---
214
+
215
+ ### Development Installation
216
+
217
+ For contributors who plan to modify the source code:
218
+
219
+ ```bash
220
+ git clone https://github.com/unclecode/crawl4ai.git
221
+ cd crawl4ai
222
+ pip install -e . # Basic installation in editable mode
223
+ ```
224
+
225
+ Install optional features:
226
+
227
+ ```bash
228
+ pip install -e ".[torch]" # With PyTorch features
229
+ pip install -e ".[transformer]" # With Transformer features
230
+ pip install -e ".[cosine]" # With cosine similarity features
231
+ pip install -e ".[sync]" # With synchronous crawling (Selenium)
232
+ pip install -e ".[all]" # Install all optional features
233
+ ```
234
+
235
+ </details>
236
+
237
+ <details>
238
+ <summary>🐳 <strong>Docker Deployment</strong></summary>
239
+
240
+ > 🚀 **Major Changes Coming!** We're developing a completely new Docker implementation that will make deployment even more efficient and seamless. The current Docker setup is being deprecated in favor of this new solution.
241
+
242
+ ### Current Docker Support
243
+
244
+ The existing Docker implementation is being deprecated and will be replaced soon. If you still need to use Docker with the current version:
245
+
246
+ - 📚 [Deprecated Docker Setup](./docs/deprecated/docker-deployment.md) - Instructions for the current Docker implementation
247
+ - ⚠️ Note: This setup will be replaced in the next major release
248
+
249
+ ### What's Coming Next?
250
+
251
+ Our new Docker implementation will bring:
252
+ - Improved performance and resource efficiency
253
+ - Streamlined deployment process
254
+ - Better integration with Crawl4AI features
255
+ - Enhanced scalability options
256
+
257
+ Stay connected with our [GitHub repository](https://github.com/unclecode/crawl4ai) for updates!
258
+
259
+ </details>
260
+
261
+ ---
262
+
263
+ ### Quick Test
264
+
265
+ Run a quick test (works for both Docker options):
266
+
267
+ ```python
268
+ import time
+ import requests
269
+
270
+ # Submit a crawl job
271
+ response = requests.post(
272
+ "http://localhost:11235/crawl",
273
+ json={"urls": "https://example.com", "priority": 10}
274
+ )
275
+ task_id = response.json()["task_id"]
276
+
277
+ # Poll until the task is complete (status == "completed")
+ result = requests.get(f"http://localhost:11235/task/{task_id}")
+ while result.json().get("status") != "completed":
+     time.sleep(1)
+     result = requests.get(f"http://localhost:11235/task/{task_id}")
279
+ ```
280
+
281
+ For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://crawl4ai.com/mkdocs/basic/docker-deployment/).
282
+
283
+ </details>
284
+
285
+
286
+ ## 🔬 Advanced Usage Examples 🔬
287
+
288
+ You can check the project structure in the [docs/examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) directory, where you will find a variety of examples; some popular ones are shared here.
289
+
290
+ <details>
291
+ <summary>📝 <strong>Heuristic Markdown Generation with Clean and Fit Markdown</strong></summary>
292
+
293
+ ```python
294
+ import asyncio
295
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
296
+ from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter
297
+ from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
298
+
299
+ async def main():
300
+ browser_config = BrowserConfig(
301
+ headless=True,
302
+ verbose=True,
303
+ )
304
+ run_config = CrawlerRunConfig(
305
+ cache_mode=CacheMode.ENABLED,
306
+ markdown_generator=DefaultMarkdownGenerator(
307
+ content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
308
+ ),
309
+ # markdown_generator=DefaultMarkdownGenerator(
310
+ # content_filter=BM25ContentFilter(user_query="WHEN_WE_FOCUS_BASED_ON_A_USER_QUERY", bm25_threshold=1.0)
311
+ # ),
312
+ )
313
+
314
+ async with AsyncWebCrawler(config=browser_config) as crawler:
315
+ result = await crawler.arun(
316
+ url="https://docs.micronaut.io/4.7.6/guide/",
317
+ config=run_config
318
+ )
319
+ print(len(result.markdown))
320
+ print(len(result.fit_markdown))
321
+ print(len(result.markdown_v2.fit_markdown))
322
+
323
+ if __name__ == "__main__":
324
+ asyncio.run(main())
325
+ ```
326
+
327
+ </details>
328
+
329
+ <details>
330
+ <summary>🖥️ <strong>Executing JavaScript & Extract Structured Data without LLMs</strong></summary>
331
+
332
+ ```python
333
+ import asyncio
334
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
335
+ from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
336
+ import json
337
+
338
+ async def main():
339
+ schema = {
340
+ "name": "KidoCode Courses",
341
+ "baseSelector": "section.charge-methodology .w-tab-content > div",
342
+ "fields": [
343
+ {
344
+ "name": "section_title",
345
+ "selector": "h3.heading-50",
346
+ "type": "text",
347
+ },
348
+ {
349
+ "name": "section_description",
350
+ "selector": ".charge-content",
351
+ "type": "text",
352
+ },
353
+ {
354
+ "name": "course_name",
355
+ "selector": ".text-block-93",
356
+ "type": "text",
357
+ },
358
+ {
359
+ "name": "course_description",
360
+ "selector": ".course-content-text",
361
+ "type": "text",
362
+ },
363
+ {
364
+ "name": "course_icon",
365
+ "selector": ".image-92",
366
+ "type": "attribute",
367
+ "attribute": "src"
368
+ }
369
+ ]
370
+ }
371
+
372
+ extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
373
+
374
+ browser_config = BrowserConfig(
375
+ headless=False,
376
+ verbose=True
377
+ )
378
+ run_config = CrawlerRunConfig(
379
+ extraction_strategy=extraction_strategy,
380
+ js_code=["""(async () => {const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");for(let tab of tabs) {tab.scrollIntoView();tab.click();await new Promise(r => setTimeout(r, 500));}})();"""],
381
+ cache_mode=CacheMode.BYPASS
382
+ )
383
+
384
+ async with AsyncWebCrawler(config=browser_config) as crawler:
385
+
386
+ result = await crawler.arun(
387
+ url="https://www.kidocode.com/degrees/technology",
388
+ config=run_config
389
+ )
390
+
391
+ companies = json.loads(result.extracted_content)
392
+ print(f"Successfully extracted {len(companies)} companies")
393
+ print(json.dumps(companies[0], indent=2))
394
+
395
+
396
+ if __name__ == "__main__":
397
+ asyncio.run(main())
398
+ ```
399
+
400
+ </details>
401
+
402
+ <details>
403
+ <summary>📚 <strong>Extracting Structured Data with LLMs</strong></summary>
404
+
405
+ ```python
406
+ import os
407
+ import asyncio
408
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
409
+ from crawl4ai.extraction_strategy import LLMExtractionStrategy
410
+ from pydantic import BaseModel, Field
411
+
412
+ class OpenAIModelFee(BaseModel):
413
+ model_name: str = Field(..., description="Name of the OpenAI model.")
414
+ input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
415
+ output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
416
+
417
+ async def main():
418
+ browser_config = BrowserConfig(verbose=True)
419
+ run_config = CrawlerRunConfig(
420
+ word_count_threshold=1,
421
+ extraction_strategy=LLMExtractionStrategy(
422
+ # Here you can use any provider that Litellm library supports, for instance: ollama/qwen2
423
+ # provider="ollama/qwen2", api_token="no-token",
424
+ provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'),
425
+ schema=OpenAIModelFee.schema(),
426
+ extraction_type="schema",
427
+ instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
428
+ Do not miss any models in the entire content. One extracted model JSON format should look like this:
429
+ {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}."""
430
+ ),
431
+ cache_mode=CacheMode.BYPASS,
432
+ )
433
+
434
+ async with AsyncWebCrawler(config=browser_config) as crawler:
435
+ result = await crawler.arun(
436
+ url='https://openai.com/api/pricing/',
437
+ config=run_config
438
+ )
439
+ print(result.extracted_content)
440
+
441
+ if __name__ == "__main__":
442
+ asyncio.run(main())
443
+ ```
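+
+ Because the provider string is handed to the LiteLLM library, the same strategy can target a local model instead of OpenAI (for example the commented-out `provider="ollama/qwen2", api_token="no-token"` line above) without changing the schema or the instruction.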
444
+
445
+ </details>
446
+
447
+ <details>
448
+ <summary>🤖 <strong>Using Your Own Browser with a Custom User Profile</strong></summary>
449
+
450
+ ```python
451
+ import os, sys
452
+ from pathlib import Path
453
+ import asyncio, time
454
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
455
+
456
+ async def test_news_crawl():
457
+ # Create a persistent user data directory
458
+ user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile")
459
+ os.makedirs(user_data_dir, exist_ok=True)
460
+
461
+ browser_config = BrowserConfig(
462
+ verbose=True,
463
+ headless=True,
464
+ user_data_dir=user_data_dir,
465
+ use_persistent_context=True,
466
+ )
467
+ run_config = CrawlerRunConfig(
468
+ cache_mode=CacheMode.BYPASS
469
+ )
470
+
471
+ async with AsyncWebCrawler(config=browser_config) as crawler:
472
+ url = "ADDRESS_OF_A_CHALLENGING_WEBSITE"
473
+
474
+ result = await crawler.arun(
475
+ url,
476
+ config=run_config,
477
+ magic=True,
478
+ )
479
+
480
+ print(f"Successfully crawled {url}")
481
+ print(f"Content length: {len(result.markdown)}")
482
+ ```
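+
+ The snippet above only defines the `test_news_crawl()` coroutine and leaves the target URL as a placeholder. A minimal way to run it, assuming you substitute a real address, is:
+
+ ```python
+ if __name__ == "__main__":
+     asyncio.run(test_news_crawl())
+ ```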
483
+
484
+ </details>
485
+
486
+
487
+ ## ✨ Recent Updates
488
+
489
+ - 🔒 **Enhanced SSL & Security**: New SSL certificate handling with custom paths and validation options for secure crawling
490
+ - 🔍 **Smart Content Filtering**: Advanced filtering system with regex support and efficient chunking strategies
491
+ - 📦 **Improved JSON Extraction**: Support for complex JSONPath, JSON-CSS, and Microdata extraction
492
+ - 🏗️ **New Field Types**: Added `computed`, `conditional`, `aggregate`, and `template` field types
493
+ - ⚡ **Performance Boost**: Optimized caching, parallel processing, and memory management
494
+ - 🐛 **Better Error Handling**: Enhanced debugging capabilities with detailed error tracking
495
+ - 🔐 **Security Features**: Improved input validation and safe expression evaluation
496
+
497
+ Read the full details of this release in our [0.4.24 Release Notes](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md).
498
+
499
+ ## 📖 Documentation & Roadmap
500
+
501
+ > 🚨 **Documentation Update Alert**: We're undertaking a major documentation overhaul next week to reflect recent updates and improvements. Stay tuned for a more comprehensive and up-to-date guide!
502
+
503
+ For current documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/).
504
+
505
+ To check our development plans and upcoming features, visit our [Roadmap](https://github.com/unclecode/crawl4ai/blob/main/ROADMAP.md).
506
+
507
+ <details>
508
+ <summary>📈 <strong>Development TODOs</strong></summary>
509
+
510
+ - [x] 0. Graph Crawler: Smart website traversal using graph search algorithms for comprehensive nested page extraction
511
+ - [ ] 1. Question-Based Crawler: Natural language driven web discovery and content extraction
512
+ - [ ] 2. Knowledge-Optimal Crawler: Smart crawling that maximizes knowledge while minimizing data extraction
513
+ - [ ] 3. Agentic Crawler: Autonomous system for complex multi-step crawling operations
514
+ - [ ] 4. Automated Schema Generator: Convert natural language to extraction schemas
515
+ - [ ] 5. Domain-Specific Scrapers: Pre-configured extractors for common platforms (academic, e-commerce)
516
+ - [ ] 6. Web Embedding Index: Semantic search infrastructure for crawled content
517
+ - [ ] 7. Interactive Playground: Web UI for testing, comparing strategies with AI assistance
518
+ - [ ] 8. Performance Monitor: Real-time insights into crawler operations
519
+ - [ ] 9. Cloud Integration: One-click deployment solutions across cloud providers
520
+ - [ ] 10. Sponsorship Program: Structured support system with tiered benefits
521
+ - [ ] 11. Educational Content: "How to Crawl" video series and interactive tutorials
522
+
523
+ </details>
524
+
525
+ ## 🤝 Contributing
526
+
527
+ We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md) for more information.
528
+
529
+ ## 📄 License
530
+
531
+ Crawl4AI is released under the [Apache 2.0 License](https://github.com/unclecode/crawl4ai/blob/main/LICENSE).
532
+
533
+ ## 📧 Contact
534
+
535
+ For questions, suggestions, or feedback, feel free to reach out:
536
+
537
+ - GitHub: [unclecode](https://github.com/unclecode)
538
+ - Twitter: [@unclecode](https://twitter.com/unclecode)
539
+ - Website: [crawl4ai.com](https://crawl4ai.com)
540
+
541
+ Happy Crawling! 🕸️🚀
542
+
543
+ ## 🗾 Mission
544
+
545
+ Our mission is to unlock the value of personal and enterprise data by transforming digital footprints into structured, tradeable assets. Crawl4AI empowers individuals and organizations with open-source tools to extract and structure data, fostering a shared data economy.
546
+
547
+ We envision a future where AI is powered by real human knowledge, ensuring data creators directly benefit from their contributions. By democratizing data and enabling ethical sharing, we are laying the foundation for authentic AI advancement.
548
+
549
+ <details>
550
+ <summary>🔑 <strong>Key Opportunities</strong></summary>
551
+
552
+ - **Data Capitalization**: Transform digital footprints into measurable, valuable assets.
553
+ - **Authentic AI Data**: Provide AI systems with real human insights.
554
+ - **Shared Economy**: Create a fair data marketplace that benefits data creators.
555
+
556
+ </details>
557
+
558
+ <details>
559
+ <summary>🚀 <strong>Development Pathway</strong></summary>
560
+
561
+ 1. **Open-Source Tools**: Community-driven platforms for transparent data extraction.
562
+ 2. **Digital Asset Structuring**: Tools to organize and value digital knowledge.
563
+ 3. **Ethical Data Marketplace**: A secure, fair platform for exchanging structured data.
564
+
565
+ For more details, see our [full mission statement](./MISSION.md).
566
+ </details>
567
+
568
+ ## Star History
569
+
570
+ [![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date)
ROADMAP.md ADDED
@@ -0,0 +1,503 @@
1
+ # Crawl4AI Strategic Roadmap
2
+
3
+ ```mermaid
4
+ %%{init: {'themeVariables': { 'fontSize': '14px'}}}%%
5
+ graph TD
6
+ subgraph A1[Advanced Crawling Systems 🔧]
7
+ A["`
8
+ • Graph Crawler ✓
9
+ • Question-Based Crawler
10
+ • Knowledge-Optimal Crawler
11
+ • Agentic Crawler
12
+ `"]
13
+ end
14
+
15
+ subgraph A2[Specialized Features 🛠️]
16
+ B["`
17
+ • Automated Schema Generator
18
+ • Domain-Specific Scrapers
19
+
20
+
21
+ `"]
22
+ end
23
+
24
+ subgraph A3[Development Tools 🔨]
25
+ C["`
26
+ • Interactive Playground
27
+ • Performance Monitor
28
+ • Cloud Integration
29
+
30
+ `"]
31
+ end
32
+
33
+ subgraph A4[Community & Growth 🌱]
34
+ D["`
35
+ • Sponsorship Program
36
+ • Educational Content
37
+
38
+
39
+ `"]
40
+ end
41
+
42
+ classDef default fill:#f9f9f9,stroke:#333,stroke-width:2px
43
+ classDef section fill:#f0f0f0,stroke:#333,stroke-width:4px,rx:10
44
+ class A1,A2,A3,A4 section
45
+
46
+ %% Layout hints
47
+ A1 --> A2[" "]
48
+ A3 --> A4[" "]
49
+ linkStyle 0,1 stroke:none
50
+ ```
51
+
52
+ Crawl4AI is evolving to provide more intelligent, efficient, and versatile web crawling capabilities. This roadmap outlines the key developments and features planned for the project, organized into strategic sections that build upon our current foundation.
53
+
54
+ ## 1. Advanced Crawling Systems 🔧
55
+
56
+ This section introduces three powerful crawling systems that extend Crawl4AI's capabilities from basic web crawling to intelligent, purpose-driven data extraction.
57
+
58
+ ### 1.1 Question-Based Crawler
59
+ The Question-Based Crawler enhances our core engine by enabling automatic discovery and extraction of relevant web content based on natural language questions.
60
+
61
+ Key Features:
62
+ - SerpAPI integration for intelligent web search
63
+ - Relevancy scoring for search results
64
+ - Automatic URL discovery and prioritization
65
+ - Cross-source validation
66
+
67
+ ```python
68
+ from crawl4ai import AsyncWebCrawler
69
+ from crawl4ai.discovery import QuestionBasedDiscovery
70
+
71
+ async with AsyncWebCrawler() as crawler:
72
+ discovery = QuestionBasedDiscovery(crawler)
73
+ results = await discovery.arun(
74
+ question="What are the system requirements for major cloud providers' GPU instances?",
75
+ max_urls=5,
76
+ relevance_threshold=0.7
77
+ )
78
+
79
+ for result in results:
80
+ print(f"Source: {result.url} (Relevance: {result.relevance_score})")
81
+ print(f"Content: {result.markdown}\n")
82
+ ```
83
+
84
+ ### 1.2 Knowledge-Optimal Crawler
85
+ An intelligent crawling system that solves the optimization problem of minimizing data extraction while maximizing knowledge acquisition for specific objectives.
86
+
87
+ Key Features:
88
+ - Smart content prioritization
89
+ - Minimal data extraction for maximum knowledge
90
+ - Probabilistic relevance assessment
91
+ - Objective-driven crawling paths
92
+
93
+ ```python
94
+ from crawl4ai import AsyncWebCrawler
95
+ from crawl4ai.optimization import KnowledgeOptimizer
96
+
97
+ async with AsyncWebCrawler() as crawler:
98
+ optimizer = KnowledgeOptimizer(
99
+ objective="Understand GPU instance pricing and limitations across cloud providers",
100
+ required_knowledge=[
101
+ "pricing structure",
102
+ "GPU specifications",
103
+ "usage limits",
104
+ "availability zones"
105
+ ],
106
+ confidence_threshold=0.85
107
+ )
108
+
109
+ result = await crawler.arun(
110
+ urls=[
111
+ "https://aws.amazon.com/ec2/pricing/",
112
+ "https://cloud.google.com/gpu",
113
+ "https://azure.microsoft.com/pricing/"
114
+ ],
115
+ optimizer=optimizer,
116
+ optimization_mode="minimal_extraction"
117
+ )
118
+
119
+ print(f"Knowledge Coverage: {result.knowledge_coverage}")
120
+ print(f"Data Efficiency: {result.efficiency_ratio}")
121
+ print(f"Extracted Content: {result.optimal_content}")
122
+ ```
123
+
124
+ ### 1.3 Agentic Crawler
125
+ An autonomous system capable of understanding complex goals and automatically planning and executing multi-step crawling operations.
126
+
127
+ Key Features:
128
+ - Autonomous goal interpretation
129
+ - Dynamic step planning
130
+ - Interactive navigation capabilities
131
+ - Visual recognition and interaction
132
+ - Automatic error recovery
133
+
134
+ ```python
135
+ from crawl4ai import AsyncWebCrawler
136
+ from crawl4ai.agents import CrawlerAgent
137
+
138
+ async with AsyncWebCrawler() as crawler:
139
+ agent = CrawlerAgent(crawler)
140
+
141
+ # Automatic planning and execution
142
+ result = await agent.arun(
143
+ goal="Find research papers about quantum computing published in 2023 with more than 50 citations",
144
+ auto_retry=True
145
+ )
146
+ print("Generated Plan:", result.executed_steps)
147
+ print("Extracted Data:", result.data)
148
+
149
+ # Using custom steps with automatic execution
150
+ result = await agent.arun(
151
+ goal="Extract conference deadlines from ML conferences",
152
+ custom_plan=[
153
+ "Navigate to conference page",
154
+ "Find important dates section",
155
+ "Extract submission deadlines",
156
+ "Verify dates are for 2024"
157
+ ]
158
+ )
159
+
160
+ # Monitoring execution
161
+ print("Step Completion:", result.step_status)
162
+ print("Execution Time:", result.execution_time)
163
+ print("Success Rate:", result.success_rate)
164
+ ```
165
+
166
+ ## 2. Specialized Features 🛠️
167
+
168
+ This section introduces specialized tools and features that enhance Crawl4AI's capabilities for specific use cases and data extraction needs.
169
+
170
+ ### 2.1 Automated Schema Generator
171
+ A system that automatically generates JsonCssExtractionStrategy schemas from natural language descriptions, making structured data extraction accessible to all users.
172
+
173
+ Key Features:
174
+ - Natural language schema generation
175
+ - Automatic pattern detection
176
+ - Predefined schema templates
177
+ - Chrome extension for visual schema building
178
+
179
+ ```python
180
+ from crawl4ai import AsyncWebCrawler
181
+ from crawl4ai.schema import SchemaGenerator
182
+
183
+ # Generate schema from natural language description
184
+ generator = SchemaGenerator()
185
+ schema = await generator.generate(
186
+ url="https://news-website.com",
187
+ description="For each news article on the page, I need the headline, publication date, and main image"
188
+ )
189
+
190
+ # Use generated schema with crawler
191
+ async with AsyncWebCrawler() as crawler:
192
+ result = await crawler.arun(
193
+ url="https://news-website.com",
194
+ extraction_strategy=schema
195
+ )
196
+
197
+ # Example of generated schema:
198
+ """
199
+ {
200
+ "name": "News Article Extractor",
201
+ "baseSelector": "article.news-item",
202
+ "fields": [
203
+ {
204
+ "name": "headline",
205
+ "selector": "h2.article-title",
206
+ "type": "text"
207
+ },
208
+ {
209
+ "name": "date",
210
+ "selector": "span.publish-date",
211
+ "type": "text"
212
+ },
213
+ {
214
+ "name": "image",
215
+ "selector": "img.article-image",
216
+ "type": "attribute",
217
+ "attribute": "src"
218
+ }
219
+ ]
220
+ }
221
+ """
222
+ ```
223
+
224
+ ### 2.2 Domain-Specific Scrapers
225
+ Specialized extraction strategies optimized for common website types and platforms, providing consistent and reliable data extraction without additional configuration.
226
+
227
+ Key Features:
228
+ - Pre-configured extractors for popular platforms
229
+ - Academic site specialization (arXiv, NCBI)
230
+ - E-commerce standardization
231
+ - Documentation site handling
232
+
233
+ ```python
234
+ from crawl4ai import AsyncWebCrawler
235
+ from crawl4ai.extractors import AcademicExtractor, EcommerceExtractor
236
+
237
+ async with AsyncWebCrawler() as crawler:
238
+ # Academic paper extraction
239
+ papers = await crawler.arun(
240
+ url="https://arxiv.org/list/cs.AI/recent",
241
+ extractor="academic", # Built-in extractor type
242
+ site_type="arxiv", # Specific site optimization
243
+ extract_fields=[
244
+ "title",
245
+ "authors",
246
+ "abstract",
247
+ "citations"
248
+ ]
249
+ )
250
+
251
+ # E-commerce product data
252
+ products = await crawler.arun(
253
+ url="https://store.example.com/products",
254
+ extractor="ecommerce",
255
+ extract_fields=[
256
+ "name",
257
+ "price",
258
+ "availability",
259
+ "reviews"
260
+ ]
261
+ )
262
+ ```
263
+
264
+ ### 2.3 Web Embedding Index
265
+ Creates and maintains a semantic search infrastructure for crawled content, enabling efficient retrieval and querying of web content through vector embeddings.
266
+
267
+ Key Features:
268
+ - Automatic embedding generation
269
+ - Intelligent content chunking
270
+ - Efficient vector storage and indexing
271
+ - Semantic search capabilities
272
+
273
+ ```python
274
+ from crawl4ai import AsyncWebCrawler
275
+ from crawl4ai.indexing import WebIndex
276
+
277
+ # Initialize and build index
278
+ index = WebIndex(model="efficient-mini")
279
+
280
+ async with AsyncWebCrawler() as crawler:
281
+ # Crawl and index content
282
+ await index.build(
283
+ urls=["https://docs.example.com"],
284
+ crawler=crawler,
285
+ options={
286
+ "chunk_method": "semantic",
287
+ "update_policy": "incremental",
288
+ "embedding_batch_size": 100
289
+ }
290
+ )
291
+
292
+ # Search through indexed content
293
+ results = await index.search(
294
+ query="How to implement OAuth authentication?",
295
+ filters={
296
+ "content_type": "technical",
297
+ "recency": "6months"
298
+ },
299
+ top_k=5
300
+ )
301
+
302
+ # Get similar content
303
+ similar = await index.find_similar(
304
+ url="https://docs.example.com/auth/oauth",
305
+ threshold=0.85
306
+ )
307
+ ```
308
+
309
+ Each of these specialized features builds upon Crawl4AI's core functionality while providing targeted solutions for specific use cases. They can be used independently or combined for more complex data extraction and processing needs.
310
+
311
+ ## 3. Development Tools 🔨
312
+
313
+ This section covers tools designed to enhance the development experience, monitoring, and deployment of Crawl4AI applications.
314
+
315
+ ### 3.1 Crawl4AI Playground 🎮
316
+
317
+ The Crawl4AI Playground is an interactive web-based development environment that simplifies web scraping experimentation, development, and deployment. With its intuitive interface and AI-powered assistance, users can quickly prototype, test, and deploy web scraping solutions.
318
+
319
+ #### Key Features 🌟
320
+
321
+ ##### Visual Strategy Builder
322
+ - Interactive point-and-click interface for building extraction strategies
323
+ - Real-time preview of selected elements
324
+ - Side-by-side comparison of different extraction approaches
325
+ - Visual validation of CSS selectors and XPath queries
326
+
327
+ ##### AI Assistant Integration
328
+ - Strategy recommendations based on target website analysis
329
+ - Parameter optimization suggestions
330
+ - Best practices guidance for specific use cases
331
+ - Automated error detection and resolution
332
+ - Performance optimization tips
333
+
334
+ ##### Real-Time Testing & Validation
335
+ - Live preview of extraction results
336
+ - Side-by-side comparison of multiple strategies
337
+ - Performance metrics visualization
338
+ - Automatic validation of extracted data
339
+ - Error detection and debugging tools
340
+
341
+ ##### Project Management
342
+ - Save and organize multiple scraping projects
343
+ - Version control for configurations
344
+ - Export/import project settings
345
+ - Share configurations with team members
346
+ - Project templates for common use cases
347
+
348
+ ##### Deployment Pipeline
349
+ - One-click deployment to various environments
350
+ - Docker container generation
351
+ - Cloud deployment templates (AWS, GCP, Azure)
352
+ - Scaling configuration management
353
+ - Monitoring setup automation
354
+
355
+
356
+ ### 3.2 Performance Monitoring System
357
+ A comprehensive monitoring solution providing real-time insights into crawler operations, resource usage, and system health through both CLI and GUI interfaces.
358
+
359
+ Key Features:
360
+ - Real-time resource tracking
361
+ - Active crawl monitoring
362
+ - Performance statistics
363
+ - Customizable alerting system
364
+
365
+ ```python
366
+ from crawl4ai import AsyncWebCrawler
367
+ from crawl4ai.monitor import CrawlMonitor
368
+
369
+ # Initialize monitoring
370
+ monitor = CrawlMonitor()
371
+
372
+ # Start monitoring with CLI interface
373
+ await monitor.start(
374
+ mode="cli", # or "gui"
375
+ refresh_rate="1s",
376
+ metrics={
377
+ "resources": ["cpu", "memory", "network"],
378
+ "crawls": ["active", "queued", "completed"],
379
+ "performance": ["success_rate", "response_times"]
380
+ }
381
+ )
382
+
383
+ # Example CLI output:
384
+ """
385
+ Crawl4AI Monitor (Live) - Press Q to exit
386
+ ────────────────────────────────────────
387
+ System Usage:
388
+ ├─ CPU: ███████░░░ 70%
389
+ └─ Memory: ████░░░░░ 2.1GB/8GB
390
+
391
+ Active Crawls:
392
+ ID URL Status Progress
393
+ 001 docs.example.com 🟢 Active 75%
394
+ 002 api.service.com 🟡 Queue -
395
+
396
+ Metrics (Last 5min):
397
+ ├─ Success Rate: 98%
398
+ ├─ Avg Response: 0.6s
399
+ └─ Pages/sec: 8.5
400
+ """
401
+ ```
402
+
403
+ ### 3.3 Cloud Integration
404
+ Streamlined deployment tools for setting up Crawl4AI in various cloud environments, with support for scaling and monitoring.
405
+
406
+ Key Features:
407
+ - One-click deployment solutions
408
+ - Auto-scaling configuration
409
+ - Load balancing setup
410
+ - Cloud-specific optimizations
411
+ - Monitoring integration
412
+
413
+ ```python
414
+ from crawl4ai import AsyncWebCrawler
415
+ from crawl4ai.deploy import CloudDeployer
416
+
417
+ # Initialize deployer
418
+ deployer = CloudDeployer()
419
+
420
+ # Deploy crawler service
421
+ deployment = await deployer.deploy(
422
+ service_name="crawler-cluster",
423
+ platform="aws", # or "gcp", "azure"
424
+ config={
425
+ "instance_type": "compute-optimized",
426
+ "auto_scaling": {
427
+ "min_instances": 2,
428
+ "max_instances": 10,
429
+ "scale_based_on": "cpu_usage"
430
+ },
431
+ "region": "us-east-1",
432
+ "monitoring": True
433
+ }
434
+ )
435
+
436
+ # Get deployment status and endpoints
437
+ print(f"Service Status: {deployment.status}")
438
+ print(f"API Endpoint: {deployment.endpoint}")
439
+ print(f"Monitor URL: {deployment.monitor_url}")
440
+ ```
441
+
442
+ These development tools work together to provide a comprehensive environment for developing, testing, monitoring, and deploying Crawl4AI applications. The Playground helps users experiment and generate optimal configurations, the Performance Monitor ensures smooth operation, and the Cloud Integration tools simplify deployment and scaling.
443
+
444
+ ## 4. Community & Growth 🌱
445
+
446
+ This section outlines initiatives designed to build and support the Crawl4AI community, provide educational resources, and ensure sustainable project growth.
447
+
448
+ ### 4.1 Sponsorship Program
449
+ A structured program to support ongoing development and maintenance of Crawl4AI while providing valuable benefits to sponsors.
450
+
451
+ Key Features:
452
+ - Multiple sponsorship tiers
453
+ - Sponsor recognition system
454
+ - Priority support for sponsors
455
+ - Early access to new features
456
+ - Custom feature development opportunities
457
+
458
+ Program Structure (not yet finalized):
459
+ ```
460
+ Sponsorship Tiers:
461
+
462
+ 🥉 Bronze Supporter
463
+ - GitHub Sponsor badge
464
+ - Priority issue response
465
+ - Community Discord role
466
+
467
+ 🥈 Silver Supporter
468
+ - All Bronze benefits
469
+ - Technical support channel
470
+ - Vote on roadmap priorities
471
+ - Early access to beta features
472
+
473
+ 🥇 Gold Supporter
474
+ - All Silver benefits
475
+ - Custom feature requests
476
+ - Direct developer access
477
+ - Private support sessions
478
+
479
+ 💎 Diamond Partner
480
+ - All Gold benefits
481
+ - Custom development
482
+ - On-demand consulting
483
+ - Integration support
484
+ ```
485
+
486
+ ### 4.2 "How to Crawl" Video Series
487
+ A comprehensive educational resource teaching users how to effectively use Crawl4AI for various web scraping and data extraction scenarios.
488
+
489
+ Key Features:
490
+ - Step-by-step tutorials
491
+ - Real-world use cases
492
+ - Best practices
493
+ - Integration guides
494
+ - Advanced feature deep-dives
495
+
496
+ These community initiatives are designed to:
497
+ - Provide comprehensive learning resources
498
+ - Foster a supportive user community
499
+ - Ensure sustainable project development
500
+ - Share knowledge and best practices
501
+ - Create opportunities for collaboration
502
+
503
+ The combination of structured support through sponsorship, educational content through video series, and interactive learning through the playground creates a robust ecosystem for both new and experienced users of Crawl4AI.
crawl4ai/__init__.py ADDED
@@ -0,0 +1,46 @@
1
+ # __init__.py
2
+
3
+ from .async_webcrawler import AsyncWebCrawler, CacheMode
4
+ from .async_configs import BrowserConfig, CrawlerRunConfig
5
+ from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy
6
+ from .chunking_strategy import ChunkingStrategy, RegexChunking
7
+ from .markdown_generation_strategy import DefaultMarkdownGenerator
8
+ from .content_filter_strategy import PruningContentFilter, BM25ContentFilter
9
+ from .models import CrawlResult
10
+ from .__version__ import __version__
11
+
12
+ __all__ = [
13
+ "AsyncWebCrawler",
14
+ "CrawlResult",
15
+ "CacheMode",
16
+ 'BrowserConfig',
17
+ 'CrawlerRunConfig',
18
+ 'ExtractionStrategy',
19
+ 'LLMExtractionStrategy',
20
+ 'CosineStrategy',
21
+ 'JsonCssExtractionStrategy',
22
+ 'ChunkingStrategy',
23
+ 'RegexChunking',
24
+ 'DefaultMarkdownGenerator',
25
+ 'PruningContentFilter',
26
+ 'BM25ContentFilter',
27
+ ]
28
+
29
+ def is_sync_version_installed():
30
+ try:
31
+ import selenium
32
+ return True
33
+ except ImportError:
34
+ return False
35
+
36
+ if is_sync_version_installed():
37
+ try:
38
+ from .web_crawler import WebCrawler
39
+ __all__.append("WebCrawler")
40
+ except ImportError:
41
+ import warnings
42
+ print("Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies.")
43
+ else:
44
+ WebCrawler = None
45
+ # import warnings
46
+ # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
crawl4ai/__version__.py ADDED
@@ -0,0 +1,2 @@
1
+ # crawl4ai/_version.py
2
+ __version__ = "0.4.247"
crawl4ai/async_configs.py ADDED
@@ -0,0 +1,603 @@
1
+ from .config import (
2
+ MIN_WORD_THRESHOLD,
3
+ IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
4
+ SCREENSHOT_HEIGHT_TRESHOLD,
5
+ PAGE_TIMEOUT,
6
+ IMAGE_SCORE_THRESHOLD,
7
+ SOCIAL_MEDIA_DOMAINS,
8
+
9
+ )
10
+ from .user_agent_generator import UserAgentGenerator
11
+ from .extraction_strategy import ExtractionStrategy
12
+ from .chunking_strategy import ChunkingStrategy
13
+ from .markdown_generation_strategy import MarkdownGenerationStrategy
14
+ from typing import Union, List
15
+
16
+
17
+ class BrowserConfig:
18
+ """
19
+ Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
20
+
21
+ This class centralizes all parameters that affect browser and context creation. Instead of passing
22
+ scattered keyword arguments, users can instantiate and modify this configuration object. The crawler
23
+ code will then reference these settings to initialize the browser in a consistent, documented manner.
24
+
25
+ Attributes:
26
+ browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
27
+ Default: "chromium".
28
+ headless (bool): Whether to run the browser in headless mode (no visible GUI).
29
+ Default: True.
30
+ use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
31
+ advanced manipulation. Default: False.
32
+ debugging_port (int): Port for the browser debugging protocol. Default: 9222.
33
+ use_persistent_context (bool): Use a persistent browser context (like a persistent profile).
34
+ Automatically sets use_managed_browser=True. Default: False.
35
+ user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
36
+ temporary directory may be used. Default: None.
37
+ chrome_channel (str): The Chrome channel to launch (e.g., "chrome", "msedge"). Only applies if browser_type
38
+ is "chromium". Default: "chromium".
39
+ channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if browser_type
40
+ is "chromium". Default: "chromium".
41
+ proxy (str or None): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used.
42
+ Default: None.
43
+ proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
44
+ If None, no additional proxy config. Default: None.
45
+ viewport_width (int): Default viewport width for pages. Default: 1080.
46
+ viewport_height (int): Default viewport height for pages. Default: 600.
47
+ verbose (bool): Enable verbose logging.
48
+ Default: True.
49
+ accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path.
50
+ Default: False.
51
+ downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True,
52
+ a default path will be created. Default: None.
53
+ storage_state (str or dict or None): Path or object describing storage state (cookies, localStorage).
54
+ Default: None.
55
+ ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True.
56
+ java_script_enabled (bool): Enable JavaScript execution in pages. Default: True.
57
+ cookies (list): List of cookies to add to the browser context. Each cookie is a dict with fields like
58
+ {"name": "...", "value": "...", "url": "..."}.
59
+ Default: [].
60
+ headers (dict): Extra HTTP headers to apply to all requests in this context.
61
+ Default: {}.
62
+ user_agent (str): Custom User-Agent string to use. Default: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
63
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36".
64
+ user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided
65
+ user_agent as-is. Default: None.
66
+ user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
67
+ Default: None.
68
+ text_mode (bool): If True, disables images and other rich content for potentially faster load times.
69
+ Default: False.
70
+ light_mode (bool): Disables certain background features for performance gains. Default: False.
71
+ extra_args (list): Additional command-line arguments passed to the browser.
72
+ Default: [].
73
+ """
74
+
75
+ def __init__(
76
+ self,
77
+ browser_type: str = "chromium",
78
+ headless: bool = True,
79
+ use_managed_browser: bool = False,
80
+ use_persistent_context: bool = False,
81
+ user_data_dir: str = None,
82
+ chrome_channel: str = "chromium",
83
+ channel: str = "chromium",
84
+ proxy: str = None,
85
+ proxy_config: dict = None,
86
+ viewport_width: int = 1080,
87
+ viewport_height: int = 600,
88
+ accept_downloads: bool = False,
89
+ downloads_path: str = None,
90
+ storage_state=None,
91
+ ignore_https_errors: bool = True,
92
+ java_script_enabled: bool = True,
93
+ sleep_on_close: bool = False,
94
+ verbose: bool = True,
95
+ cookies: list = None,
96
+ headers: dict = None,
97
+ user_agent: str = (
98
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 "
99
+ "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
100
+ ),
101
+ user_agent_mode: str = None,
102
+ user_agent_generator_config: dict = None,
103
+ text_mode: bool = False,
104
+ light_mode: bool = False,
105
+ extra_args: list = None,
106
+ debugging_port : int = 9222,
107
+ ):
108
+ self.browser_type = browser_type
109
+ self.headless = headless
110
+ self.use_managed_browser = use_managed_browser
111
+ self.use_persistent_context = use_persistent_context
112
+ self.user_data_dir = user_data_dir
113
+ self.chrome_channel = chrome_channel or self.browser_type or "chromium"
114
+ self.channel = channel or self.browser_type or "chromium"
115
+ self.proxy = proxy
116
+ self.proxy_config = proxy_config
117
+ self.viewport_width = viewport_width
118
+ self.viewport_height = viewport_height
119
+ self.accept_downloads = accept_downloads
120
+ self.downloads_path = downloads_path
121
+ self.storage_state = storage_state
122
+ self.ignore_https_errors = ignore_https_errors
123
+ self.java_script_enabled = java_script_enabled
124
+ self.cookies = cookies if cookies is not None else []
125
+ self.headers = headers if headers is not None else {}
126
+ self.user_agent = user_agent
127
+ self.user_agent_mode = user_agent_mode
128
+ self.user_agent_generator_config = user_agent_generator_config
129
+ self.text_mode = text_mode
130
+ self.light_mode = light_mode
131
+ self.extra_args = extra_args if extra_args is not None else []
132
+ self.sleep_on_close = sleep_on_close
133
+ self.verbose = verbose
134
+ self.debugging_port = debugging_port
135
+
136
+ user_agent_generator = UserAgentGenerator()
137
+ if self.user_agent_mode != "random" and self.user_agent_generator_config:
138
+ self.user_agent = user_agent_generator.generate(
139
+ **(self.user_agent_generator_config or {})
140
+ )
141
+ elif self.user_agent_mode == "random":
142
+ self.user_agent = user_agent_generator.generate()
143
+ else:
144
+ pass
145
+
146
+ self.browser_hint = user_agent_generator.generate_client_hints(self.user_agent)
147
+ self.headers.setdefault("sec-ch-ua", self.browser_hint)
148
+
149
+ # If persistent context is requested, ensure managed browser is enabled
150
+ if self.use_persistent_context:
151
+ self.use_managed_browser = True
152
+
153
+ @staticmethod
154
+ def from_kwargs(kwargs: dict) -> "BrowserConfig":
155
+ return BrowserConfig(
156
+ browser_type=kwargs.get("browser_type", "chromium"),
157
+ headless=kwargs.get("headless", True),
158
+ use_managed_browser=kwargs.get("use_managed_browser", False),
159
+ use_persistent_context=kwargs.get("use_persistent_context", False),
160
+ user_data_dir=kwargs.get("user_data_dir"),
161
+ chrome_channel=kwargs.get("chrome_channel", "chromium"),
162
+ channel=kwargs.get("channel", "chromium"),
163
+ proxy=kwargs.get("proxy"),
164
+ proxy_config=kwargs.get("proxy_config"),
165
+ viewport_width=kwargs.get("viewport_width", 1080),
166
+ viewport_height=kwargs.get("viewport_height", 600),
167
+ accept_downloads=kwargs.get("accept_downloads", False),
168
+ downloads_path=kwargs.get("downloads_path"),
169
+ storage_state=kwargs.get("storage_state"),
170
+ ignore_https_errors=kwargs.get("ignore_https_errors", True),
171
+ java_script_enabled=kwargs.get("java_script_enabled", True),
172
+ cookies=kwargs.get("cookies", []),
173
+ headers=kwargs.get("headers", {}),
174
+ user_agent=kwargs.get(
175
+ "user_agent",
176
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
177
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
178
+ ),
179
+ user_agent_mode=kwargs.get("user_agent_mode"),
180
+ user_agent_generator_config=kwargs.get("user_agent_generator_config"),
181
+ text_mode=kwargs.get("text_mode", False),
182
+ light_mode=kwargs.get("light_mode", False),
183
+ extra_args=kwargs.get("extra_args", []),
184
+ )
185
+
186
+
187
+ class CrawlerRunConfig:
188
+ """
189
+ Configuration class for controlling how the crawler runs each crawl operation.
190
+ This includes parameters for content extraction, page manipulation, waiting conditions,
191
+ caching, and other runtime behaviors.
192
+
193
+ This centralizes parameters that were previously scattered as kwargs to `arun()` and related methods.
194
+ By using this class, you have a single place to understand and adjust the crawling options.
195
+
196
+ Attributes:
197
+ # Content Processing Parameters
198
+ word_count_threshold (int): Minimum word count threshold before processing content.
199
+ Default: MIN_WORD_THRESHOLD (typically 200).
200
+ extraction_strategy (ExtractionStrategy or None): Strategy to extract structured data from crawled pages.
201
+ Default: None (NoExtractionStrategy is used if None).
202
+ chunking_strategy (ChunkingStrategy): Strategy to chunk content before extraction.
203
+ Default: RegexChunking().
204
+ markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown.
205
+ Default: None.
206
+ content_filter (RelevantContentFilter or None): Optional filter to prune irrelevant content.
207
+ Default: None.
208
+ only_text (bool): If True, attempt to extract text-only content where applicable.
209
+ Default: False.
210
+ css_selector (str or None): CSS selector to extract a specific portion of the page.
211
+ Default: None.
212
+ excluded_tags (list of str or None): List of HTML tags to exclude from processing.
213
+ Default: None.
214
+ excluded_selector (str or None): CSS selector to exclude from processing.
215
+ Default: None.
216
+ keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes.
217
+ Default: False.
218
+ remove_forms (bool): If True, remove all `<form>` elements from the HTML.
219
+ Default: False.
220
+ prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
221
+ Default: False.
222
+ parser_type (str): Type of parser to use for HTML parsing.
223
+ Default: "lxml".
224
+
225
+ # Caching Parameters
226
+ cache_mode (CacheMode or None): Defines how caching is handled.
227
+ If None, defaults to CacheMode.ENABLED internally.
228
+ Default: None.
229
+ session_id (str or None): Optional session ID to persist the browser context and the created
230
+ page instance. If the ID already exists, the crawler does not
231
+ create a new page and uses the current page to preserve the state.
232
+ bypass_cache (bool): Legacy parameter, if True acts like CacheMode.BYPASS.
233
+ Default: False.
234
+ disable_cache (bool): Legacy parameter, if True acts like CacheMode.DISABLED.
235
+ Default: False.
236
+ no_cache_read (bool): Legacy parameter, if True acts like CacheMode.WRITE_ONLY.
237
+ Default: False.
238
+ no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY.
239
+ Default: False.
240
+
241
+ # Page Navigation and Timing Parameters
242
+ wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded".
243
+ Default: "domcontentloaded".
244
+ page_timeout (int): Timeout in ms for page operations like navigation.
245
+ Default: 60000 (60 seconds).
246
+ wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
247
+ Default: None.
248
+ wait_for_images (bool): If True, wait for images to load before extracting content.
249
+ Default: False.
250
+ delay_before_return_html (float): Delay in seconds before retrieving final HTML.
251
+ Default: 0.1.
252
+ mean_delay (float): Mean base delay between requests when calling arun_many.
253
+ Default: 0.1.
254
+ max_range (float): Max random additional delay range for requests in arun_many.
255
+ Default: 0.3.
256
+ semaphore_count (int): Number of concurrent operations allowed.
257
+ Default: 5.
258
+
259
+ # Page Interaction Parameters
260
+ js_code (str or list of str or None): JavaScript code/snippets to run on the page.
261
+ Default: None.
262
+ js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads.
263
+ Default: False.
264
+ ignore_body_visibility (bool): If True, ignore whether the body is visible before proceeding.
265
+ Default: True.
266
+ scan_full_page (bool): If True, scroll through the entire page to load all content.
267
+ Default: False.
268
+ scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True.
269
+ Default: 0.2.
270
+ process_iframes (bool): If True, attempts to process and inline iframe content.
271
+ Default: False.
272
+ remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
273
+ Default: False.
274
+ simulate_user (bool): If True, simulate user interactions (mouse moves, clicks) for anti-bot measures.
275
+ Default: False.
276
+ override_navigator (bool): If True, overrides navigator properties for more human-like behavior.
277
+ Default: False.
278
+ magic (bool): If True, attempts automatic handling of overlays/popups.
279
+ Default: False.
280
+ adjust_viewport_to_content (bool): If True, adjust viewport according to the page content dimensions.
281
+ Default: False.
282
+
283
+ # Media Handling Parameters
284
+ screenshot (bool): Whether to take a screenshot after crawling.
285
+ Default: False.
286
+ screenshot_wait_for (float or None): Additional wait time before taking a screenshot.
287
+ Default: None.
288
+ screenshot_height_threshold (int): Threshold for page height to decide screenshot strategy.
289
+ Default: SCREENSHOT_HEIGHT_TRESHOLD (from config, e.g. 20000).
290
+ pdf (bool): Whether to generate a PDF of the page.
291
+ Default: False.
292
+ image_description_min_word_threshold (int): Minimum words for image description extraction.
293
+ Default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD (e.g., 50).
294
+ image_score_threshold (int): Minimum score threshold for processing an image.
295
+ Default: IMAGE_SCORE_THRESHOLD (e.g., 3).
296
+ exclude_external_images (bool): If True, exclude all external images from processing.
297
+ Default: False.
298
+
299
+ # Link and Domain Handling Parameters
300
+ exclude_social_media_domains (list of str): List of domains to exclude for social media links.
301
+ Default: SOCIAL_MEDIA_DOMAINS (from config).
302
+ exclude_external_links (bool): If True, exclude all external links from the results.
303
+ Default: False.
304
+ exclude_social_media_links (bool): If True, exclude links pointing to social media domains.
305
+ Default: False.
306
+ exclude_domains (list of str): List of specific domains to exclude from results.
307
+ Default: [].
308
+
309
+ # Debugging and Logging Parameters
310
+ verbose (bool): Enable verbose logging.
311
+ Default: True.
312
+ log_console (bool): If True, log console messages from the page.
313
+ Default: False.
314
+ """
315
+
316
+ def __init__(
317
+ self,
318
+ # Content Processing Parameters
319
+ word_count_threshold: int = MIN_WORD_THRESHOLD,
320
+ extraction_strategy: ExtractionStrategy = None,
321
+ chunking_strategy: ChunkingStrategy = None,
322
+ markdown_generator: MarkdownGenerationStrategy = None,
323
+ content_filter=None,
324
+ only_text: bool = False,
325
+ css_selector: str = None,
326
+ excluded_tags: list = None,
327
+ excluded_selector: str = None,
328
+ keep_data_attributes: bool = False,
329
+ remove_forms: bool = False,
330
+ prettiify: bool = False,
331
+ parser_type: str = "lxml",
332
+
333
+ # SSL Parameters
334
+ fetch_ssl_certificate: bool = False,
335
+
336
+ # Caching Parameters
337
+ cache_mode=None,
338
+ session_id: str = None,
339
+ bypass_cache: bool = False,
340
+ disable_cache: bool = False,
341
+ no_cache_read: bool = False,
342
+ no_cache_write: bool = False,
343
+
344
+ # Page Navigation and Timing Parameters
345
+ wait_until: str = "domcontentloaded",
346
+ page_timeout: int = PAGE_TIMEOUT,
347
+ wait_for: str = None,
348
+ wait_for_images: bool = False,
349
+ delay_before_return_html: float = 0.1,
350
+ mean_delay: float = 0.1,
351
+ max_range: float = 0.3,
352
+ semaphore_count: int = 5,
353
+
354
+ # Page Interaction Parameters
355
+ js_code: Union[str, List[str]] = None,
356
+ js_only: bool = False,
357
+ ignore_body_visibility: bool = True,
358
+ scan_full_page: bool = False,
359
+ scroll_delay: float = 0.2,
360
+ process_iframes: bool = False,
361
+ remove_overlay_elements: bool = False,
362
+ simulate_user: bool = False,
363
+ override_navigator: bool = False,
364
+ magic: bool = False,
365
+ adjust_viewport_to_content: bool = False,
366
+
367
+ # Media Handling Parameters
368
+ screenshot: bool = False,
369
+ screenshot_wait_for: float = None,
370
+ screenshot_height_threshold: int = SCREENSHOT_HEIGHT_TRESHOLD,
371
+ pdf: bool = False,
372
+ image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
373
+ image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
374
+ exclude_external_images: bool = False,
375
+
376
+ # Link and Domain Handling Parameters
377
+ exclude_social_media_domains: list = None,
378
+ exclude_external_links: bool = False,
379
+ exclude_social_media_links: bool = False,
380
+ exclude_domains: list = None,
381
+
382
+ # Debugging and Logging Parameters
383
+ verbose: bool = True,
384
+ log_console: bool = False,
385
+
386
+ url: str = None,
387
+ ):
388
+ self.url = url
389
+
390
+ # Content Processing Parameters
391
+ self.word_count_threshold = word_count_threshold
392
+ self.extraction_strategy = extraction_strategy
393
+ self.chunking_strategy = chunking_strategy
394
+ self.markdown_generator = markdown_generator
395
+ self.content_filter = content_filter
396
+ self.only_text = only_text
397
+ self.css_selector = css_selector
398
+ self.excluded_tags = excluded_tags or []
399
+ self.excluded_selector = excluded_selector or ""
400
+ self.keep_data_attributes = keep_data_attributes
401
+ self.remove_forms = remove_forms
402
+ self.prettiify = prettiify
403
+ self.parser_type = parser_type
404
+
405
+ # SSL Parameters
406
+ self.fetch_ssl_certificate = fetch_ssl_certificate
407
+
408
+ # Caching Parameters
409
+ self.cache_mode = cache_mode
410
+ self.session_id = session_id
411
+ self.bypass_cache = bypass_cache
412
+ self.disable_cache = disable_cache
413
+ self.no_cache_read = no_cache_read
414
+ self.no_cache_write = no_cache_write
415
+
416
+ # Page Navigation and Timing Parameters
417
+ self.wait_until = wait_until
418
+ self.page_timeout = page_timeout
419
+ self.wait_for = wait_for
420
+ self.wait_for_images = wait_for_images
421
+ self.delay_before_return_html = delay_before_return_html
422
+ self.mean_delay = mean_delay
423
+ self.max_range = max_range
424
+ self.semaphore_count = semaphore_count
425
+
426
+ # Page Interaction Parameters
427
+ self.js_code = js_code
428
+ self.js_only = js_only
429
+ self.ignore_body_visibility = ignore_body_visibility
430
+ self.scan_full_page = scan_full_page
431
+ self.scroll_delay = scroll_delay
432
+ self.process_iframes = process_iframes
433
+ self.remove_overlay_elements = remove_overlay_elements
434
+ self.simulate_user = simulate_user
435
+ self.override_navigator = override_navigator
436
+ self.magic = magic
437
+ self.adjust_viewport_to_content = adjust_viewport_to_content
438
+
439
+ # Media Handling Parameters
440
+ self.screenshot = screenshot
441
+ self.screenshot_wait_for = screenshot_wait_for
442
+ self.screenshot_height_threshold = screenshot_height_threshold
443
+ self.pdf = pdf
444
+ self.image_description_min_word_threshold = image_description_min_word_threshold
445
+ self.image_score_threshold = image_score_threshold
446
+ self.exclude_external_images = exclude_external_images
447
+
448
+ # Link and Domain Handling Parameters
449
+ self.exclude_social_media_domains = exclude_social_media_domains or SOCIAL_MEDIA_DOMAINS
450
+ self.exclude_external_links = exclude_external_links
451
+ self.exclude_social_media_links = exclude_social_media_links
452
+ self.exclude_domains = exclude_domains or []
453
+
454
+ # Debugging and Logging Parameters
455
+ self.verbose = verbose
456
+ self.log_console = log_console
457
+
458
+ # Validate type of extraction strategy and chunking strategy if they are provided
459
+ if self.extraction_strategy is not None and not isinstance(
460
+ self.extraction_strategy, ExtractionStrategy
461
+ ):
462
+ raise ValueError("extraction_strategy must be an instance of ExtractionStrategy")
463
+ if self.chunking_strategy is not None and not isinstance(
464
+ self.chunking_strategy, ChunkingStrategy
465
+ ):
466
+ raise ValueError("chunking_strategy must be an instance of ChunkingStrategy")
467
+
468
+ # Set default chunking strategy if None
469
+ if self.chunking_strategy is None:
470
+ from .chunking_strategy import RegexChunking
471
+ self.chunking_strategy = RegexChunking()
472
+
473
+ @staticmethod
474
+ def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
475
+ return CrawlerRunConfig(
476
+ # Content Processing Parameters
477
+ word_count_threshold=kwargs.get("word_count_threshold", 200),
478
+ extraction_strategy=kwargs.get("extraction_strategy"),
479
+ chunking_strategy=kwargs.get("chunking_strategy"),
480
+ markdown_generator=kwargs.get("markdown_generator"),
481
+ content_filter=kwargs.get("content_filter"),
482
+ only_text=kwargs.get("only_text", False),
483
+ css_selector=kwargs.get("css_selector"),
484
+ excluded_tags=kwargs.get("excluded_tags", []),
485
+ excluded_selector=kwargs.get("excluded_selector", ""),
486
+ keep_data_attributes=kwargs.get("keep_data_attributes", False),
487
+ remove_forms=kwargs.get("remove_forms", False),
488
+ prettiify=kwargs.get("prettiify", False),
489
+ parser_type=kwargs.get("parser_type", "lxml"),
490
+
491
+ # SSL Parameters
492
+ fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
493
+
494
+ # Caching Parameters
495
+ cache_mode=kwargs.get("cache_mode"),
496
+ session_id=kwargs.get("session_id"),
497
+ bypass_cache=kwargs.get("bypass_cache", False),
498
+ disable_cache=kwargs.get("disable_cache", False),
499
+ no_cache_read=kwargs.get("no_cache_read", False),
500
+ no_cache_write=kwargs.get("no_cache_write", False),
501
+
502
+ # Page Navigation and Timing Parameters
503
+ wait_until=kwargs.get("wait_until", "domcontentloaded"),
504
+ page_timeout=kwargs.get("page_timeout", 60000),
505
+ wait_for=kwargs.get("wait_for"),
506
+ wait_for_images=kwargs.get("wait_for_images", False),
507
+ delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
508
+ mean_delay=kwargs.get("mean_delay", 0.1),
509
+ max_range=kwargs.get("max_range", 0.3),
510
+ semaphore_count=kwargs.get("semaphore_count", 5),
511
+
512
+ # Page Interaction Parameters
513
+ js_code=kwargs.get("js_code"),
514
+ js_only=kwargs.get("js_only", False),
515
+ ignore_body_visibility=kwargs.get("ignore_body_visibility", True),
516
+ scan_full_page=kwargs.get("scan_full_page", False),
517
+ scroll_delay=kwargs.get("scroll_delay", 0.2),
518
+ process_iframes=kwargs.get("process_iframes", False),
519
+ remove_overlay_elements=kwargs.get("remove_overlay_elements", False),
520
+ simulate_user=kwargs.get("simulate_user", False),
521
+ override_navigator=kwargs.get("override_navigator", False),
522
+ magic=kwargs.get("magic", False),
523
+ adjust_viewport_to_content=kwargs.get("adjust_viewport_to_content", False),
524
+
525
+ # Media Handling Parameters
526
+ screenshot=kwargs.get("screenshot", False),
527
+ screenshot_wait_for=kwargs.get("screenshot_wait_for"),
528
+ screenshot_height_threshold=kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD),
529
+ pdf=kwargs.get("pdf", False),
530
+ image_description_min_word_threshold=kwargs.get("image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD),
531
+ image_score_threshold=kwargs.get("image_score_threshold", IMAGE_SCORE_THRESHOLD),
532
+ exclude_external_images=kwargs.get("exclude_external_images", False),
533
+
534
+ # Link and Domain Handling Parameters
535
+ exclude_social_media_domains=kwargs.get("exclude_social_media_domains", SOCIAL_MEDIA_DOMAINS),
536
+ exclude_external_links=kwargs.get("exclude_external_links", False),
537
+ exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
538
+ exclude_domains=kwargs.get("exclude_domains", []),
539
+
540
+ # Debugging and Logging Parameters
541
+ verbose=kwargs.get("verbose", True),
542
+ log_console=kwargs.get("log_console", False),
543
+
544
+ url=kwargs.get("url"),
545
+ )
546
+
547
+ # Return a dict representation of this configuration object
548
+ def to_dict(self):
549
+ return {
550
+ "word_count_threshold": self.word_count_threshold,
551
+ "extraction_strategy": self.extraction_strategy,
552
+ "chunking_strategy": self.chunking_strategy,
553
+ "markdown_generator": self.markdown_generator,
554
+ "content_filter": self.content_filter,
555
+ "only_text": self.only_text,
556
+ "css_selector": self.css_selector,
557
+ "excluded_tags": self.excluded_tags,
558
+ "excluded_selector": self.excluded_selector,
559
+ "keep_data_attributes": self.keep_data_attributes,
560
+ "remove_forms": self.remove_forms,
561
+ "prettiify": self.prettiify,
562
+ "parser_type": self.parser_type,
563
+ "fetch_ssl_certificate": self.fetch_ssl_certificate,
564
+ "cache_mode": self.cache_mode,
565
+ "session_id": self.session_id,
566
+ "bypass_cache": self.bypass_cache,
567
+ "disable_cache": self.disable_cache,
568
+ "no_cache_read": self.no_cache_read,
569
+ "no_cache_write": self.no_cache_write,
570
+ "wait_until": self.wait_until,
571
+ "page_timeout": self.page_timeout,
572
+ "wait_for": self.wait_for,
573
+ "wait_for_images": self.wait_for_images,
574
+ "delay_before_return_html": self.delay_before_return_html,
575
+ "mean_delay": self.mean_delay,
576
+ "max_range": self.max_range,
577
+ "semaphore_count": self.semaphore_count,
578
+ "js_code": self.js_code,
579
+ "js_only": self.js_only,
580
+ "ignore_body_visibility": self.ignore_body_visibility,
581
+ "scan_full_page": self.scan_full_page,
582
+ "scroll_delay": self.scroll_delay,
583
+ "process_iframes": self.process_iframes,
584
+ "remove_overlay_elements": self.remove_overlay_elements,
585
+ "simulate_user": self.simulate_user,
586
+ "override_navigator": self.override_navigator,
587
+ "magic": self.magic,
588
+ "adjust_viewport_to_content": self.adjust_viewport_to_content,
589
+ "screenshot": self.screenshot,
590
+ "screenshot_wait_for": self.screenshot_wait_for,
591
+ "screenshot_height_threshold": self.screenshot_height_threshold,
592
+ "pdf": self.pdf,
593
+ "image_description_min_word_threshold": self.image_description_min_word_threshold,
594
+ "image_score_threshold": self.image_score_threshold,
595
+ "exclude_external_images": self.exclude_external_images,
596
+ "exclude_social_media_domains": self.exclude_social_media_domains,
597
+ "exclude_external_links": self.exclude_external_links,
598
+ "exclude_social_media_links": self.exclude_social_media_links,
599
+ "exclude_domains": self.exclude_domains,
600
+ "verbose": self.verbose,
601
+ "log_console": self.log_console,
602
+ "url": self.url,
603
+ }
crawl4ai/async_crawler_strategy.py ADDED
@@ -0,0 +1,2191 @@
1
+ import asyncio
2
+ import base64
3
+ import time
4
+ from abc import ABC, abstractmethod
5
+ from typing import Callable, Dict, Any, List, Optional, Awaitable, Union
6
+ import os, sys, shutil
7
+ import tempfile, subprocess
8
+ from playwright.async_api import async_playwright, Page, Browser, Error, BrowserContext
9
+ from playwright.async_api import TimeoutError as PlaywrightTimeoutError
10
+ from io import BytesIO
11
+ from PIL import Image, ImageDraw, ImageFont
12
+ from pathlib import Path
13
+ from playwright.async_api import ProxySettings
14
+ from pydantic import BaseModel
15
+ import hashlib
16
+ import json
17
+ import uuid
18
+ from .js_snippet import load_js_script
19
+ from .models import AsyncCrawlResponse
20
+ from .utils import get_error_context
21
+ from .user_agent_generator import UserAgentGenerator
22
+ from .config import SCREENSHOT_HEIGHT_TRESHOLD, DOWNLOAD_PAGE_TIMEOUT
23
+ from .async_configs import BrowserConfig, CrawlerRunConfig
24
+ from .async_logger import AsyncLogger
25
+ from playwright_stealth import StealthConfig, stealth_async
26
+ from .ssl_certificate import SSLCertificate
27
+
28
+ stealth_config = StealthConfig(
29
+ webdriver=True,
30
+ chrome_app=True,
31
+ chrome_csi=True,
32
+ chrome_load_times=True,
33
+ chrome_runtime=True,
34
+ navigator_languages=True,
35
+ navigator_plugins=True,
36
+ navigator_permissions=True,
37
+ webgl_vendor=True,
38
+ outerdimensions=True,
39
+ navigator_hardware_concurrency=True,
40
+ media_codecs=True,
41
+ )
42
+
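`stealth_config` only declares which evasions to enable; it takes effect once applied to a live page. A hedged sketch of how `playwright-stealth` is typically wired in (standalone, not the exact call site used later in this module):

```python
from playwright.async_api import async_playwright
from playwright_stealth import StealthConfig, stealth_async

async def open_stealth_page(url: str) -> str:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        # Patch navigator/webgl/etc. before any navigation happens.
        await stealth_async(page, StealthConfig(webdriver=True, navigator_languages=True))
        await page.goto(url)
        return await page.title()
```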
43
+ BROWSER_DISABLE_OPTIONS = [
44
+ "--disable-background-networking",
45
+ "--disable-background-timer-throttling",
46
+ "--disable-backgrounding-occluded-windows",
47
+ "--disable-breakpad",
48
+ "--disable-client-side-phishing-detection",
49
+ "--disable-component-extensions-with-background-pages",
50
+ "--disable-default-apps",
51
+ "--disable-extensions",
52
+ "--disable-features=TranslateUI",
53
+ "--disable-hang-monitor",
54
+ "--disable-ipc-flooding-protection",
55
+ "--disable-popup-blocking",
56
+ "--disable-prompt-on-repost",
57
+ "--disable-sync",
58
+ "--force-color-profile=srgb",
59
+ "--metrics-recording-only",
60
+ "--no-first-run",
61
+ "--password-store=basic",
62
+ "--use-mock-keychain",
63
+ ]
64
+
65
+
66
+ class ManagedBrowser:
67
+ """
68
+ Manages the browser process and context. This class allows connecting to the browser over the CDP protocol.
69
+
70
+ Attributes:
71
+ browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
72
+ Default: "chromium".
73
+ user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
74
+ temporary directory may be used. Default: None.
75
+ headless (bool): Whether to run the browser in headless mode (no visible GUI).
76
+ Default: False.
77
+ browser_process (subprocess.Popen): The process object for the browser.
78
+ temp_dir (str): Temporary directory for user data if not provided.
79
+ debugging_port (int): Port for debugging the browser.
80
+ host (str): Host for debugging the browser.
81
+
82
+ Methods:
83
+ start(): Starts the browser process and returns the CDP endpoint URL.
84
+ _get_browser_path(): Returns the browser executable path based on OS and browser type.
85
+ _get_browser_args(): Returns browser-specific command line arguments.
86
+ _get_user_data_dir(): Returns the user data directory path.
87
+ _cleanup(): Terminates the browser process and removes the temporary directory.
88
+ """
89
+
90
+ browser_type: str
91
+ user_data_dir: str
92
+ headless: bool
93
+ browser_process: subprocess.Popen
94
+ temp_dir: str
95
+ debugging_port: int
96
+ host: str
97
+ def __init__(
98
+ self,
99
+ browser_type: str = "chromium",
100
+ user_data_dir: Optional[str] = None,
101
+ headless: bool = False,
102
+ logger=None,
103
+ host: str = "localhost",
104
+ debugging_port: int = 9222,
105
+ ):
106
+ """
107
+ Initialize the ManagedBrowser instance.
108
+
109
+ Args:
110
+ browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
111
+ Default: "chromium".
112
+ user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
113
+ temporary directory may be used. Default: None.
114
+ headless (bool): Whether to run the browser in headless mode (no visible GUI).
115
+ Default: False.
116
+ logger (logging.Logger): Logger instance for logging messages. Default: None.
117
+ host (str): Host for debugging the browser. Default: "localhost".
118
+ debugging_port (int): Port for debugging the browser. Default: 9222.
119
+ """
120
+ self.browser_type = browser_type
121
+ self.user_data_dir = user_data_dir
122
+ self.headless = headless
123
+ self.browser_process = None
124
+ self.temp_dir = None
125
+ self.debugging_port = debugging_port
126
+ self.host = host
127
+ self.logger = logger
128
+ self.shutting_down = False
129
+
130
+ async def start(self) -> str:
131
+ """
132
+ Starts the browser process and returns the CDP endpoint URL.
133
+ If user_data_dir is not provided, creates a temporary directory.
134
+ """
135
+
136
+ # Create temp dir if needed
137
+ if not self.user_data_dir:
138
+ self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-")
139
+ self.user_data_dir = self.temp_dir
140
+
141
+ # Get browser path and args based on OS and browser type
142
+ browser_path = self._get_browser_path()
143
+ args = self._get_browser_args()
144
+
145
+ # Start browser process
146
+ try:
147
+ self.browser_process = subprocess.Popen(
148
+ args, stdout=subprocess.PIPE, stderr=subprocess.PIPE
149
+ )
150
+ # Monitor browser process output for errors
151
+ asyncio.create_task(self._monitor_browser_process())
152
+ await asyncio.sleep(2) # Give browser time to start
153
+ return f"http://{self.host}:{self.debugging_port}"
154
+ except Exception as e:
155
+ await self.cleanup()
156
+ raise Exception(f"Failed to start browser: {e}")
157
+
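`start()` only launches the process and returns a CDP endpoint; attaching Playwright to it is the caller's job. A minimal hand-off sketch, mirroring what `BrowserManager.start()` does further down (assumes a locally installed Chrome):

```python
from playwright.async_api import async_playwright

async def attach_to_managed_browser(managed_browser: "ManagedBrowser"):
    cdp_url = await managed_browser.start()           # e.g. "http://localhost:9222"
    playwright = await async_playwright().start()
    browser = await playwright.chromium.connect_over_cdp(cdp_url)
    # Reuse the default context if the browser already created one.
    context = browser.contexts[0] if browser.contexts else await browser.new_context()
    return playwright, browser, context
```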
158
+ async def _monitor_browser_process(self):
159
+ """
160
+ Monitor the browser process for unexpected termination.
161
+
162
+ How it works:
163
+ 1. Read stdout and stderr from the browser process.
164
+ 2. If the process has terminated, log the error message and terminate the browser.
165
+ 3. If the shutting_down flag is set, log the normal termination message.
166
+ 4. If any other error occurs, log the error message.
167
+
168
+ Note: This method should be called in a separate task to avoid blocking the main event loop.
169
+ """
170
+ if self.browser_process:
171
+ try:
172
+ stdout, stderr = await asyncio.gather(
173
+ asyncio.to_thread(self.browser_process.stdout.read),
174
+ asyncio.to_thread(self.browser_process.stderr.read),
175
+ )
176
+
177
+ # Check shutting_down flag BEFORE logging anything
178
+ if self.browser_process.poll() is not None:
179
+ if not self.shutting_down:
180
+ self.logger.error(
181
+ message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
182
+ tag="ERROR",
183
+ params={
184
+ "code": self.browser_process.returncode,
185
+ "stdout": stdout.decode(),
186
+ "stderr": stderr.decode(),
187
+ },
188
+ )
189
+ await self.cleanup()
190
+ else:
191
+ self.logger.info(
192
+ message="Browser process terminated normally | Code: {code}",
193
+ tag="INFO",
194
+ params={"code": self.browser_process.returncode},
195
+ )
196
+ except Exception as e:
197
+ if not self.shutting_down:
198
+ self.logger.error(
199
+ message="Error monitoring browser process: {error}",
200
+ tag="ERROR",
201
+ params={"error": str(e)},
202
+ )
203
+
204
+ def _get_browser_path(self) -> str:
205
+ """Returns the browser executable path based on OS and browser type"""
206
+ if sys.platform == "darwin": # macOS
207
+ paths = {
208
+ "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
209
+ "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox",
210
+ "webkit": "/Applications/Safari.app/Contents/MacOS/Safari",
211
+ }
212
+ elif sys.platform == "win32": # Windows
213
+ paths = {
214
+ "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
215
+ "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe",
216
+ "webkit": None, # WebKit not supported on Windows
217
+ }
218
+ else: # Linux
219
+ paths = {
220
+ "chromium": "google-chrome",
221
+ "firefox": "firefox",
222
+ "webkit": None, # WebKit not supported on Linux
223
+ }
224
+
225
+ return paths.get(self.browser_type)
226
+
227
+ def _get_browser_args(self) -> List[str]:
228
+ """Returns browser-specific command line arguments"""
229
+ base_args = [self._get_browser_path()]
230
+
231
+ if self.browser_type == "chromium":
232
+ args = [
233
+ f"--remote-debugging-port={self.debugging_port}",
234
+ f"--user-data-dir={self.user_data_dir}",
235
+ ]
236
+ if self.headless:
237
+ args.append("--headless=new")
238
+ elif self.browser_type == "firefox":
239
+ args = [
240
+ "--remote-debugging-port",
241
+ str(self.debugging_port),
242
+ "--profile",
243
+ self.user_data_dir,
244
+ ]
245
+ if self.headless:
246
+ args.append("--headless")
247
+ else:
248
+ raise NotImplementedError(f"Browser type {self.browser_type} not supported")
249
+
250
+ return base_args + args
251
+
252
+ async def cleanup(self):
253
+ """Cleanup browser process and temporary directory"""
254
+ # Set shutting_down flag BEFORE any termination actions
255
+ self.shutting_down = True
256
+
257
+ if self.browser_process:
258
+ try:
259
+ self.browser_process.terminate()
260
+ # Wait for process to end gracefully
261
+ for _ in range(10): # 10 attempts, 100ms each
262
+ if self.browser_process.poll() is not None:
263
+ break
264
+ await asyncio.sleep(0.1)
265
+
266
+ # Force kill if still running
267
+ if self.browser_process.poll() is None:
268
+ self.browser_process.kill()
269
+ await asyncio.sleep(0.1) # Brief wait for kill to take effect
270
+
271
+ except Exception as e:
272
+ self.logger.error(
273
+ message="Error terminating browser: {error}",
274
+ tag="ERROR",
275
+ params={"error": str(e)},
276
+ )
277
+
278
+ if self.temp_dir and os.path.exists(self.temp_dir):
279
+ try:
280
+ shutil.rmtree(self.temp_dir)
281
+ except Exception as e:
282
+ self.logger.error(
283
+ message="Error removing temporary directory: {error}",
284
+ tag="ERROR",
285
+ params={"error": str(e)},
286
+ )
287
+
288
+
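Taken together, the managed-browser lifecycle is start, use, then cleanup. A hedged usage sketch (in real use you would also pass a logger, since the error paths above call `self.logger`):

```python
import asyncio

async def run_with_managed_browser():
    mb = ManagedBrowser(browser_type="chromium", headless=True, debugging_port=9222)
    try:
        cdp_url = await mb.start()
        print("Browser reachable at", cdp_url)
        # ... connect over CDP and drive the browser here ...
    finally:
        await mb.cleanup()   # terminates the process and removes the temp profile

# asyncio.run(run_with_managed_browser())
```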
289
+ class BrowserManager:
290
+ """
291
+ Manages the browser instance and context.
292
+
293
+ Attributes:
294
+ config (BrowserConfig): Configuration object containing all browser settings
295
+ logger: Logger instance for recording events and errors
296
+ browser (Browser): The browser instance
297
+ default_context (BrowserContext): The default browser context
298
+ managed_browser (ManagedBrowser): The managed browser instance
299
+ playwright (Playwright): The Playwright instance
300
+ sessions (dict): Dictionary to store session information
301
+ session_ttl (int): Session timeout in seconds
302
+ """
303
+ def __init__(self, browser_config: BrowserConfig, logger=None):
304
+ """
305
+ Initialize the BrowserManager with a browser configuration.
306
+
307
+ Args:
308
+ browser_config (BrowserConfig): Configuration object containing all browser settings
309
+ logger: Logger instance for recording events and errors
310
+ """
311
+ self.config: BrowserConfig = browser_config
312
+ self.logger = logger
313
+
314
+ # Browser state
315
+ self.browser = None
316
+ self.default_context = None
317
+ self.managed_browser = None
318
+ self.playwright = None
319
+
320
+ # Session management
321
+ self.sessions = {}
322
+ self.session_ttl = 1800 # 30 minutes
323
+
324
+ # Initialize ManagedBrowser if needed
325
+ if self.config.use_managed_browser:
326
+ self.managed_browser = ManagedBrowser(
327
+ browser_type=self.config.browser_type,
328
+ user_data_dir=self.config.user_data_dir,
329
+ headless=self.config.headless,
330
+ logger=self.logger,
331
+ debugging_port=self.config.debugging_port,
332
+ )
333
+
334
+ async def start(self):
335
+ """
336
+ Start the browser instance and set up the default context.
337
+
338
+ How it works:
339
+ 1. Check if Playwright is already initialized.
340
+ 2. If not, initialize Playwright.
341
+ 3. If managed browser is used, start it and connect to the CDP endpoint.
342
+ 4. If managed browser is not used, launch the browser and set up the default context.
343
+
344
+ Note: This method should be called in a separate task to avoid blocking the main event loop.
345
+ """
346
+ if self.playwright is None:
347
+ from playwright.async_api import async_playwright
348
+
349
+ self.playwright = await async_playwright().start()
350
+
351
+ if self.config.use_managed_browser:
352
+ cdp_url = await self.managed_browser.start()
353
+ self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
354
+ contexts = self.browser.contexts
355
+ if contexts:
356
+ self.default_context = contexts[0]
357
+ else:
358
+ self.default_context = await self.create_browser_context()
359
+ # self.default_context = await self.browser.new_context(
360
+ # viewport={
361
+ # "width": self.config.viewport_width,
362
+ # "height": self.config.viewport_height,
363
+ # },
364
+ # storage_state=self.config.storage_state,
365
+ # user_agent=self.config.headers.get(
366
+ # "User-Agent", self.config.user_agent
367
+ # ),
368
+ # accept_downloads=self.config.accept_downloads,
369
+ # ignore_https_errors=self.config.ignore_https_errors,
370
+ # java_script_enabled=self.config.java_script_enabled,
371
+ # )
372
+ await self.setup_context(self.default_context)
373
+ else:
374
+ browser_args = self._build_browser_args()
375
+
376
+ # Launch appropriate browser type
377
+ if self.config.browser_type == "firefox":
378
+ self.browser = await self.playwright.firefox.launch(**browser_args)
379
+ elif self.config.browser_type == "webkit":
380
+ self.browser = await self.playwright.webkit.launch(**browser_args)
381
+ else:
382
+ self.browser = await self.playwright.chromium.launch(**browser_args)
383
+
384
+ self.default_context = self.browser
385
+
386
+ def _build_browser_args(self) -> dict:
387
+ """Build browser launch arguments from config."""
388
+ args = [
389
+ "--disable-gpu",
390
+ "--disable-gpu-compositing",
391
+ "--disable-software-rasterizer",
392
+ "--no-sandbox",
393
+ "--disable-dev-shm-usage",
394
+ "--no-first-run",
395
+ "--no-default-browser-check",
396
+ "--disable-infobars",
397
+ "--window-position=0,0",
398
+ "--ignore-certificate-errors",
399
+ "--ignore-certificate-errors-spki-list",
400
+ "--disable-blink-features=AutomationControlled",
401
+ "--window-position=400,0",
402
+ "--disable-renderer-backgrounding",
403
+ "--disable-ipc-flooding-protection",
404
+ "--force-color-profile=srgb",
405
+ "--mute-audio",
406
+ "--disable-background-timer-throttling",
407
+ # "--single-process",
408
+ f"--window-size={self.config.viewport_width},{self.config.viewport_height}",
409
+ ]
410
+
411
+ if self.config.light_mode:
412
+ args.extend(BROWSER_DISABLE_OPTIONS)
413
+
414
+ if self.config.text_mode:
415
+ args.extend(
416
+ [
417
+ "--blink-settings=imagesEnabled=false",
418
+ "--disable-remote-fonts",
419
+ "--disable-images",
420
+ "--disable-javascript",
421
+ "--disable-software-rasterizer",
422
+ "--disable-dev-shm-usage",
423
+ ]
424
+ )
425
+
426
+ if self.config.extra_args:
427
+ args.extend(self.config.extra_args)
428
+
429
+ browser_args = {"headless": self.config.headless, "args": args}
430
+
431
+ if self.config.chrome_channel:
432
+ browser_args["channel"] = self.config.chrome_channel
433
+
434
+ if self.config.accept_downloads:
435
+ browser_args["downloads_path"] = self.config.downloads_path or os.path.join(
436
+ os.getcwd(), "downloads"
437
+ )
438
+ os.makedirs(browser_args["downloads_path"], exist_ok=True)
439
+
440
+ if self.config.proxy or self.config.proxy_config:
441
+ from playwright.async_api import ProxySettings
442
+
443
+ proxy_settings = (
444
+ ProxySettings(server=self.config.proxy)
445
+ if self.config.proxy
446
+ else ProxySettings(
447
+ server=self.config.proxy_config.get("server"),
448
+ username=self.config.proxy_config.get("username"),
449
+ password=self.config.proxy_config.get("password"),
450
+ )
451
+ )
452
+ browser_args["proxy"] = proxy_settings
453
+
454
+ return browser_args
455
+
456
+ async def setup_context(
457
+ self,
458
+ context: BrowserContext,
459
+ crawlerRunConfig: CrawlerRunConfig = None,
460
+ is_default=False,
461
+ ):
462
+ """
463
+ Set up a browser context with the configured options.
464
+
465
+ How it works:
466
+ 1. Set extra HTTP headers if provided.
467
+ 2. Add cookies if provided.
468
+ 3. Load storage state if provided.
469
+ 4. Accept downloads if enabled.
470
+ 5. Set default timeouts for navigation and download.
471
+ 6. Set user agent if provided.
472
+ 7. Set browser hints if provided.
473
+ 8. Set proxy if provided.
474
+ 9. Set downloads path if provided.
475
+ 10. Set storage state if provided.
476
+ 11. Set cache if provided.
482
+
483
+ Args:
484
+ context (BrowserContext): The browser context to set up
485
+ crawlerRunConfig (CrawlerRunConfig): Crawl-specific configuration; may be None when setting up the default context
486
+ is_default (bool): Flag indicating if this is the default context
487
+ Returns:
488
+ None
489
+ """
490
+ if self.config.headers:
491
+ await context.set_extra_http_headers(self.config.headers)
492
+
493
+ if self.config.cookies:
494
+ await context.add_cookies(self.config.cookies)
495
+
496
+ if self.config.storage_state:
497
+ await context.storage_state(path=None)
498
+
499
+ if self.config.accept_downloads:
500
+ context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT)
501
+ context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT)
502
+ if self.config.downloads_path:
503
+ context._impl_obj._options["accept_downloads"] = True
504
+ context._impl_obj._options["downloads_path"] = (
505
+ self.config.downloads_path
506
+ )
507
+
508
+ # Handle user agent and browser hints
509
+ if self.config.user_agent:
510
+ combined_headers = {
511
+ "User-Agent": self.config.user_agent,
512
+ "sec-ch-ua": self.config.browser_hint,
513
+ }
514
+ combined_headers.update(self.config.headers)
515
+ await context.set_extra_http_headers(combined_headers)
516
+
517
+ # Add default cookie
518
+ if crawlerRunConfig and crawlerRunConfig.url:
519
+ await context.add_cookies(
+ [{"name": "cookiesEnabled", "value": "true", "url": crawlerRunConfig.url}]
520
+ )
521
+
522
+ # Handle navigator overrides
523
+ if crawlerRunConfig and (
524
+ crawlerRunConfig.override_navigator
525
+ or crawlerRunConfig.simulate_user
526
+ or crawlerRunConfig.magic
527
+ ):
528
+ await context.add_init_script(load_js_script("navigator_overrider"))
529
+
530
+ async def create_browser_context(self):
531
+ """
532
+ Creates and returns a new browser context with configured settings.
533
+ Applies text-only mode settings if text_mode is enabled in config.
534
+
535
+ Returns:
536
+ Context: Browser context object with the specified configurations
537
+ """
538
+ # Base settings
539
+ user_agent = self.config.headers.get("User-Agent", self.config.user_agent)
540
+ viewport_settings = {
541
+ "width": self.config.viewport_width,
542
+ "height": self.config.viewport_height,
543
+ }
544
+ proxy_settings = {"server": self.config.proxy} if self.config.proxy else None
545
+
546
+ blocked_extensions = [
547
+ # Images
548
+ 'jpg', 'jpeg', 'png', 'gif', 'webp', 'svg', 'ico', 'bmp', 'tiff', 'psd',
549
+ # Fonts
550
+ 'woff', 'woff2', 'ttf', 'otf', 'eot',
551
+ # Styles
552
+ # 'css', 'less', 'scss', 'sass',
553
+ # Media
554
+ 'mp4', 'webm', 'ogg', 'avi', 'mov', 'wmv', 'flv', 'm4v',
555
+ 'mp3', 'wav', 'aac', 'm4a', 'opus', 'flac',
556
+ # Documents
557
+ 'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
558
+ # Archives
559
+ 'zip', 'rar', '7z', 'tar', 'gz',
560
+ # Scripts and data
561
+ 'xml', 'swf', 'wasm'
562
+ ]
563
+
564
+ # Common context settings
565
+ context_settings = {
566
+ "user_agent": user_agent,
567
+ "viewport": viewport_settings,
568
+ "proxy": proxy_settings,
569
+ "accept_downloads": self.config.accept_downloads,
570
+ "storage_state": self.config.storage_state,
571
+ "ignore_https_errors": self.config.ignore_https_errors,
572
+ "device_scale_factor": 1.0,
573
+ "java_script_enabled": self.config.java_script_enabled,
574
+ }
575
+
576
+ if self.config.text_mode:
577
+ text_mode_settings = {
578
+ "has_touch": False,
579
+ "is_mobile": False,
580
+ }
581
+ # Update context settings with text mode settings
582
+ context_settings.update(text_mode_settings)
583
+
584
+ # Create and return the context with all settings
585
+ context = await self.browser.new_context(**context_settings)
586
+
587
+ # Apply text mode settings if enabled
588
+ if self.config.text_mode:
589
+ # Create and apply route patterns for each extension
590
+ for ext in blocked_extensions:
591
+ await context.route(f"**/*.{ext}", lambda route: route.abort())
592
+ return context
593
+
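When `text_mode` is on, the loop above registers one route per extension. A roughly equivalent single-handler variant (a sketch, not the approach used in this method) keeps the blocking logic in one place:

```python
import re

# Hypothetical condensed extension list for illustration.
BLOCKED_EXT_RE = re.compile(
    r"\.(jpe?g|png|gif|webp|svg|ico|woff2?|ttf|otf|mp4|webm|mp3|wav|pdf|zip|rar)(\?.*)?$",
    re.IGNORECASE,
)

async def block_heavy_resources(context):
    async def handler(route):
        # Abort anything whose URL ends in a blocked extension, let the rest through.
        if BLOCKED_EXT_RE.search(route.request.url):
            await route.abort()
        else:
            await route.continue_()
    await context.route("**/*", handler)
```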
594
+ # async def get_page(self, session_id: Optional[str], user_agent: str):
595
+ async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
596
+ """
597
+ Get a page for the given session ID, creating a new one if needed.
598
+
599
+ Args:
600
+ crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings
601
+
602
+ Returns:
603
+ Page: The page object for the given session ID.
604
+ BrowserContext: The browser context for the given session ID.
605
+ """
606
+ self._cleanup_expired_sessions()
607
+
608
+ if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions:
609
+ context, page, _ = self.sessions[crawlerRunConfig.session_id]
610
+ self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
611
+ return page, context
612
+
613
+ if self.config.use_managed_browser:
614
+ context = self.default_context
615
+ page = await context.new_page()
616
+ else:
617
+ context = await self.create_browser_context()
618
+ await self.setup_context(context, crawlerRunConfig)
619
+ page = await context.new_page()
620
+
621
+ if crawlerRunConfig.session_id:
622
+ self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
623
+
624
+ return page, context
625
+
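Because pages are keyed by `session_id`, passing the same id on consecutive calls hands back the same tab, which is what makes multi-step flows (log in, then crawl) possible. A hedged sketch, assuming `CrawlerRunConfig` accepts `session_id` as a keyword argument:

```python
async def reuse_session(browser_manager: "BrowserManager"):
    cfg = CrawlerRunConfig(session_id="login-flow")
    page1, _ = await browser_manager.get_page(crawlerRunConfig=cfg)
    page2, _ = await browser_manager.get_page(crawlerRunConfig=cfg)
    assert page1 is page2                      # same page object is reused
    await browser_manager.kill_session("login-flow")
```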
626
+ async def kill_session(self, session_id: str):
627
+ """
628
+ Kill a browser session and clean up resources.
629
+
630
+ Args:
631
+ session_id (str): The session ID to kill.
632
+ """
633
+ if session_id in self.sessions:
634
+ context, page, _ = self.sessions[session_id]
635
+ await page.close()
636
+ if not self.config.use_managed_browser:
637
+ await context.close()
638
+ del self.sessions[session_id]
639
+
640
+ def _cleanup_expired_sessions(self):
641
+ """Clean up expired sessions based on TTL."""
642
+ current_time = time.time()
643
+ expired_sessions = [
644
+ sid
645
+ for sid, (_, _, last_used) in self.sessions.items()
646
+ if current_time - last_used > self.session_ttl
647
+ ]
648
+ for sid in expired_sessions:
649
+ asyncio.create_task(self.kill_session(sid))
650
+
651
+ async def close(self):
652
+ """Close all browser resources and clean up."""
653
+ if self.config.sleep_on_close:
654
+ await asyncio.sleep(0.5)
655
+
656
+ session_ids = list(self.sessions.keys())
657
+ for session_id in session_ids:
658
+ await self.kill_session(session_id)
659
+
660
+ if self.browser:
661
+ await self.browser.close()
662
+ self.browser = None
663
+
664
+ if self.managed_browser:
665
+ await asyncio.sleep(0.5)
666
+ await self.managed_browser.cleanup()
667
+ self.managed_browser = None
668
+
669
+ if self.playwright:
670
+ await self.playwright.stop()
671
+ self.playwright = None
672
+
673
+
674
+ class AsyncCrawlerStrategy(ABC):
675
+ """
676
+ Abstract base class for crawler strategies.
677
+ Subclasses must implement the crawl method.
678
+ """
679
+ @abstractmethod
680
+ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
681
+ pass
682
+
683
+
684
+
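Any custom strategy only needs to implement this one coroutine. A minimal non-browser example within this module (a sketch: `aiohttp` is an assumption, not a dependency declared here, and it presumes `AsyncCrawlResponse` accepts these three fields with the rest optional):

```python
import aiohttp

class SimpleHTTPCrawlerStrategy(AsyncCrawlerStrategy):
    """Fetch pages with plain HTTP instead of a real browser."""

    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                return AsyncCrawlResponse(
                    html=await resp.text(),
                    response_headers=dict(resp.headers),
                    status_code=resp.status,
                )
```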
685
+ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
686
+ """
687
+ Crawler strategy using Playwright.
688
+
689
+ Attributes:
690
+ browser_config (BrowserConfig): Configuration object containing browser settings.
691
+ logger (AsyncLogger): Logger instance for recording events and errors.
692
+ _downloaded_files (List[str]): List of downloaded file paths.
693
+ hooks (Dict[str, Callable]): Dictionary of hooks for custom behavior.
694
+ browser_manager (BrowserManager): Manager for browser creation and management.
695
+
696
+ Methods:
697
+ __init__(self, browser_config=None, logger=None, **kwargs):
698
+ Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration.
699
+ __aenter__(self):
700
+ Start the browser and initialize the browser manager.
701
+ __aexit__(self, exc_type, exc_val, exc_tb):
702
+ Close the browser and clean up resources.
703
+ start(self):
704
+ Start the browser and initialize the browser manager.
705
+ close(self):
706
+ Close the browser and clean up resources.
707
+ kill_session(self, session_id):
708
+ Kill a browser session and clean up resources.
709
+ crawl(self, url, **kwargs):
710
+ Run the crawler for a single URL.
711
+
712
+ """
713
+ def __init__(
714
+ self, browser_config: BrowserConfig = None, logger: AsyncLogger = None, **kwargs
715
+ ):
716
+ """
717
+ Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration.
718
+
719
+ Args:
720
+ browser_config (BrowserConfig): Configuration object containing browser settings.
721
+ If None, will be created from kwargs for backwards compatibility.
722
+ logger: Logger instance for recording events and errors.
723
+ **kwargs: Additional arguments for backwards compatibility and extending functionality.
724
+ """
725
+ # Initialize browser config, either from provided object or kwargs
726
+ self.browser_config = browser_config or BrowserConfig.from_kwargs(kwargs)
727
+ self.logger = logger
728
+
729
+ # Initialize session management
730
+ self._downloaded_files = []
731
+
732
+ # Initialize hooks system
733
+ self.hooks = {
734
+ "on_browser_created": None,
735
+ "on_page_context_created": None,
736
+ "on_user_agent_updated": None,
737
+ "on_execution_started": None,
738
+ "before_goto": None,
739
+ "after_goto": None,
740
+ "before_return_html": None,
741
+ "before_retrieve_html": None,
742
+ }
743
+
744
+ # Initialize browser manager with config
745
+ self.browser_manager = BrowserManager(
746
+ browser_config=self.browser_config, logger=self.logger
747
+ )
748
+
749
+ async def __aenter__(self):
750
+ await self.start()
751
+ return self
752
+
753
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
754
+ await self.close()
755
+
756
+ async def start(self):
757
+ """
758
+ Start the browser and initialize the browser manager.
759
+ """
760
+ await self.browser_manager.start()
761
+ await self.execute_hook(
762
+ "on_browser_created",
763
+ self.browser_manager.browser,
764
+ context=self.browser_manager.default_context,
765
+ )
766
+
767
+ async def close(self):
768
+ """
769
+ Close the browser and clean up resources.
770
+ """
771
+ await self.browser_manager.close()
772
+
773
+ async def kill_session(self, session_id: str):
774
+ """
775
+ Kill a browser session and clean up resources.
776
+
777
+ Args:
778
+ session_id (str): The ID of the session to kill.
779
+
780
+ Returns:
781
+ None
782
+ """
783
+ # Sessions are auto-killed in this version, so log a warning instead of requiring manual cleanup
784
+ self.logger.warning(
785
+ message="Session auto-kill is enabled in the new version. No need to manually kill sessions.",
786
+ tag="WARNING",
787
+ )
788
+ await self.browser_manager.kill_session(session_id)
789
+
790
+ def set_hook(self, hook_type: str, hook: Callable):
791
+ """
792
+ Set a hook function for a specific hook type. Following are list of hook types:
793
+ - on_browser_created: Called when a new browser instance is created.
794
+ - on_page_context_created: Called when a new page context is created.
795
+ - on_user_agent_updated: Called when the user agent is updated.
796
+ - on_execution_started: Called when the execution starts.
797
+ - before_goto: Called before a goto operation.
798
+ - after_goto: Called after a goto operation.
799
+ - before_return_html: Called before returning HTML content.
800
+ - before_retrieve_html: Called before retrieving HTML content.
801
+
802
+ All hooks except on_browser_created accept a page and a context as arguments, plus **kwargs. on_browser_created instead receives the browser and a context, plus **kwargs.
803
+
804
+ Args:
805
+ hook_type (str): The type of the hook.
806
+ hook (Callable): The hook function to set.
807
+
808
+ Returns:
809
+ None
810
+ """
811
+ if hook_type in self.hooks:
812
+ self.hooks[hook_type] = hook
813
+ else:
814
+ raise ValueError(f"Invalid hook type: {hook_type}")
815
+
816
+ async def execute_hook(self, hook_type: str, *args, **kwargs):
817
+ """
818
+ Execute a hook function for a specific hook type.
819
+
820
+ Args:
821
+ hook_type (str): The type of the hook.
822
+ *args: Variable length positional arguments.
823
+ **kwargs: Keyword arguments.
824
+
825
+ Returns:
826
+ The return value of the hook function, if any.
827
+ """
828
+ hook = self.hooks.get(hook_type)
829
+ if hook:
830
+ if asyncio.iscoroutinefunction(hook):
831
+ return await hook(*args, **kwargs)
832
+ else:
833
+ return hook(*args, **kwargs)
834
+ return args[0] if args else None
835
+
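Hooks are plain callables (sync or async); `execute_hook` awaits coroutines and calls everything else directly. A hedged example of registering a `before_goto` hook that adds a header right before navigation (the header name is made up):

```python
async def add_debug_header(page, context=None, url=None, **kwargs):
    # Runs just before page.goto() in _crawl_web() below.
    await page.set_extra_http_headers({"X-Debug-Crawl": "1"})
    return page

strategy = AsyncPlaywrightCrawlerStrategy()        # falls back to a default BrowserConfig
strategy.set_hook("before_goto", add_debug_header)
```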
836
+ def update_user_agent(self, user_agent: str):
837
+ """
838
+ Update the user agent for the browser.
839
+
840
+ Args:
841
+ user_agent (str): The new user agent string.
842
+
843
+ Returns:
844
+ None
845
+ """
846
+ self.user_agent = user_agent
847
+
848
+ def set_custom_headers(self, headers: Dict[str, str]):
849
+ """
850
+ Set custom headers for the browser.
851
+
852
+ Args:
853
+ headers (Dict[str, str]): A dictionary of headers to set.
854
+
855
+ Returns:
856
+ None
857
+ """
858
+ self.headers = headers
859
+
860
+ async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
861
+ """
862
+ Wait for a condition in a smart way. This function works as follows:
863
+
864
+ 1. If wait_for starts with 'js:', it assumes it's a JavaScript function and waits for it to return true.
865
+ 2. If wait_for starts with 'css:', it assumes it's a CSS selector and waits for it to be present.
866
+ 3. Otherwise, it tries to evaluate wait_for as a JavaScript function and waits for it to return true.
867
+ 4. If it's not a JavaScript function, it assumes it's a CSS selector and waits for it to be present.
868
+
869
+ This is a more advanced version of the wait_for parameter in CrawlerStrategy.crawl().
870
+ Args:
871
+ page: Playwright page object
872
+ wait_for (str): The condition to wait for. Can be a CSS selector, a JavaScript function, or explicitly prefixed with 'js:' or 'css:'.
873
+ timeout (float): Maximum time to wait in milliseconds
874
+
875
+ Returns:
876
+ None
877
+ """
878
+ wait_for = wait_for.strip()
879
+
880
+ if wait_for.startswith("js:"):
881
+ # Explicitly specified JavaScript
882
+ js_code = wait_for[3:].strip()
883
+ return await self.csp_compliant_wait(page, js_code, timeout)
884
+ elif wait_for.startswith("css:"):
885
+ # Explicitly specified CSS selector
886
+ css_selector = wait_for[4:].strip()
887
+ try:
888
+ await page.wait_for_selector(css_selector, timeout=timeout)
889
+ except Error as e:
890
+ if "Timeout" in str(e):
891
+ raise TimeoutError(
892
+ f"Timeout after {timeout}ms waiting for selector '{css_selector}'"
893
+ )
894
+ else:
895
+ raise ValueError(f"Invalid CSS selector: '{css_selector}'")
896
+ else:
897
+ # Auto-detect based on content
898
+ if wait_for.startswith("()") or wait_for.startswith("function"):
899
+ # It's likely a JavaScript function
900
+ return await self.csp_compliant_wait(page, wait_for, timeout)
901
+ else:
902
+ # Assume it's a CSS selector first
903
+ try:
904
+ await page.wait_for_selector(wait_for, timeout=timeout)
905
+ except Error as e:
906
+ if "Timeout" in str(e):
907
+ raise TimeoutError(
908
+ f"Timeout after {timeout}ms waiting for selector '{wait_for}'"
909
+ )
910
+ else:
911
+ # If it's not a timeout error, it might be an invalid selector
912
+ # Let's try to evaluate it as a JavaScript function as a fallback
913
+ try:
914
+ return await self.csp_compliant_wait(
915
+ page, f"() => {{{wait_for}}}", timeout
916
+ )
917
+ except Error:
918
+ raise ValueError(
919
+ f"Invalid wait_for parameter: '{wait_for}'. "
920
+ "It should be either a valid CSS selector, a JavaScript function, "
921
+ "or explicitly prefixed with 'js:' or 'css:'."
922
+ )
923
+
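In practice `wait_for` is supplied through `CrawlerRunConfig`, and both prefixes map directly onto the branches above. Illustrative values only (the selector and predicate are hypothetical):

```python
# Wait until a results container exists:
css_wait = CrawlerRunConfig(wait_for="css:div#search-results")

# Wait until client-side rendering has produced at least ten cards:
js_wait = CrawlerRunConfig(
    wait_for="js:() => document.querySelectorAll('.card').length >= 10"
)
```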
924
+ async def csp_compliant_wait( self, page: Page, user_wait_function: str, timeout: float = 30000 ):
925
+ """
926
+ Wait for a condition in a CSP-compliant way.
927
+
928
+ Args:
929
+ page: Playwright page object
930
+ user_wait_function: JavaScript function as string that returns boolean
931
+ timeout: Maximum time to wait in milliseconds
932
+
933
+ Returns:
934
+ bool: True if condition was met, False if timed out
935
+
936
+ Raises:
937
+ RuntimeError: If there's an error evaluating the condition
938
+ """
939
+ wrapper_js = f"""
940
+ async () => {{
941
+ const userFunction = {user_wait_function};
942
+ const startTime = Date.now();
943
+ try {{
944
+ while (true) {{
945
+ if (await userFunction()) {{
946
+ return true;
947
+ }}
948
+ if (Date.now() - startTime > {timeout}) {{
949
+ return false; // Return false instead of throwing
950
+ }}
951
+ await new Promise(resolve => setTimeout(resolve, 100));
952
+ }}
953
+ }} catch (error) {{
954
+ throw new Error(`Error evaluating condition: ${{error.message}}`);
955
+ }}
956
+ }}
957
+ """
958
+
959
+ try:
960
+ result = await page.evaluate(wrapper_js)
961
+ return result
962
+ except Exception as e:
963
+ if "Error evaluating condition" in str(e):
964
+ raise RuntimeError(f"Failed to evaluate wait condition: {str(e)}")
965
+ # For timeout or other cases, just return False
966
+ return False
967
+
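The wrapper polls the supplied predicate every 100 ms until it returns true or the timeout elapses. A hedged example of a predicate string a caller might pass in (the selector is hypothetical):

```python
ready_check = """() => {
    // true once the SPA has injected its main content
    const root = document.querySelector('#app');
    return !!root && root.children.length > 0;
}"""

# Inside the strategy, with a live Playwright page:
# ok = await self.csp_compliant_wait(page, ready_check, timeout=15_000)
```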
968
+ async def process_iframes(self, page):
969
+ """
970
+ Process iframes on a page. This function will extract the content of each iframe and replace it with a div containing the extracted content.
971
+
972
+ Args:
973
+ page: Playwright page object
974
+
975
+ Returns:
976
+ Playwright page object
977
+ """
978
+ # Find all iframes
979
+ iframes = await page.query_selector_all("iframe")
980
+
981
+ for i, iframe in enumerate(iframes):
982
+ try:
983
+ # Add a unique identifier to the iframe
984
+ await iframe.evaluate(f'(element) => element.id = "iframe-{i}"')
985
+
986
+ # Get the frame associated with this iframe
987
+ frame = await iframe.content_frame()
988
+
989
+ if frame:
990
+ # Wait for the frame to load
991
+ await frame.wait_for_load_state(
992
+ "load", timeout=30000
993
+ ) # 30 seconds timeout
994
+
995
+ # Extract the content of the iframe's body
996
+ iframe_content = await frame.evaluate(
997
+ "() => document.body.innerHTML"
998
+ )
999
+
1000
+ # Generate a unique class name for this iframe
1001
+ class_name = f"extracted-iframe-content-{i}"
1002
+
1003
+ # Replace the iframe with a div containing the extracted content
1004
+ _iframe = iframe_content.replace("`", "\\`")
1005
+ await page.evaluate(
1006
+ f"""
1007
+ () => {{
1008
+ const iframe = document.getElementById('iframe-{i}');
1009
+ const div = document.createElement('div');
1010
+ div.innerHTML = `{_iframe}`;
1011
+ div.className = '{class_name}';
1012
+ iframe.replaceWith(div);
1013
+ }}
1014
+ """
1015
+ )
1016
+ else:
1017
+ self.logger.warning(
1018
+ message="Could not access content frame for iframe {index}",
1019
+ tag="SCRAPE",
1020
+ params={"index": i},
1021
+ )
1022
+ except Exception as e:
1023
+ self.logger.error(
1024
+ message="Error processing iframe {index}: {error}",
1025
+ tag="ERROR",
1026
+ params={"index": i, "error": str(e)},
1027
+ )
1028
+
1029
+ # Return the page object
1030
+ return page
1031
+
1032
+ async def create_session(self, **kwargs) -> str:
1033
+ """
1034
+ Creates a new browser session and returns its ID. A browser session is a unique open page that can be reused for multiple crawls.
1035
+ This function is asynchronous and returns a string representing the session ID.
1036
+
1037
+ Args:
1038
+ **kwargs: Optional keyword arguments to configure the session.
1039
+
1040
+ Returns:
1041
+ str: The session ID.
1042
+ """
1043
+ await self.start()
1044
+
1045
+ session_id = kwargs.get("session_id") or str(uuid.uuid4())
1046
+
1047
+ # Use browser_manager to get a fresh page & context assigned to this session_id.
1048
+ # get_page() now takes a CrawlerRunConfig rather than the old (session_id, user_agent) pair.
1049
+ page, context = await self.browser_manager.get_page(
+ crawlerRunConfig=CrawlerRunConfig(session_id=session_id)
+ )
1050
+ return session_id
1051
+
1052
+ async def crawl( self, url: str, config: CrawlerRunConfig, **kwargs ) -> AsyncCrawlResponse:
1053
+ """
1054
+ Crawls a given URL or processes raw HTML/local file content based on the URL prefix.
1055
+
1056
+ Args:
1057
+ url (str): The URL to crawl. Supported prefixes:
1058
+ - 'http://' or 'https://': Web URL to crawl.
1059
+ - 'file://': Local file path to process.
1060
+ - 'raw://': Raw HTML content to process.
1061
+ **kwargs: Additional parameters:
1062
+ - 'screenshot' (bool): Whether to take a screenshot.
1063
+ - ... [other existing parameters]
1064
+
1065
+ Returns:
1066
+ AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot.
1067
+ """
1068
+ config = config or CrawlerRunConfig.from_kwargs(kwargs)
1069
+ response_headers = {}
1070
+ status_code = 200 # Default for local/raw HTML
1071
+ screenshot_data = None
1072
+
1073
+ if url.startswith(("http://", "https://")):
1074
+ return await self._crawl_web(url, config)
1075
+
1076
+ elif url.startswith("file://"):
1077
+ # Process local file
1078
+ local_file_path = url[7:] # Remove 'file://' prefix
1079
+ if not os.path.exists(local_file_path):
1080
+ raise FileNotFoundError(f"Local file not found: {local_file_path}")
1081
+ with open(local_file_path, "r", encoding="utf-8") as f:
1082
+ html = f.read()
1083
+ if config.screenshot:
1084
+ screenshot_data = await self._generate_screenshot_from_html(html)
1085
+ return AsyncCrawlResponse(
1086
+ html=html,
1087
+ response_headers=response_headers,
1088
+ status_code=status_code,
1089
+ screenshot=screenshot_data,
1090
+ get_delayed_content=None,
1091
+ )
1092
+
1093
+ elif url.startswith("raw:") or url.startswith("raw://"):
1094
+ # Process raw HTML content
1095
+ raw_html = url[7:] if url.startswith("raw://") else url[4:]
1096
+ html = raw_html
1097
+ if config.screenshot:
1098
+ screenshot_data = await self._generate_screenshot_from_html(html)
1099
+ return AsyncCrawlResponse(
1100
+ html=html,
1101
+ response_headers=response_headers,
1102
+ status_code=status_code,
1103
+ screenshot=screenshot_data,
1104
+ get_delayed_content=None,
1105
+ )
1106
+ else:
1107
+ raise ValueError(
1108
+ "URL must start with 'http://', 'https://', 'file://', or 'raw:'"
1109
+ )
1110
+
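The same entry point therefore covers live pages, local files, and in-memory HTML. A hedged end-to-end sketch (the file path and markup are examples; the local file must actually exist, and real code would also pass a logger):

```python
import asyncio

async def demo_crawl():
    config = CrawlerRunConfig(screenshot=False)
    async with AsyncPlaywrightCrawlerStrategy() as strategy:
        live = await strategy.crawl("https://example.com", config=config)
        raw = await strategy.crawl("raw:<html><body><h1>Hi</h1></body></html>", config=config)
        local = await strategy.crawl("file:///tmp/snapshot.html", config=config)
        print(live.status_code, len(raw.html), len(local.html))

# asyncio.run(demo_crawl())
```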
1111
+ async def _crawl_web( self, url: str, config: CrawlerRunConfig ) -> AsyncCrawlResponse:
1112
+ """
1113
+ Internal method to crawl web URLs with the specified configuration.
1114
+
1115
+ Args:
1116
+ url (str): The web URL to crawl
1117
+ config (CrawlerRunConfig): Configuration object controlling the crawl behavior
1118
+
1119
+ Returns:
1120
+ AsyncCrawlResponse: The response containing HTML, headers, status code, and optional data
1121
+ """
1122
+ config.url = url
1123
+ response_headers = {}
1124
+ status_code = None
1125
+
1126
+ # Reset downloaded files list for new crawl
1127
+ self._downloaded_files = []
1128
+
1129
+ # Handle user agent with magic mode
1130
+ user_agent = self.browser_config.user_agent
1131
+ if config.magic and self.browser_config.user_agent_mode != "random":
1132
+ self.browser_config.user_agent = UserAgentGenerator().generate(
1133
+ **(self.browser_config.user_agent_generator_config or {})
1134
+ )
1135
+
1136
+ # Get page for session
1137
+ page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
1138
+
1139
+ # Add default cookie
1140
+ await context.add_cookies(
1141
+ [{"name": "cookiesEnabled", "value": "true", "url": url}]
1142
+ )
1143
+
1144
+ # Handle navigator overrides
1145
+ if config.override_navigator or config.simulate_user or config.magic:
1146
+ await context.add_init_script(load_js_script("navigator_overrider"))
1147
+
1148
+ # Call hook after page creation
1149
+ await self.execute_hook("on_page_context_created", page, context=context)
1150
+
1151
+ # Set up console logging if requested
1152
+ if config.log_console:
1153
+
1154
+ def log_consol(
1155
+ msg, console_log_type="debug"
1156
+ ): # Corrected the parameter syntax
1157
+ if console_log_type == "error":
1158
+ self.logger.error(
1159
+ message=f"Console error: {msg}", # Use f-string for variable interpolation
1160
+ tag="CONSOLE",
1161
+ params={"msg": msg.text},
1162
+ )
1163
+ elif console_log_type == "debug":
1164
+ self.logger.debug(
1165
+ message=f"Console: {msg}", # Use f-string for variable interpolation
1166
+ tag="CONSOLE",
1167
+ params={"msg": msg.text},
1168
+ )
1169
+
1170
+ page.on("console", log_consol)
1171
+ page.on("pageerror", lambda e: log_consol(e, "error"))
1172
+
1173
+ try:
1174
+ # Get SSL certificate information if requested and URL is HTTPS
1175
+ ssl_cert = None
1176
+ if config.fetch_ssl_certificate:
1177
+ ssl_cert = SSLCertificate.from_url(url)
1178
+
1179
+ # Set up download handling
1180
+ if self.browser_config.accept_downloads:
1181
+ page.on(
1182
+ "download",
1183
+ lambda download: asyncio.create_task(
1184
+ self._handle_download(download)
1185
+ ),
1186
+ )
1187
+
1188
+ # Handle page navigation and content loading
1189
+ if not config.js_only:
1190
+ await self.execute_hook("before_goto", page, context=context, url=url)
1191
+
1192
+ try:
1193
+ # Generate a unique nonce for this request
1194
+ nonce = hashlib.sha256(os.urandom(32)).hexdigest()
1195
+
1196
+ # Add CSP headers to the request
1197
+ await page.set_extra_http_headers({
1198
+ 'Content-Security-Policy': f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'"
1199
+ })
1200
+
1201
+ response = await page.goto(
1202
+ url, wait_until=config.wait_until, timeout=config.page_timeout
1203
+ )
1204
+ except Error as e:
1205
+ raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
1206
+
1207
+ await self.execute_hook("after_goto", page, context=context, url=url, response=response)
1208
+
1209
+ if response is None:
1210
+ status_code = 200
1211
+ response_headers = {}
1212
+ else:
1213
+ status_code = response.status
1214
+ response_headers = response.headers
1215
+
1216
+ else:
1217
+ status_code = 200
1218
+ response_headers = {}
1219
+
1220
+ # Wait for body element and visibility
1221
+ try:
1222
+ await page.wait_for_selector("body", state="attached", timeout=30000)
1223
+
1224
+ # Use the new check_visibility function with csp_compliant_wait
1225
+ is_visible = await self.csp_compliant_wait(
1226
+ page,
1227
+ """() => {
1228
+ const element = document.body;
1229
+ if (!element) return false;
1230
+ const style = window.getComputedStyle(element);
1231
+ const isVisible = style.display !== 'none' &&
1232
+ style.visibility !== 'hidden' &&
1233
+ style.opacity !== '0';
1234
+ return isVisible;
1235
+ }""",
1236
+ timeout=30000
1237
+ )
1238
+
1239
+ if not is_visible and not config.ignore_body_visibility:
1240
+ visibility_info = await self.check_visibility(page)
1241
+ raise Error(f"Body element is hidden: {visibility_info}")
1242
+
1243
+ except Error as e:
1244
+ visibility_info = await self.check_visibility(page)
1245
+
1246
+ if self.config.verbose:
1247
+ self.logger.debug(
1248
+ message="Body visibility info: {info}",
1249
+ tag="DEBUG",
1250
+ params={"info": visibility_info},
1251
+ )
1252
+
1253
+ if not config.ignore_body_visibility:
1254
+ raise Error(f"Body element is hidden: {visibility_info}")
1255
+
1256
+
1257
+ # try:
1258
+ # await page.wait_for_selector("body", state="attached", timeout=30000)
1259
+
1260
+ # await page.wait_for_function(
1261
+ # """
1262
+ # () => {
1263
+ # const body = document.body;
1264
+ # const style = window.getComputedStyle(body);
1265
+ # return style.display !== 'none' &&
1266
+ # style.visibility !== 'hidden' &&
1267
+ # style.opacity !== '0';
1268
+ # }
1269
+ # """,
1270
+ # timeout=30000,
1271
+ # )
1272
+ # except Error as e:
1273
+ # visibility_info = await page.evaluate(
1274
+ # """
1275
+ # () => {
1276
+ # const body = document.body;
1277
+ # const style = window.getComputedStyle(body);
1278
+ # return {
1279
+ # display: style.display,
1280
+ # visibility: style.visibility,
1281
+ # opacity: style.opacity,
1282
+ # hasContent: body.innerHTML.length,
1283
+ # classList: Array.from(body.classList)
1284
+ # }
1285
+ # }
1286
+ # """
1287
+ # )
1288
+
1289
+ # if self.config.verbose:
1290
+ # self.logger.debug(
1291
+ # message="Body visibility info: {info}",
1292
+ # tag="DEBUG",
1293
+ # params={"info": visibility_info},
1294
+ # )
1295
+
1296
+ # if not config.ignore_body_visibility:
1297
+ # raise Error(f"Body element is hidden: {visibility_info}")
1298
+
1299
+ # Handle content loading and viewport adjustment
1300
+ if not self.browser_config.text_mode and (
1301
+ config.wait_for_images or config.adjust_viewport_to_content
1302
+ ):
1303
+ await page.wait_for_load_state("domcontentloaded")
1304
+ await asyncio.sleep(0.1)
1305
+
1306
+ # Check for image loading with improved error handling
1307
+ images_loaded = await self.csp_compliant_wait(
1308
+ page,
1309
+ "() => Array.from(document.getElementsByTagName('img')).every(img => img.complete)",
1310
+ timeout=1000
1311
+ )
1312
+
1313
+ if not images_loaded and self.logger:
1314
+ self.logger.warning(
1315
+ message="Some images failed to load within timeout",
1316
+ tag="SCRAPE",
1317
+ )
1318
+
1319
+ # Adjust viewport if needed
1320
+ if not self.browser_config.text_mode and config.adjust_viewport_to_content:
1321
+ try:
1322
+ dimensions = await self.get_page_dimensions(page)
1323
+ page_height = dimensions['height']
1324
+ page_width = dimensions['width']
1325
+ # page_width = await page.evaluate(
1326
+ # "document.documentElement.scrollWidth"
1327
+ # )
1328
+ # page_height = await page.evaluate(
1329
+ # "document.documentElement.scrollHeight"
1330
+ # )
1331
+
1332
+ target_width = self.browser_config.viewport_width
1333
+ target_height = int(target_width * page_width / page_height * 0.95)
1334
+ await page.set_viewport_size(
1335
+ {"width": target_width, "height": target_height}
1336
+ )
1337
+
1338
+ scale = min(target_width / page_width, target_height / page_height)
1339
+ cdp = await page.context.new_cdp_session(page)
1340
+ await cdp.send(
1341
+ "Emulation.setDeviceMetricsOverride",
1342
+ {
1343
+ "width": page_width,
1344
+ "height": page_height,
1345
+ "deviceScaleFactor": 1,
1346
+ "mobile": False,
1347
+ "scale": scale,
1348
+ },
1349
+ )
1350
+ except Exception as e:
1351
+ self.logger.warning(
1352
+ message="Failed to adjust viewport to content: {error}",
1353
+ tag="VIEWPORT",
1354
+ params={"error": str(e)},
1355
+ )
1356
+
1357
+ # Handle full page scanning
1358
+ if config.scan_full_page:
1359
+ await self._handle_full_page_scan(page, config.scroll_delay)
1360
+
1361
+ # Execute JavaScript if provided
1362
+ # if config.js_code:
1363
+ # if isinstance(config.js_code, str):
1364
+ # await page.evaluate(config.js_code)
1365
+ # elif isinstance(config.js_code, list):
1366
+ # for js in config.js_code:
1367
+ # await page.evaluate(js)
1368
+
1369
+ if config.js_code:
1370
+ # execution_result = await self.execute_user_script(page, config.js_code)
1371
+ execution_result = await self.robust_execute_user_script(page, config.js_code)
1372
+ if not execution_result["success"]:
1373
+ self.logger.warning(
1374
+ message="User script execution had issues: {error}",
1375
+ tag="JS_EXEC",
1376
+ params={"error": execution_result.get("error")}
1377
+ )
1378
+
1379
+ await self.execute_hook("on_execution_started", page, context=context)
1380
+
1381
+ # Handle user simulation
1382
+ if config.simulate_user or config.magic:
1383
+ await page.mouse.move(100, 100)
1384
+ await page.mouse.down()
1385
+ await page.mouse.up()
1386
+ await page.keyboard.press("ArrowDown")
1387
+
1388
+ # Handle wait_for condition
1389
+ if config.wait_for:
1390
+ try:
1391
+ await self.smart_wait(
1392
+ page, config.wait_for, timeout=config.page_timeout
1393
+ )
1394
+ except Exception as e:
1395
+ raise RuntimeError(f"Wait condition failed: {str(e)}")
1396
+
1397
+ # Update image dimensions if needed
1398
+ if not self.browser_config.text_mode:
1399
+ update_image_dimensions_js = load_js_script("update_image_dimensions")
1400
+ try:
1401
+ try:
1402
+ await page.wait_for_load_state("domcontentloaded", timeout=5)
1403
+ except PlaywrightTimeoutError:
1404
+ pass
1405
+ await page.evaluate(update_image_dimensions_js)
1406
+ except Exception as e:
1407
+ self.logger.error(
1408
+ message="Error updating image dimensions: {error}",
1409
+ tag="ERROR",
1410
+ params={"error": str(e)},
1411
+ )
1412
+
1413
+ # Process iframes if needed
1414
+ if config.process_iframes:
1415
+ page = await self.process_iframes(page)
1416
+
1417
+ # Pre-content retrieval hooks and delay
1418
+ await self.execute_hook("before_retrieve_html", page, context=context)
1419
+ if config.delay_before_return_html:
1420
+ await asyncio.sleep(config.delay_before_return_html)
1421
+
1422
+ # Handle overlay removal
1423
+ if config.remove_overlay_elements:
1424
+ await self.remove_overlay_elements(page)
1425
+
1426
+ # Get final HTML content
1427
+ html = await page.content()
1428
+ await self.execute_hook("before_return_html", page = page, html = html, context=context)
1429
+
1430
+ # Handle PDF and screenshot generation
1431
+ start_export_time = time.perf_counter()
1432
+ pdf_data = None
1433
+ screenshot_data = None
1434
+
1435
+ if config.pdf:
1436
+ pdf_data = await self.export_pdf(page)
1437
+
1438
+ if config.screenshot:
1439
+ if config.screenshot_wait_for:
1440
+ await asyncio.sleep(config.screenshot_wait_for)
1441
+ screenshot_data = await self.take_screenshot(
1442
+ page, screenshot_height_threshold=config.screenshot_height_threshold
1443
+ )
1444
+
1445
+ if screenshot_data or pdf_data:
1446
+ self.logger.info(
1447
+ message="Exporting PDF and taking screenshot took {duration:.2f}s",
1448
+ tag="EXPORT",
1449
+ params={"duration": time.perf_counter() - start_export_time},
1450
+ )
1451
+
1452
+ # Define delayed content getter
1453
+ async def get_delayed_content(delay: float = 5.0) -> str:
1454
+ self.logger.info(
1455
+ message="Waiting for {delay} seconds before retrieving content for {url}",
1456
+ tag="INFO",
1457
+ params={"delay": delay, "url": url},
1458
+ )
1459
+ await asyncio.sleep(delay)
1460
+ return await page.content()
1461
+
1462
+ # Return complete response
1463
+ return AsyncCrawlResponse(
1464
+ html=html,
1465
+ response_headers=response_headers,
1466
+ status_code=status_code,
1467
+ screenshot=screenshot_data,
1468
+ pdf_data=pdf_data,
1469
+ get_delayed_content=get_delayed_content,
1470
+ ssl_certificate=ssl_cert,
1471
+ downloaded_files=(
1472
+ self._downloaded_files if self._downloaded_files else None
1473
+ ),
1474
+ )
1475
+
1476
+ except Exception as e:
1477
+ raise e
1478
+
1479
+ finally:
1480
+ # If no session_id is given we should close the page
1481
+ if not config.session_id:
1482
+ await page.close()
1483
+
1484
+ async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):
1485
+ """
1486
+ Helper method to handle full page scanning.
1487
+
1488
+ How it works:
1489
+ 1. Get the viewport height.
1490
+ 2. Scroll to the bottom of the page.
1491
+ 3. Get the total height of the page.
1492
+ 4. Scroll back to the top of the page.
1493
+ 5. Scroll to the bottom of the page again.
1494
+ 6. Continue scrolling until the bottom of the page is reached.
1495
+
1496
+ Args:
1497
+ page (Page): The Playwright page object
1498
+ scroll_delay (float): The delay between page scrolls
1499
+
1500
+ """
1501
+ try:
1502
+ viewport_height = page.viewport_size.get(
1503
+ "height", self.browser_config.viewport_height
1504
+ )
1505
+ current_position = viewport_height
1506
+
1507
+ # await page.evaluate(f"window.scrollTo(0, {current_position})")
1508
+ await self.safe_scroll(page, 0, current_position, delay=scroll_delay)
1509
+ # await self.csp_scroll_to(page, 0, current_position)
1510
+ # await asyncio.sleep(scroll_delay)
1511
+
1512
+ # total_height = await page.evaluate("document.documentElement.scrollHeight")
1513
+ dimensions = await self.get_page_dimensions(page)
1514
+ total_height = dimensions['height']
1515
+
1516
+ while current_position < total_height:
1517
+ current_position = min(current_position + viewport_height, total_height)
1518
+ await self.safe_scroll(page, 0, current_position, delay=scroll_delay)
1519
+ # await page.evaluate(f"window.scrollTo(0, {current_position})")
1520
+ # await asyncio.sleep(scroll_delay)
1521
+
1522
+ # new_height = await page.evaluate("document.documentElement.scrollHeight")
1523
+ dimensions = await self.get_page_dimensions(page)
1524
+ new_height = dimensions['height']
1525
+
1526
+ if new_height > total_height:
1527
+ total_height = new_height
1528
+
1529
+ # await page.evaluate("window.scrollTo(0, 0)")
1530
+ await self.safe_scroll(page, 0, 0)
1531
+
1532
+ except Exception as e:
1533
+ self.logger.warning(
1534
+ message="Failed to perform full page scan: {error}",
1535
+ tag="PAGE_SCAN",
1536
+ params={"error": str(e)},
1537
+ )
1538
+ else:
1539
+ # await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
1540
+ await self.safe_scroll(page, 0, total_height)
1541
+
1542
+ async def _handle_download(self, download):
1543
+ """
1544
+ Handle file downloads.
1545
+
1546
+ How it works:
1547
+ 1. Get the suggested filename.
1548
+ 2. Get the download path.
1549
+ 3. Log the download.
1550
+ 4. Start the download.
1551
+ 5. Save the downloaded file.
1552
+ 6. Log the completion.
1553
+
1554
+ Args:
1555
+ download (Download): The Playwright download object
1556
+
1557
+ Returns:
1558
+ None
1559
+ """
1560
+ try:
1561
+ suggested_filename = download.suggested_filename
1562
+ download_path = os.path.join(self.downloads_path, suggested_filename)
1563
+
1564
+ self.logger.info(
1565
+ message="Downloading {filename} to {path}",
1566
+ tag="FETCH",
1567
+ params={"filename": suggested_filename, "path": download_path},
1568
+ )
1569
+
1570
+ start_time = time.perf_counter()
1571
+ await download.save_as(download_path)
1572
+ end_time = time.perf_counter()
1573
+ self._downloaded_files.append(download_path)
1574
+
1575
+ self.logger.success(
1576
+ message="Downloaded {filename} successfully",
1577
+ tag="COMPLETE",
1578
+ params={
1579
+ "filename": suggested_filename,
1580
+ "path": download_path,
1581
+ "duration": f"{end_time - start_time:.2f}s",
1582
+ },
1583
+ )
1584
+ except Exception as e:
1585
+ self.logger.error(
1586
+ message="Failed to handle download: {error}",
1587
+ tag="ERROR",
1588
+ params={"error": str(e)},
1589
+ )
1590
+
1591
+ async def remove_overlay_elements(self, page: Page) -> None:
1592
+ """
1593
+ Removes popup overlays, modals, cookie notices, and other intrusive elements from the page.
1594
+
1595
+ Args:
1596
+ page (Page): The Playwright page instance
1597
+ """
1598
+ remove_overlays_js = load_js_script("remove_overlay_elements")
1599
+
1600
+ try:
1601
+ await page.evaluate(f"""
1602
+ (() => {{
1603
+ try {{
1604
+ {remove_overlays_js}
1605
+ return {{ success: true }};
1606
+ }} catch (error) {{
1607
+ return {{
1608
+ success: false,
1609
+ error: error.toString(),
1610
+ stack: error.stack
1611
+ }};
1612
+ }}
1613
+ }})()
1614
+ """)
1615
+ await page.wait_for_timeout(500) # Wait for any animations to complete
1616
+ except Exception as e:
1617
+ self.logger.warning(
1618
+ message="Failed to remove overlay elements: {error}",
1619
+ tag="SCRAPE",
1620
+ params={"error": str(e)},
1621
+ )
1622
+
1623
+ async def export_pdf(self, page: Page) -> bytes:
1624
+ """
1625
+ Exports the current page as a PDF.
1626
+
1627
+ Args:
1628
+ page (Page): The Playwright page object
1629
+
1630
+ Returns:
1631
+ bytes: The PDF data
1632
+ """
1633
+ pdf_data = await page.pdf(print_background=True)
1634
+ return pdf_data
1635
+
1636
+ async def take_screenshot(self, page, **kwargs) -> str:
1637
+ """
1638
+ Take a screenshot of the current page.
1639
+
1640
+ Args:
1641
+ page (Page): The Playwright page object
1642
+ kwargs: Additional keyword arguments
1643
+
1644
+ Returns:
1645
+ str: The base64-encoded screenshot data
1646
+ """
1647
+ need_scroll = await self.page_need_scroll(page)
1648
+
1649
+ if not need_scroll:
1650
+ # Page is short enough, just take a screenshot
1651
+ return await self.take_screenshot_naive(page)
1652
+ else:
1653
+ # Page is too long, try to take a full-page screenshot
1654
+ return await self.take_screenshot_scroller(page, **kwargs)
1655
+ # return await self.take_screenshot_from_pdf(await self.export_pdf(page))
1656
+
1657
+ async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str:
1658
+ """
1659
+ Convert the first page of the PDF to a screenshot.
1660
+
1661
+ Requires pdf2image and poppler.
1662
+
1663
+ Args:
1664
+ pdf_data (bytes): The PDF data
1665
+
1666
+ Returns:
1667
+ str: The base64-encoded screenshot data
1668
+ """
1669
+ try:
1670
+ from pdf2image import convert_from_bytes
1671
+
1672
+ images = convert_from_bytes(pdf_data)
1673
+ final_img = images[0].convert("RGB")
1674
+ buffered = BytesIO()
1675
+ final_img.save(buffered, format="JPEG")
1676
+ return base64.b64encode(buffered.getvalue()).decode("utf-8")
1677
+ except Exception as e:
1678
+ error_message = f"Failed to take PDF-based screenshot: {str(e)}"
1679
+ self.logger.error(
1680
+ message="PDF Screenshot failed: {error}",
1681
+ tag="ERROR",
1682
+ params={"error": error_message},
1683
+ )
1684
+ # Return error image as fallback
1685
+ img = Image.new("RGB", (800, 600), color="black")
1686
+ draw = ImageDraw.Draw(img)
1687
+ font = ImageFont.load_default()
1688
+ draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
1689
+ buffered = BytesIO()
1690
+ img.save(buffered, format="JPEG")
1691
+ return base64.b64encode(buffered.getvalue()).decode("utf-8")
1692
+
1693
+ async def take_screenshot_scroller(self, page: Page, **kwargs) -> str:
1694
+ """
1695
+ Attempt to set a large viewport and take a full-page screenshot.
1696
+ If the page is still too tall, capture it in viewport-sized segments and stitch them together.
1697
+
1698
+ Requires Pillow (PIL) for stitching the segments.
1699
+
1700
+ Args:
1701
+ page (Page): The Playwright page object
1702
+ kwargs: Additional keyword arguments
1703
+
1704
+ Returns:
1705
+ str: The base64-encoded screenshot data
1706
+ """
1707
+ try:
1708
+ # Get page height
1709
+ dimensions = await self.get_page_dimensions(page)
1710
+ page_width = dimensions['width']
1711
+ page_height = dimensions['height']
1712
+ # page_height = await page.evaluate("document.documentElement.scrollHeight")
1713
+ # page_width = await page.evaluate("document.documentElement.scrollWidth")
1714
+
1715
+ # Set a large viewport
1716
+ large_viewport_height = min(
1717
+ page_height,
1718
+ kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD),
1719
+ )
1720
+ await page.set_viewport_size(
1721
+ {"width": page_width, "height": large_viewport_height}
1722
+ )
1723
+
1724
+ # Page still too long, segment approach
1725
+ segments = []
1726
+ viewport_size = page.viewport_size
1727
+ viewport_height = viewport_size["height"]
1728
+
1729
+ num_segments = (page_height // viewport_height) + 1
1730
+ for i in range(num_segments):
1731
+ y_offset = i * viewport_height
1732
+ await page.evaluate(f"window.scrollTo(0, {y_offset})")
1733
+ await asyncio.sleep(0.01) # wait for render
1734
+ seg_shot = await page.screenshot(full_page=False)
1735
+ img = Image.open(BytesIO(seg_shot)).convert("RGB")
1736
+ segments.append(img)
1737
+
1738
+ total_height = sum(img.height for img in segments)
1739
+ stitched = Image.new("RGB", (segments[0].width, total_height))
1740
+ offset = 0
1741
+ for img in segments:
1742
+ # stitched.paste(img, (0, offset))
1743
+ stitched.paste(img.convert("RGB"), (0, offset))
1744
+ offset += img.height
1745
+
1746
+ buffered = BytesIO()
1747
+ stitched = stitched.convert("RGB")
1748
+ stitched.save(buffered, format="BMP", quality=85)
1749
+ encoded = base64.b64encode(buffered.getvalue()).decode("utf-8")
1750
+
1751
+ return encoded
1752
+ except Exception as e:
1753
+ error_message = f"Failed to take large viewport screenshot: {str(e)}"
1754
+ self.logger.error(
1755
+ message="Large viewport screenshot failed: {error}",
1756
+ tag="ERROR",
1757
+ params={"error": error_message},
1758
+ )
1759
+ # return error image
1760
+ img = Image.new("RGB", (800, 600), color="black")
1761
+ draw = ImageDraw.Draw(img)
1762
+ font = ImageFont.load_default()
1763
+ draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
1764
+ buffered = BytesIO()
1765
+ img.save(buffered, format="JPEG")
1766
+ return base64.b64encode(buffered.getvalue()).decode("utf-8")
1767
+ finally:
1768
+ await page.close()
1769
+
1770
+ async def take_screenshot_naive(self, page: Page) -> str:
1771
+ """
1772
+ Takes a screenshot of the current page.
1773
+
1774
+ Args:
1775
+ page (Page): The Playwright page instance
1776
+
1777
+ Returns:
1778
+ str: Base64-encoded screenshot image
1779
+ """
1780
+ try:
1781
+ # The page is already loaded, just take the screenshot
1782
+ screenshot = await page.screenshot(full_page=False)
1783
+ return base64.b64encode(screenshot).decode("utf-8")
1784
+ except Exception as e:
1785
+ error_message = f"Failed to take screenshot: {str(e)}"
1786
+ self.logger.error(
1787
+ message="Screenshot failed: {error}",
1788
+ tag="ERROR",
1789
+ params={"error": error_message},
1790
+ )
1791
+
1792
+ # Generate an error image
1793
+ img = Image.new("RGB", (800, 600), color="black")
1794
+ draw = ImageDraw.Draw(img)
1795
+ font = ImageFont.load_default()
1796
+ draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
1797
+
1798
+ buffered = BytesIO()
1799
+ img.save(buffered, format="JPEG")
1800
+ return base64.b64encode(buffered.getvalue()).decode("utf-8")
1801
+ finally:
1802
+ await page.close()
1803
+
1804
+ async def export_storage_state(self, path: str = None) -> dict:
1805
+ """
1806
+ Exports the current storage state (cookies, localStorage, sessionStorage)
1807
+ to a JSON file at the specified path.
1808
+
1809
+ Args:
1810
+ path (str): The path to save the storage state JSON file
1811
+
1812
+ Returns:
1813
+ dict: The exported storage state
1814
+ """
1815
+ if self.default_context:
1816
+ state = await self.default_context.storage_state(path=path)
1817
+ self.logger.info(
1818
+ message="Exported storage state to {path}",
1819
+ tag="INFO",
1820
+ params={"path": path},
1821
+ )
1822
+ return state
1823
+ else:
1824
+ self.logger.warning(
1825
+ message="No default_context available to export storage state.",
1826
+ tag="WARNING",
1827
+ )
1828
+
1829
+ async def robust_execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]:
1830
+ """
1831
+ Executes user-provided JavaScript code with proper error handling and context,
1832
+ supporting both synchronous and async user code, plus navigations.
1833
+
1834
+ How it works:
1835
+ 1. Wait for load state 'domcontentloaded'
1836
+ 2. If js_code is a string, execute it directly
1837
+ 3. If js_code is a list, execute each element in sequence
1838
+ 4. Wait for load state 'domcontentloaded' after each script
1839
+ 5. Return results
1840
+
1841
+ Args:
1842
+ page (Page): The Playwright page instance
1843
+ js_code (Union[str, List[str]]): The JavaScript code to execute
1844
+
1845
+ Returns:
1846
+ Dict[str, Any]: The results of the execution
1847
+ """
1848
+ try:
1849
+ await page.wait_for_load_state('domcontentloaded')
1850
+
1851
+ if isinstance(js_code, str):
1852
+ scripts = [js_code]
1853
+ else:
1854
+ scripts = js_code
1855
+
1856
+ results = []
1857
+ for script in scripts:
1858
+ try:
1859
+ # Attempt the evaluate
1860
+ # If the user code triggers navigation, we catch the "context destroyed" error
1861
+ # then wait for the new page to load before continuing
1862
+ result = None
1863
+ try:
1864
+ result = await page.evaluate(f"""
1865
+ (async () => {{
1866
+ try {{
1867
+ {script}
1868
+ return {{ success: true }};
1869
+ }} catch (err) {{
1870
+ return {{ success: false, error: err.toString(), stack: err.stack }};
1871
+ }}
1872
+ }})();
1873
+ """)
1874
+ except Error as e:
1875
+ # If it's due to navigation destroying the context, handle gracefully
1876
+ if "Execution context was destroyed" in str(e):
1877
+ self.logger.info("Navigation triggered by script, waiting for load state", tag="JS_EXEC")
1878
+ try:
1879
+ await page.wait_for_load_state('load', timeout=30000)
1880
+ except Error as nav_err:
1881
+ self.logger.warning(
1882
+ message="Navigation wait failed: {error}",
1883
+ tag="JS_EXEC",
1884
+ params={"error": str(nav_err)}
1885
+ )
1886
+ try:
1887
+ await page.wait_for_load_state('networkidle', timeout=30000)
1888
+ except Error as nav_err:
1889
+ self.logger.warning(
1890
+ message="Network idle wait failed: {error}",
1891
+ tag="JS_EXEC",
1892
+ params={"error": str(nav_err)}
1893
+ )
1894
+ # Return partial success, or adapt as you see fit
1895
+ result = {
1896
+ "success": True,
1897
+ "info": "Navigation triggered, ignoring context destroyed error"
1898
+ }
1899
+ else:
1900
+ # It's some other error, log and continue
1901
+ self.logger.error(
1902
+ message="Playwright execution error: {error}",
1903
+ tag="JS_EXEC",
1904
+ params={"error": str(e)}
1905
+ )
1906
+ result = {"success": False, "error": str(e)}
1907
+
1908
+ # If we made it this far with no repeated error, do post-load waits
1909
+ t1 = time.time()
1910
+ try:
1911
+ await page.wait_for_load_state('domcontentloaded', timeout=5000)
1912
+ print("DOM content loaded after script execution in", time.time() - t1)
1913
+ except Error as e:
1914
+ self.logger.warning(
1915
+ message="DOM content load timeout: {error}",
1916
+ tag="JS_EXEC",
1917
+ params={"error": str(e)}
1918
+ )
1919
+
1920
+ # t1 = time.time()
1921
+ # try:
1922
+ # await page.wait_for_load_state('networkidle', timeout=5000)
1923
+ # print("Network idle after script execution in", time.time() - t1)
1924
+ # except Error as e:
1925
+ # self.logger.warning(
1926
+ # message="Network idle timeout: {error}",
1927
+ # tag="JS_EXEC",
1928
+ # params={"error": str(e)}
1929
+ # )
1930
+
1931
+ results.append(result if result else {"success": True})
1932
+
1933
+ except Exception as e:
1934
+ # Catch anything else
1935
+ self.logger.error(
1936
+ message="Script chunk failed: {error}",
1937
+ tag="JS_EXEC",
1938
+ params={"error": str(e)}
1939
+ )
1940
+ results.append({"success": False, "error": str(e)})
1941
+
1942
+ return {"success": True, "results": results}
1943
+
1944
+ except Exception as e:
1945
+ self.logger.error(
1946
+ message="Script execution failed: {error}",
1947
+ tag="JS_EXEC",
1948
+ params={"error": str(e)}
1949
+ )
1950
+ return {"success": False, "error": str(e)}
1951
+
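A notable detail in `robust_execute_user_script` is that a user script which navigates the page destroys its own execution context; Playwright then raises an error that the method treats as "navigation happened" rather than as a failure. Below is a hedged, standalone sketch of that pattern; it assumes an already-open `page`, and the helper name is illustrative, not part of the library.

```python
from playwright.async_api import Error as PlaywrightError

async def run_script_tolerating_navigation(page, script: str) -> dict:
    """Evaluate user JS; if it navigates the page, wait for the new document instead of failing."""
    try:
        return await page.evaluate(f"(() => {{ {script}; return {{ success: true }}; }})()")
    except PlaywrightError as e:
        if "Execution context was destroyed" in str(e):
            # The script navigated away: wait for the new page rather than treating this as an error.
            await page.wait_for_load_state("load", timeout=30000)
            return {"success": True, "info": "navigation triggered by script"}
        return {"success": False, "error": str(e)}
```

For example, `await run_script_tolerating_navigation(page, "document.querySelector('a').click()")` would survive the page load the click triggers.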
1952
+ async def execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]:
1953
+ """
1954
+ Executes user-provided JavaScript code with proper error handling and context.
1955
+
1956
+ Args:
1957
+ page: Playwright page object
1958
+ js_code: Single JavaScript string or list of JavaScript code strings
1959
+
1960
+ Returns:
1961
+ Dict containing execution status and results/errors
1962
+ """
1963
+ try:
1964
+ # Ensure the page is ready for script execution
1965
+ await page.wait_for_load_state('domcontentloaded')
1966
+
1967
+ # Handle single script or multiple scripts
1968
+ if isinstance(js_code, str):
1969
+ scripts = [js_code]
1970
+ else:
1971
+ scripts = js_code
1972
+
1973
+ results = []
1974
+ for script in scripts:
1975
+ try:
1976
+ # Execute the script and wait for network idle
1977
+ result = await page.evaluate(f"""
1978
+ (() => {{
1979
+ return new Promise((resolve) => {{
1980
+ try {{
1981
+ const result = (function() {{
1982
+ {script}
1983
+ }})();
1984
+
1985
+ // If result is a promise, wait for it
1986
+ if (result instanceof Promise) {{
1987
+ result.then(() => {{
1988
+ // Wait a bit for any triggered effects
1989
+ setTimeout(() => resolve({{ success: true }}), 100);
1990
+ }}).catch(error => {{
1991
+ resolve({{
1992
+ success: false,
1993
+ error: error.toString(),
1994
+ stack: error.stack
1995
+ }});
1996
+ }});
1997
+ }} else {{
1998
+ // For non-promise results, still wait a bit for effects
1999
+ setTimeout(() => resolve({{ success: true }}), 100);
2000
+ }}
2001
+ }} catch (error) {{
2002
+ resolve({{
2003
+ success: false,
2004
+ error: error.toString(),
2005
+ stack: error.stack
2006
+ }});
2007
+ }}
2008
+ }});
2009
+ }})()
2010
+ """)
2011
+
2012
+ # Wait for network idle after script execution
2013
+ t1 = time.time()
2014
+ await page.wait_for_load_state('domcontentloaded', timeout=5000)
2015
+ print("DOM content loaded after script execution in", time.time() - t1)
2016
+
2017
+ t1 = time.time()
2018
+ await page.wait_for_load_state('networkidle', timeout=5000)
2019
+ print("Network idle after script execution in", time.time() - t1)
2020
+
2021
+ results.append(result if result else {"success": True})
2022
+
2023
+ except Error as e:
2024
+ # Handle Playwright-specific errors
2025
+ self.logger.error(
2026
+ message="Playwright execution error: {error}",
2027
+ tag="JS_EXEC",
2028
+ params={"error": str(e)}
2029
+ )
2030
+ results.append({"success": False, "error": str(e)})
2031
+
2032
+ return {"success": True, "results": results}
2033
+
2034
+ except Exception as e:
2035
+ self.logger.error(
2036
+ message="Script execution failed: {error}",
2037
+ tag="JS_EXEC",
2038
+ params={"error": str(e)}
2039
+ )
2040
+ return {"success": False, "error": str(e)}
2041
+
2042
+ except Exception as e:
2043
+ self.logger.error(
2044
+ message="Script execution failed: {error}",
2045
+ tag="JS_EXEC",
2046
+ params={"error": str(e)}
2047
+ )
2048
+ return {"success": False, "error": str(e)}
2049
+
2050
+ async def check_visibility(self, page):
2051
+ """
2052
+ Checks if an element is visible on the page.
2053
+
2054
+ Args:
2055
+ page: Playwright page object
2056
+
2057
+ Returns:
2058
+ Boolean indicating visibility
2059
+ """
2060
+ return await page.evaluate("""
2061
+ () => {
2062
+ const element = document.body;
2063
+ if (!element) return false;
2064
+ const style = window.getComputedStyle(element);
2065
+ const isVisible = style.display !== 'none' &&
2066
+ style.visibility !== 'hidden' &&
2067
+ style.opacity !== '0';
2068
+ return isVisible;
2069
+ }
2070
+ """)
2071
+
2072
+ async def safe_scroll(self, page: Page, x: int, y: int, delay: float = 0.1):
2073
+ """
2074
+ Safely scroll the page with rendering time.
2075
+
2076
+ Args:
2077
+ page: Playwright page object
2078
+ x: Horizontal scroll position
2079
+ y: Vertical scroll position
+ delay: Time in seconds to wait after the scroll, to allow rendering
2080
+ """
2081
+ result = await self.csp_scroll_to(page, x, y)
2082
+ if result['success']:
2083
+ await page.wait_for_timeout(delay * 1000)
2084
+ return result
2085
+
2086
+ async def csp_scroll_to(self, page: Page, x: int, y: int) -> Dict[str, Any]:
2087
+ """
2088
+ Performs a CSP-compliant scroll operation and returns the result status.
2089
+
2090
+ Args:
2091
+ page: Playwright page object
2092
+ x: Horizontal scroll position
2093
+ y: Vertical scroll position
2094
+
2095
+ Returns:
2096
+ Dict containing scroll status and position information
2097
+ """
2098
+ try:
2099
+ result = await page.evaluate(
2100
+ f"""() => {{
2101
+ try {{
2102
+ const startX = window.scrollX;
2103
+ const startY = window.scrollY;
2104
+ window.scrollTo({x}, {y});
2105
+
2106
+ // Get final position after scroll
2107
+ const endX = window.scrollX;
2108
+ const endY = window.scrollY;
2109
+
2110
+ return {{
2111
+ success: true,
2112
+ startPosition: {{ x: startX, y: startY }},
2113
+ endPosition: {{ x: endX, y: endY }},
2114
+ targetPosition: {{ x: {x}, y: {y} }},
2115
+ delta: {{
2116
+ x: Math.abs(endX - {x}),
2117
+ y: Math.abs(endY - {y})
2118
+ }}
2119
+ }};
2120
+ }} catch (e) {{
2121
+ return {{
2122
+ success: false,
2123
+ error: e.toString()
2124
+ }};
2125
+ }}
2126
+ }}"""
2127
+ )
2128
+
2129
+ if not result['success']:
2130
+ self.logger.warning(
2131
+ message="Scroll operation failed: {error}",
2132
+ tag="SCROLL",
2133
+ params={"error": result.get('error')}
2134
+ )
2135
+
2136
+ return result
2137
+
2138
+ except Exception as e:
2139
+ self.logger.error(
2140
+ message="Failed to execute scroll: {error}",
2141
+ tag="SCROLL",
2142
+ params={"error": str(e)}
2143
+ )
2144
+ return {
2145
+ "success": False,
2146
+ "error": str(e)
2147
+ }
2148
+
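Because `csp_scroll_to` reports the requested target, the final position, and the delta between them, a caller can detect pages that clamp or block scrolling. A hedged sketch follows; it assumes `strategy` is an instance exposing the helper above, `page` is an open Playwright page, and the tolerance value is arbitrary.

```python
async def scroll_and_verify(strategy, page, y: int, tolerance: int = 10) -> bool:
    """Scroll to vertical position y and report whether the page actually got close to it."""
    result = await strategy.csp_scroll_to(page, 0, y)
    if not result.get("success"):
        return False
    # `delta` holds the absolute distance between the requested and the final position.
    return result["delta"]["y"] <= tolerance
```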
2149
+ async def get_page_dimensions(self, page: Page):
2150
+ """
2151
+ Get the dimensions of the page.
2152
+
2153
+ Args:
2154
+ page: Playwright page object
2155
+
2156
+ Returns:
2157
+ Dict containing width and height of the page
2158
+ """
2159
+ return await page.evaluate("""
2160
+ () => {
2161
+ const {scrollWidth, scrollHeight} = document.documentElement;
2162
+ return {width: scrollWidth, height: scrollHeight};
2163
+ }
2164
+ """)
2165
+
2166
+ async def page_need_scroll(self, page: Page) -> bool:
2167
+ """
2168
+ Determine whether the page needs to be scrolled
2169
+
2170
+ Args:
2171
+ page: Playwright page object
2172
+
2173
+ Returns:
2174
+ bool: True if page needs scrolling
2175
+ """
2176
+ try:
2177
+ need_scroll = await page.evaluate("""
2178
+ () => {
2179
+ const scrollHeight = document.documentElement.scrollHeight;
2180
+ const viewportHeight = window.innerHeight;
2181
+ return scrollHeight > viewportHeight;
2182
+ }
2183
+ """)
2184
+ return need_scroll
2185
+ except Exception as e:
2186
+ self.logger.warning(
2187
+ message="Failed to check scroll need: {error}. Defaulting to True for safety.",
2188
+ tag="SCROLL",
2189
+ params={"error": str(e)}
2190
+ )
2191
+ return True # Default to scrolling if check fails
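All of the screenshot helpers above return the image as a base64-encoded string rather than raw bytes, so a caller that wants a file on disk has to decode it first. A small sketch (the file name is a placeholder; the underlying encoding is PNG for the naive path but may be JPEG or BMP for the stitched and error paths):

```python
import base64

def save_screenshot(screenshot_b64: str, path: str = "screenshot.png") -> None:
    """Decode a base64 screenshot string, as returned by take_screenshot, and write it to disk."""
    with open(path, "wb") as f:
        f.write(base64.b64decode(screenshot_b64))
```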
crawl4ai/async_database.py ADDED
@@ -0,0 +1,495 @@
 
1
+ import os, sys
2
+ from pathlib import Path
3
+ import aiosqlite
4
+ import asyncio
5
+ from typing import Optional, Tuple, Dict
6
+ from contextlib import asynccontextmanager
7
+ import logging
8
+ import json # Added for serialization/deserialization
9
+ from .utils import ensure_content_dirs, generate_content_hash
10
+ from .models import CrawlResult, MarkdownGenerationResult
11
+ import xxhash
12
+ import aiofiles
13
+ from .config import NEED_MIGRATION
14
+ from .version_manager import VersionManager
15
+ from .async_logger import AsyncLogger
16
+ from .utils import get_error_context, create_box_message
17
+ # Set up logging
18
+ logging.basicConfig(level=logging.INFO)
19
+ logger = logging.getLogger(__name__)
20
+
21
+ base_directory = DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
22
+ os.makedirs(DB_PATH, exist_ok=True)
23
+ DB_PATH = os.path.join(base_directory, "crawl4ai.db")
24
+
25
+ class AsyncDatabaseManager:
26
+ def __init__(self, pool_size: int = 10, max_retries: int = 3):
27
+ self.db_path = DB_PATH
28
+ self.content_paths = ensure_content_dirs(os.path.dirname(DB_PATH))
29
+ self.pool_size = pool_size
30
+ self.max_retries = max_retries
31
+ self.connection_pool: Dict[int, aiosqlite.Connection] = {}
32
+ self.pool_lock = asyncio.Lock()
33
+ self.init_lock = asyncio.Lock()
34
+ self.connection_semaphore = asyncio.Semaphore(pool_size)
35
+ self._initialized = False
36
+ self.version_manager = VersionManager()
37
+ self.logger = AsyncLogger(
38
+ log_file=os.path.join(base_directory, "crawler_db.log"),
39
+ verbose=False,
40
+ tag_width=10
41
+ )
42
+
43
+
44
+ async def initialize(self):
45
+ """Initialize the database and connection pool"""
46
+ try:
47
+ self.logger.info("Initializing database", tag="INIT")
48
+ # Ensure the database file exists
49
+ os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
50
+
51
+ # Check if version update is needed
52
+ needs_update = self.version_manager.needs_update()
53
+
54
+ # Always ensure base table exists
55
+ await self.ainit_db()
56
+
57
+ # Verify the table exists
58
+ async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
59
+ async with db.execute(
60
+ "SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'"
61
+ ) as cursor:
62
+ result = await cursor.fetchone()
63
+ if not result:
64
+ raise Exception("crawled_data table was not created")
65
+
66
+ # If version changed or fresh install, run updates
67
+ if needs_update:
68
+ self.logger.info("New version detected, running updates", tag="INIT")
69
+ await self.update_db_schema()
70
+ from .migrations import run_migration # Import here to avoid circular imports
71
+ await run_migration()
72
+ self.version_manager.update_version() # Update stored version after successful migration
73
+ self.logger.success("Version update completed successfully", tag="COMPLETE")
74
+ else:
75
+ self.logger.success("Database initialization completed successfully", tag="COMPLETE")
76
+
77
+
78
+ except Exception as e:
79
+ self.logger.error(
80
+ message="Database initialization error: {error}",
81
+ tag="ERROR",
82
+ params={"error": str(e)}
83
+ )
84
+ self.logger.info(
85
+ message="Database will be initialized on first use",
86
+ tag="INIT"
87
+ )
88
+
89
+ raise
90
+
91
+
92
+ async def cleanup(self):
93
+ """Cleanup connections when shutting down"""
94
+ async with self.pool_lock:
95
+ for conn in self.connection_pool.values():
96
+ await conn.close()
97
+ self.connection_pool.clear()
98
+
99
+ @asynccontextmanager
100
+ async def get_connection(self):
101
+ """Connection pool manager with enhanced error handling"""
102
+ if not self._initialized:
103
+ async with self.init_lock:
104
+ if not self._initialized:
105
+ try:
106
+ await self.initialize()
107
+ self._initialized = True
108
+ except Exception as e:
109
+ import sys
110
+ error_context = get_error_context(sys.exc_info())
111
+ self.logger.error(
112
+ message="Database initialization failed:\n{error}\n\nContext:\n{context}\n\nTraceback:\n{traceback}",
113
+ tag="ERROR",
114
+ force_verbose=True,
115
+ params={
116
+ "error": str(e),
117
+ "context": error_context["code_context"],
118
+ "traceback": error_context["full_traceback"]
119
+ }
120
+ )
121
+ raise
122
+
123
+ await self.connection_semaphore.acquire()
124
+ task_id = id(asyncio.current_task())
125
+
126
+ try:
127
+ async with self.pool_lock:
128
+ if task_id not in self.connection_pool:
129
+ try:
130
+ conn = await aiosqlite.connect(
131
+ self.db_path,
132
+ timeout=30.0
133
+ )
134
+ await conn.execute('PRAGMA journal_mode = WAL')
135
+ await conn.execute('PRAGMA busy_timeout = 5000')
136
+
137
+ # Verify database structure
138
+ async with conn.execute("PRAGMA table_info(crawled_data)") as cursor:
139
+ columns = await cursor.fetchall()
140
+ column_names = [col[1] for col in columns]
141
+ expected_columns = {
142
+ 'url', 'html', 'cleaned_html', 'markdown', 'extracted_content',
143
+ 'success', 'media', 'links', 'metadata', 'screenshot',
144
+ 'response_headers', 'downloaded_files'
145
+ }
146
+ missing_columns = expected_columns - set(column_names)
147
+ if missing_columns:
148
+ raise ValueError(f"Database missing columns: {missing_columns}")
149
+
150
+ self.connection_pool[task_id] = conn
151
+ except Exception as e:
152
+ import sys
153
+ error_context = get_error_context(sys.exc_info())
154
+ error_message = (
155
+ f"Unexpected error in db get_connection at line {error_context['line_no']} "
156
+ f"in {error_context['function']} ({error_context['filename']}):\n"
157
+ f"Error: {str(e)}\n\n"
158
+ f"Code context:\n{error_context['code_context']}"
159
+ )
160
+ self.logger.error(
161
+ message=create_box_message(error_message, type="error"),
162
+ )
163
+
164
+ raise
165
+
166
+ yield self.connection_pool[task_id]
167
+
168
+ except Exception as e:
169
+ import sys
170
+ error_context = get_error_context(sys.exc_info())
171
+ error_message = (
172
+ f"Unexpected error in db get_connection at line {error_context['line_no']} "
173
+ f"in {error_context['function']} ({error_context['filename']}):\n"
174
+ f"Error: {str(e)}\n\n"
175
+ f"Code context:\n{error_context['code_context']}"
176
+ )
177
+ self.logger.error(
178
+ message=create_box_message(error_message, type="error"),
179
+ )
180
+ raise
181
+ finally:
182
+ async with self.pool_lock:
183
+ if task_id in self.connection_pool:
184
+ await self.connection_pool[task_id].close()
185
+ del self.connection_pool[task_id]
186
+ self.connection_semaphore.release()
187
+
188
+
189
+ async def execute_with_retry(self, operation, *args):
190
+ """Execute database operations with retry logic"""
191
+ for attempt in range(self.max_retries):
192
+ try:
193
+ async with self.get_connection() as db:
194
+ result = await operation(db, *args)
195
+ await db.commit()
196
+ return result
197
+ except Exception as e:
198
+ if attempt == self.max_retries - 1:
199
+ self.logger.error(
200
+ message="Operation failed after {retries} attempts: {error}",
201
+ tag="ERROR",
202
+ force_verbose=True,
203
+ params={
204
+ "retries": self.max_retries,
205
+ "error": str(e)
206
+ }
207
+ )
208
+ raise
209
+ await asyncio.sleep(1 * (attempt + 1)) # Exponential backoff
210
+
211
+ async def ainit_db(self):
212
+ """Initialize database schema"""
213
+ async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
214
+ await db.execute('''
215
+ CREATE TABLE IF NOT EXISTS crawled_data (
216
+ url TEXT PRIMARY KEY,
217
+ html TEXT,
218
+ cleaned_html TEXT,
219
+ markdown TEXT,
220
+ extracted_content TEXT,
221
+ success BOOLEAN,
222
+ media TEXT DEFAULT "{}",
223
+ links TEXT DEFAULT "{}",
224
+ metadata TEXT DEFAULT "{}",
225
+ screenshot TEXT DEFAULT "",
226
+ response_headers TEXT DEFAULT "{}",
227
+ downloaded_files TEXT DEFAULT "{}" -- New column added
228
+ )
229
+ ''')
230
+ await db.commit()
231
+
232
+
233
+
234
+ async def update_db_schema(self):
235
+ """Update database schema if needed"""
236
+ async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
237
+ cursor = await db.execute("PRAGMA table_info(crawled_data)")
238
+ columns = await cursor.fetchall()
239
+ column_names = [column[1] for column in columns]
240
+
241
+ # List of new columns to add
242
+ new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files']
243
+
244
+ for column in new_columns:
245
+ if column not in column_names:
246
+ await self.aalter_db_add_column(column, db)
247
+ await db.commit()
248
+
249
+ async def aalter_db_add_column(self, new_column: str, db):
250
+ """Add new column to the database"""
251
+ if new_column == 'response_headers':
252
+ await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"')
253
+ else:
254
+ await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
255
+ self.logger.info(
256
+ message="Added column '{column}' to the database",
257
+ tag="INIT",
258
+ params={"column": new_column}
259
+ )
260
+
261
+
262
+ async def aget_cached_url(self, url: str) -> Optional[CrawlResult]:
263
+ """Retrieve cached URL data as CrawlResult"""
264
+ async def _get(db):
265
+ async with db.execute(
266
+ 'SELECT * FROM crawled_data WHERE url = ?', (url,)
267
+ ) as cursor:
268
+ row = await cursor.fetchone()
269
+ if not row:
270
+ return None
271
+
272
+ # Get column names
273
+ columns = [description[0] for description in cursor.description]
274
+ # Create dict from row data
275
+ row_dict = dict(zip(columns, row))
276
+
277
+ # Load content from files using stored hashes
278
+ content_fields = {
279
+ 'html': row_dict['html'],
280
+ 'cleaned_html': row_dict['cleaned_html'],
281
+ 'markdown': row_dict['markdown'],
282
+ 'extracted_content': row_dict['extracted_content'],
283
+ 'screenshot': row_dict['screenshot'],
284
+ 'screenshots': row_dict['screenshot'],
285
+ }
286
+
287
+ for field, hash_value in content_fields.items():
288
+ if hash_value:
289
+ content = await self._load_content(
290
+ hash_value,
291
+ field.split('_')[0] # Get content type from field name
292
+ )
293
+ row_dict[field] = content or ""
294
+ else:
295
+ row_dict[field] = ""
296
+
297
+ # Parse JSON fields
298
+ json_fields = ['media', 'links', 'metadata', 'response_headers', 'markdown']
299
+ for field in json_fields:
300
+ try:
301
+ row_dict[field] = json.loads(row_dict[field]) if row_dict[field] else {}
302
+ except json.JSONDecodeError:
303
+ row_dict[field] = {}
304
+
305
+ if isinstance(row_dict['markdown'], Dict):
306
+ row_dict['markdown_v2'] = row_dict['markdown']
307
+ if row_dict['markdown'].get('raw_markdown'):
308
+ row_dict['markdown'] = row_dict['markdown']['raw_markdown']
309
+
310
+ # Parse downloaded_files
311
+ try:
312
+ row_dict['downloaded_files'] = json.loads(row_dict['downloaded_files']) if row_dict['downloaded_files'] else []
313
+ except json.JSONDecodeError:
314
+ row_dict['downloaded_files'] = []
315
+
316
+ # Remove any fields not in CrawlResult model
317
+ valid_fields = CrawlResult.__annotations__.keys()
318
+ filtered_dict = {k: v for k, v in row_dict.items() if k in valid_fields}
319
+
320
+ return CrawlResult(**filtered_dict)
321
+
322
+ try:
323
+ return await self.execute_with_retry(_get)
324
+ except Exception as e:
325
+ self.logger.error(
326
+ message="Error retrieving cached URL: {error}",
327
+ tag="ERROR",
328
+ force_verbose=True,
329
+ params={"error": str(e)}
330
+ )
331
+ return None
332
+
333
+ async def acache_url(self, result: CrawlResult):
334
+ """Cache CrawlResult data"""
335
+ # Store content files and get hashes
336
+ content_map = {
337
+ 'html': (result.html, 'html'),
338
+ 'cleaned_html': (result.cleaned_html or "", 'cleaned'),
339
+ 'markdown': None,
340
+ 'extracted_content': (result.extracted_content or "", 'extracted'),
341
+ 'screenshot': (result.screenshot or "", 'screenshots')
342
+ }
343
+
344
+ try:
345
+ if isinstance(result.markdown, MarkdownGenerationResult):
346
+ content_map['markdown'] = (result.markdown.model_dump_json(), 'markdown')
347
+ elif hasattr(result, 'markdown_v2'):
348
+ content_map['markdown'] = (result.markdown_v2.model_dump_json(), 'markdown')
349
+ elif isinstance(result.markdown, str):
350
+ markdown_result = MarkdownGenerationResult(raw_markdown=result.markdown)
351
+ content_map['markdown'] = (markdown_result.model_dump_json(), 'markdown')
352
+ else:
353
+ content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')
354
+ except Exception as e:
355
+ self.logger.warning(
356
+ message=f"Error processing markdown content: {str(e)}",
357
+ tag="WARNING"
358
+ )
359
+ # Fallback to empty markdown result
360
+ content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')
361
+
362
+ content_hashes = {}
363
+ for field, (content, content_type) in content_map.items():
364
+ content_hashes[field] = await self._store_content(content, content_type)
365
+
366
+ async def _cache(db):
367
+ await db.execute('''
368
+ INSERT INTO crawled_data (
369
+ url, html, cleaned_html, markdown,
370
+ extracted_content, success, media, links, metadata,
371
+ screenshot, response_headers, downloaded_files
372
+ )
373
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
374
+ ON CONFLICT(url) DO UPDATE SET
375
+ html = excluded.html,
376
+ cleaned_html = excluded.cleaned_html,
377
+ markdown = excluded.markdown,
378
+ extracted_content = excluded.extracted_content,
379
+ success = excluded.success,
380
+ media = excluded.media,
381
+ links = excluded.links,
382
+ metadata = excluded.metadata,
383
+ screenshot = excluded.screenshot,
384
+ response_headers = excluded.response_headers,
385
+ downloaded_files = excluded.downloaded_files
386
+ ''', (
387
+ result.url,
388
+ content_hashes['html'],
389
+ content_hashes['cleaned_html'],
390
+ content_hashes['markdown'],
391
+ content_hashes['extracted_content'],
392
+ result.success,
393
+ json.dumps(result.media),
394
+ json.dumps(result.links),
395
+ json.dumps(result.metadata or {}),
396
+ content_hashes['screenshot'],
397
+ json.dumps(result.response_headers or {}),
398
+ json.dumps(result.downloaded_files or [])
399
+ ))
400
+
401
+ try:
402
+ await self.execute_with_retry(_cache)
403
+ except Exception as e:
404
+ self.logger.error(
405
+ message="Error caching URL: {error}",
406
+ tag="ERROR",
407
+ force_verbose=True,
408
+ params={"error": str(e)}
409
+ )
410
+
411
+
412
+ async def aget_total_count(self) -> int:
413
+ """Get total number of cached URLs"""
414
+ async def _count(db):
415
+ async with db.execute('SELECT COUNT(*) FROM crawled_data') as cursor:
416
+ result = await cursor.fetchone()
417
+ return result[0] if result else 0
418
+
419
+ try:
420
+ return await self.execute_with_retry(_count)
421
+ except Exception as e:
422
+ self.logger.error(
423
+ message="Error getting total count: {error}",
424
+ tag="ERROR",
425
+ force_verbose=True,
426
+ params={"error": str(e)}
427
+ )
428
+ return 0
429
+
430
+ async def aclear_db(self):
431
+ """Clear all data from the database"""
432
+ async def _clear(db):
433
+ await db.execute('DELETE FROM crawled_data')
434
+
435
+ try:
436
+ await self.execute_with_retry(_clear)
437
+ except Exception as e:
438
+ self.logger.error(
439
+ message="Error clearing database: {error}",
440
+ tag="ERROR",
441
+ force_verbose=True,
442
+ params={"error": str(e)}
443
+ )
444
+
445
+ async def aflush_db(self):
446
+ """Drop the entire table"""
447
+ async def _flush(db):
448
+ await db.execute('DROP TABLE IF EXISTS crawled_data')
449
+
450
+ try:
451
+ await self.execute_with_retry(_flush)
452
+ except Exception as e:
453
+ self.logger.error(
454
+ message="Error flushing database: {error}",
455
+ tag="ERROR",
456
+ force_verbose=True,
457
+ params={"error": str(e)}
458
+ )
459
+
460
+
461
+ async def _store_content(self, content: str, content_type: str) -> str:
462
+ """Store content in filesystem and return hash"""
463
+ if not content:
464
+ return ""
465
+
466
+ content_hash = generate_content_hash(content)
467
+ file_path = os.path.join(self.content_paths[content_type], content_hash)
468
+
469
+ # Only write if file doesn't exist
470
+ if not os.path.exists(file_path):
471
+ async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
472
+ await f.write(content)
473
+
474
+ return content_hash
475
+
476
+ async def _load_content(self, content_hash: str, content_type: str) -> Optional[str]:
477
+ """Load content from filesystem by hash"""
478
+ if not content_hash:
479
+ return None
480
+
481
+ file_path = os.path.join(self.content_paths[content_type], content_hash)
482
+ try:
483
+ async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
484
+ return await f.read()
485
+ except Exception:
486
+ self.logger.error(
487
+ message="Failed to load content: {file_path}",
488
+ tag="ERROR",
489
+ force_verbose=True,
490
+ params={"file_path": file_path}
491
+ )
492
+ return None
493
+
494
+ # Create a singleton instance
495
+ async_db_manager = AsyncDatabaseManager()
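As a rough illustration of the caching round trip through the singleton manager above, here is a hedged sketch; it assumes a `CrawlResult` can be constructed from just `url`, `html`, and `success` with the remaining fields defaulting, and the URL is a placeholder.

```python
import asyncio

from crawl4ai.models import CrawlResult
from crawl4ai.async_database import async_db_manager

async def demo_cache() -> None:
    # Cache a minimal result, then read it back by URL.
    result = CrawlResult(url="https://example.com", html="<html></html>", success=True)
    await async_db_manager.acache_url(result)

    cached = await async_db_manager.aget_cached_url("https://example.com")
    if cached:
        print("cache hit:", cached.url, "success:", cached.success)
    print("total cached URLs:", await async_db_manager.aget_total_count())

asyncio.run(demo_cache())
```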
crawl4ai/async_logger.py ADDED
@@ -0,0 +1,231 @@
 
1
+ from enum import Enum
2
+ from typing import Optional, Dict, Any, Union
3
+ from colorama import Fore, Back, Style, init
4
+ import time
5
+ import os
6
+ from datetime import datetime
7
+
8
+ class LogLevel(Enum):
9
+ DEBUG = 1
10
+ INFO = 2
11
+ SUCCESS = 3
12
+ WARNING = 4
13
+ ERROR = 5
14
+
15
+ class AsyncLogger:
16
+ """
17
+ Asynchronous logger with support for colored console output and file logging.
18
+ Supports templated messages with colored components.
19
+ """
20
+
21
+ DEFAULT_ICONS = {
22
+ 'INIT': '→',
23
+ 'READY': '✓',
24
+ 'FETCH': '↓',
25
+ 'SCRAPE': '◆',
26
+ 'EXTRACT': '■',
27
+ 'COMPLETE': '●',
28
+ 'ERROR': '×',
29
+ 'DEBUG': '⋯',
30
+ 'INFO': 'ℹ',
31
+ 'WARNING': '⚠',
32
+ }
33
+
34
+ DEFAULT_COLORS = {
35
+ LogLevel.DEBUG: Fore.LIGHTBLACK_EX,
36
+ LogLevel.INFO: Fore.CYAN,
37
+ LogLevel.SUCCESS: Fore.GREEN,
38
+ LogLevel.WARNING: Fore.YELLOW,
39
+ LogLevel.ERROR: Fore.RED,
40
+ }
41
+
42
+ def __init__(
43
+ self,
44
+ log_file: Optional[str] = None,
45
+ log_level: LogLevel = LogLevel.DEBUG,
46
+ tag_width: int = 10,
47
+ icons: Optional[Dict[str, str]] = None,
48
+ colors: Optional[Dict[LogLevel, str]] = None,
49
+ verbose: bool = True
50
+ ):
51
+ """
52
+ Initialize the logger.
53
+
54
+ Args:
55
+ log_file: Optional file path for logging
56
+ log_level: Minimum log level to display
57
+ tag_width: Width for tag formatting
58
+ icons: Custom icons for different tags
59
+ colors: Custom colors for different log levels
60
+ verbose: Whether to output to console
61
+ """
62
+ init() # Initialize colorama
63
+ self.log_file = log_file
64
+ self.log_level = log_level
65
+ self.tag_width = tag_width
66
+ self.icons = icons or self.DEFAULT_ICONS
67
+ self.colors = colors or self.DEFAULT_COLORS
68
+ self.verbose = verbose
69
+
70
+ # Create log file directory if needed
71
+ if log_file:
72
+ os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)
73
+
74
+ def _format_tag(self, tag: str) -> str:
75
+ """Format a tag with consistent width."""
76
+ return f"[{tag}]".ljust(self.tag_width, ".")
77
+
78
+ def _get_icon(self, tag: str) -> str:
79
+ """Get the icon for a tag, defaulting to info icon if not found."""
80
+ return self.icons.get(tag, self.icons['INFO'])
81
+
82
+ def _write_to_file(self, message: str):
83
+ """Write a message to the log file if configured."""
84
+ if self.log_file:
85
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
86
+ with open(self.log_file, 'a', encoding='utf-8') as f:
87
+ # Strip ANSI color codes for file output
88
+ clean_message = message.replace(Fore.RESET, '').replace(Style.RESET_ALL, '')
89
+ for color in vars(Fore).values():
90
+ if isinstance(color, str):
91
+ clean_message = clean_message.replace(color, '')
92
+ f.write(f"[{timestamp}] {clean_message}\n")
93
+
94
+ def _log(
95
+ self,
96
+ level: LogLevel,
97
+ message: str,
98
+ tag: str,
99
+ params: Optional[Dict[str, Any]] = None,
100
+ colors: Optional[Dict[str, str]] = None,
101
+ base_color: Optional[str] = None,
102
+ **kwargs
103
+ ):
104
+ """
105
+ Core logging method that handles message formatting and output.
106
+
107
+ Args:
108
+ level: Log level for this message
109
+ message: Message template string
110
+ tag: Tag for the message
111
+ params: Parameters to format into the message
112
+ colors: Color overrides for specific parameters
113
+ base_color: Base color for the entire message
114
+ """
115
+ if level.value < self.log_level.value:
116
+ return
117
+
118
+ # Format the message with parameters if provided
119
+ if params:
120
+ try:
121
+ # First format the message with raw parameters
122
+ formatted_message = message.format(**params)
123
+
124
+ # Then apply colors if specified
125
+ if colors:
126
+ for key, color in colors.items():
127
+ # Find the formatted value in the message and wrap it with color
128
+ if key in params:
129
+ value_str = str(params[key])
130
+ formatted_message = formatted_message.replace(
131
+ value_str,
132
+ f"{color}{value_str}{Style.RESET_ALL}"
133
+ )
134
+
135
+ except KeyError as e:
136
+ formatted_message = f"LOGGING ERROR: Missing parameter {e} in message template"
137
+ level = LogLevel.ERROR
138
+ else:
139
+ formatted_message = message
140
+
141
+ # Construct the full log line
142
+ color = base_color or self.colors[level]
143
+ log_line = f"{color}{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message}{Style.RESET_ALL}"
144
+
145
+ # Output to console if verbose
146
+ if self.verbose or kwargs.get("force_verbose", False):
147
+ print(log_line)
148
+
149
+ # Write to file if configured
150
+ self._write_to_file(log_line)
151
+
152
+ def debug(self, message: str, tag: str = "DEBUG", **kwargs):
153
+ """Log a debug message."""
154
+ self._log(LogLevel.DEBUG, message, tag, **kwargs)
155
+
156
+ def info(self, message: str, tag: str = "INFO", **kwargs):
157
+ """Log an info message."""
158
+ self._log(LogLevel.INFO, message, tag, **kwargs)
159
+
160
+ def success(self, message: str, tag: str = "SUCCESS", **kwargs):
161
+ """Log a success message."""
162
+ self._log(LogLevel.SUCCESS, message, tag, **kwargs)
163
+
164
+ def warning(self, message: str, tag: str = "WARNING", **kwargs):
165
+ """Log a warning message."""
166
+ self._log(LogLevel.WARNING, message, tag, **kwargs)
167
+
168
+ def error(self, message: str, tag: str = "ERROR", **kwargs):
169
+ """Log an error message."""
170
+ self._log(LogLevel.ERROR, message, tag, **kwargs)
171
+
172
+ def url_status(
173
+ self,
174
+ url: str,
175
+ success: bool,
176
+ timing: float,
177
+ tag: str = "FETCH",
178
+ url_length: int = 50
179
+ ):
180
+ """
181
+ Convenience method for logging URL fetch status.
182
+
183
+ Args:
184
+ url: The URL being processed
185
+ success: Whether the operation was successful
186
+ timing: Time taken for the operation
187
+ tag: Tag for the message
188
+ url_length: Maximum length for URL in log
189
+ """
190
+ self._log(
191
+ level=LogLevel.SUCCESS if success else LogLevel.ERROR,
192
+ message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s",
193
+ tag=tag,
194
+ params={
195
+ "url": url,
196
+ "url_length": url_length,
197
+ "status": success,
198
+ "timing": timing
199
+ },
200
+ colors={
201
+ "status": Fore.GREEN if success else Fore.RED,
202
+ "timing": Fore.YELLOW
203
+ }
204
+ )
205
+
206
+ def error_status(
207
+ self,
208
+ url: str,
209
+ error: str,
210
+ tag: str = "ERROR",
211
+ url_length: int = 50
212
+ ):
213
+ """
214
+ Convenience method for logging error status.
215
+
216
+ Args:
217
+ url: The URL being processed
218
+ error: Error message
219
+ tag: Tag for the message
220
+ url_length: Maximum length for URL in log
221
+ """
222
+ self._log(
223
+ level=LogLevel.ERROR,
224
+ message="{url:.{url_length}}... | Error: {error}",
225
+ tag=tag,
226
+ params={
227
+ "url": url,
228
+ "url_length": url_length,
229
+ "error": error
230
+ }
231
+ )
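The sketch below illustrates the templated-message convention this logger expects: `message` is a `str.format` template filled from `params`, with optional per-parameter `colors`. The log file path is a placeholder.

```python
from colorama import Fore
from crawl4ai.async_logger import AsyncLogger, LogLevel

logger = AsyncLogger(log_file="/tmp/crawler.log", log_level=LogLevel.DEBUG, verbose=True)

# Plain templated message: {url} is filled from params, not by an f-string.
logger.info(message="Fetching {url}", tag="FETCH", params={"url": "https://example.com"})

# Highlight one parameter in a different color.
logger.success(
    message="Scraped {url} in {timing:.2f}s",
    tag="COMPLETE",
    params={"url": "https://example.com", "timing": 1.23},
    colors={"timing": Fore.YELLOW},
)

# Convenience helper for fetch results.
logger.url_status(url="https://example.com", success=True, timing=1.23)
```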
crawl4ai/async_webcrawler.py ADDED
@@ -0,0 +1,833 @@
 
1
+ import os, sys
2
+ import time
3
+ import warnings
4
+ from enum import Enum
5
+ from colorama import init, Fore, Back, Style
6
+ from pathlib import Path
7
+ from typing import Optional, List, Union
8
+ import json
9
+ import asyncio
10
+ # from contextlib import nullcontext, asynccontextmanager
11
+ from contextlib import asynccontextmanager
12
+ from .models import CrawlResult, MarkdownGenerationResult
13
+ from .async_database import async_db_manager
14
+ from .chunking_strategy import *
15
+ from .content_filter_strategy import *
16
+ from .extraction_strategy import *
17
+ from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
18
+ from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
19
+ from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
20
+ from .content_scraping_strategy import WebScrapingStrategy
21
+ from .async_logger import AsyncLogger
22
+ from .async_configs import BrowserConfig, CrawlerRunConfig
23
+ from .config import (
24
+ MIN_WORD_THRESHOLD,
25
+ IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
26
+ URL_LOG_SHORTEN_LENGTH
27
+ )
28
+ from .utils import (
29
+ sanitize_input_encode,
30
+ InvalidCSSSelectorError,
31
+ format_html,
32
+ fast_format_html,
33
+ create_box_message
34
+ )
35
+
36
+ from urllib.parse import urlparse
37
+ import random
38
+ from .__version__ import __version__ as crawl4ai_version
39
+
40
+
41
+ class AsyncWebCrawler:
42
+ """
43
+ Asynchronous web crawler with flexible caching capabilities.
44
+
45
+ There are two ways to use the crawler:
46
+
47
+ 1. Using context manager (recommended for simple cases):
48
+ ```python
49
+ async with AsyncWebCrawler() as crawler:
50
+ result = await crawler.arun(url="https://example.com")
51
+ ```
52
+
53
+ 2. Using explicit lifecycle management (recommended for long-running applications):
54
+ ```python
55
+ crawler = AsyncWebCrawler()
56
+ await crawler.start()
57
+
58
+ # Use the crawler multiple times
59
+ result1 = await crawler.arun(url="https://example.com")
60
+ result2 = await crawler.arun(url="https://another.com")
61
+
62
+ await crawler.close()
63
+ ```
64
+
65
+ Migration Guide:
66
+ Old way (deprecated):
67
+ crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True)
68
+
69
+ New way (recommended):
70
+ browser_config = BrowserConfig(browser_type="chromium", headless=True)
71
+ crawler = AsyncWebCrawler(config=browser_config)
72
+
73
+
74
+ Attributes:
75
+ browser_config (BrowserConfig): Configuration object for browser settings.
76
+ crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
77
+ logger (AsyncLogger): Logger instance for recording events and errors.
78
+ always_bypass_cache (bool): Whether to always bypass cache.
79
+ crawl4ai_folder (str): Directory for storing cache.
80
+ base_directory (str): Base directory for storing cache.
81
+ ready (bool): Whether the crawler is ready for use.
82
+
83
+ Methods:
84
+ start(): Start the crawler explicitly without using context manager.
85
+ close(): Close the crawler explicitly without using context manager.
86
+ arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
87
+ awarmup(): Perform warmup sequence.
88
+ arun_many(): Run the crawler for multiple sources.
89
+ aprocess_html(): Process HTML content.
90
+
91
+ Typical Usage:
92
+ async with AsyncWebCrawler() as crawler:
93
+ result = await crawler.arun(url="https://example.com")
94
+ print(result.markdown)
95
+
96
+ Using configuration:
97
+ browser_config = BrowserConfig(browser_type="chromium", headless=True)
98
+ async with AsyncWebCrawler(config=browser_config) as crawler:
99
+ crawler_config = CrawlerRunConfig(
100
+ cache_mode=CacheMode.BYPASS
101
+ )
102
+ result = await crawler.arun(url="https://example.com", config=crawler_config)
103
+ print(result.markdown)
104
+ """
105
+ _domain_last_hit = {}
106
+
107
+ def __init__(
108
+ self,
109
+ crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
110
+ config: Optional[BrowserConfig] = None,
111
+ always_bypass_cache: bool = False,
112
+ always_by_pass_cache: Optional[bool] = None, # Deprecated parameter
113
+ base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
114
+ thread_safe: bool = False,
115
+ **kwargs,
116
+ ):
117
+ """
118
+ Initialize the AsyncWebCrawler.
119
+
120
+ Args:
121
+ crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy
122
+ config: Configuration object for browser settings. If None, will be created from kwargs
123
+ always_bypass_cache: Whether to always bypass cache (new parameter)
124
+ always_by_pass_cache: Deprecated, use always_bypass_cache instead
125
+ base_directory: Base directory for storing cache
126
+ thread_safe: Whether to use thread-safe operations
127
+ **kwargs: Additional arguments for backwards compatibility
128
+ """
129
+ # Handle browser configuration
130
+ browser_config = config
131
+ if browser_config is not None:
132
+ if any(k in kwargs for k in ["browser_type", "headless", "viewport_width", "viewport_height"]):
133
+ warnings.warn(
+ "Both browser_config and legacy browser parameters provided. browser_config will take precedence.",
+ UserWarning,
+ stacklevel=2,
+ )
137
+ else:
138
+ # Create browser config from kwargs for backwards compatibility
139
+ browser_config = BrowserConfig.from_kwargs(kwargs)
140
+
141
+ self.browser_config = browser_config
142
+
143
+ # Initialize logger first since other components may need it
144
+ self.logger = AsyncLogger(
145
+ log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
146
+ verbose=self.browser_config.verbose,
147
+ tag_width=10
148
+ )
149
+
150
+
151
+ # Initialize crawler strategy
152
+ params = {
153
+ k: v for k, v in kwargs.items() if k in ['browser_config', 'logger']
154
+ }
155
+ self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
156
+ browser_config=browser_config,
157
+ logger=self.logger,
158
+ **params # Pass remaining kwargs for backwards compatibility
159
+ )
160
+
161
+ # If the crawler strategy doesn't have a logger, use the crawler's logger
162
+ if not self.crawler_strategy.logger:
163
+ self.crawler_strategy.logger = self.logger
164
+
165
+ # Handle deprecated cache parameter
166
+ if always_by_pass_cache is not None:
167
+ if kwargs.get("warning", True):
168
+ warnings.warn(
169
+ "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. "
170
+ "Use 'always_bypass_cache' instead. "
171
+ "Pass warning=False to suppress this warning.",
172
+ DeprecationWarning,
173
+ stacklevel=2
174
+ )
175
+ self.always_bypass_cache = always_by_pass_cache
176
+ else:
177
+ self.always_bypass_cache = always_bypass_cache
178
+
179
+ # Thread safety setup
180
+ self._lock = asyncio.Lock() if thread_safe else None
181
+
182
+ # Initialize directories
183
+ self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
184
+ os.makedirs(self.crawl4ai_folder, exist_ok=True)
185
+ os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
186
+
187
+ self.ready = False
188
+
189
+ async def start(self):
190
+ """
191
+ Start the crawler explicitly without using context manager.
192
+ This is equivalent to using 'async with' but gives more control over the lifecycle.
193
+
194
+ This method will:
195
+ 1. Initialize the browser and context
196
+ 2. Perform warmup sequence
197
+ 3. Return the crawler instance for method chaining
198
+
199
+ Returns:
200
+ AsyncWebCrawler: The initialized crawler instance
201
+ """
202
+ await self.crawler_strategy.__aenter__()
203
+ await self.awarmup()
204
+ return self
205
+
206
+ async def close(self):
207
+ """
208
+ Close the crawler explicitly without using context manager.
209
+ This should be called when you're done with the crawler if you used start().
210
+
211
+ This method will:
212
+ 1. Clean up browser resources
213
+ 2. Close any open pages and contexts
214
+ """
215
+ await self.crawler_strategy.__aexit__(None, None, None)
216
+
217
+ async def __aenter__(self):
218
+ return await self.start()
219
+
220
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
221
+ await self.close()
222
+
223
+ async def awarmup(self):
224
+ """
225
+ Initialize the crawler with warm-up sequence.
226
+
227
+ This method:
228
+ 1. Logs initialization info
229
+ 2. Sets up browser configuration
230
+ 3. Marks the crawler as ready
231
+ """
232
+ self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
233
+ self.ready = True
234
+
235
+ @asynccontextmanager
236
+ async def nullcontext(self):
237
+ """异步空上下文管理器"""
238
+ yield
239
+
240
+ async def arun(
241
+ self,
242
+ url: str,
243
+ config: Optional[CrawlerRunConfig] = None,
244
+ # Legacy parameters maintained for backwards compatibility
245
+ word_count_threshold=MIN_WORD_THRESHOLD,
246
+ extraction_strategy: ExtractionStrategy = None,
247
+ chunking_strategy: ChunkingStrategy = RegexChunking(),
248
+ content_filter: RelevantContentFilter = None,
249
+ cache_mode: Optional[CacheMode] = None,
250
+ # Deprecated cache parameters
251
+ bypass_cache: bool = False,
252
+ disable_cache: bool = False,
253
+ no_cache_read: bool = False,
254
+ no_cache_write: bool = False,
255
+ # Other legacy parameters
256
+ css_selector: str = None,
257
+ screenshot: bool = False,
258
+ pdf: bool = False,
259
+ user_agent: str = None,
260
+ verbose=True,
261
+ **kwargs,
262
+ ) -> CrawlResult:
263
+ """
264
+ Runs the crawler for a single source: URL (web, local file, or raw HTML).
265
+
266
+ Migration Guide:
267
+ Old way (deprecated):
268
+ result = await crawler.arun(
269
+ url="https://example.com",
270
+ word_count_threshold=200,
271
+ screenshot=True,
272
+ ...
273
+ )
274
+
275
+ New way (recommended):
276
+ config = CrawlerRunConfig(
277
+ word_count_threshold=200,
278
+ screenshot=True,
279
+ ...
280
+ )
281
+ result = await crawler.arun(url="https://example.com", config=config)
282
+
283
+ Args:
284
+ url: The URL to crawl (http://, https://, file://, or raw:)
285
+ config: Configuration object controlling crawl behavior
286
+ [other parameters maintained for backwards compatibility]
287
+
288
+ Returns:
289
+ CrawlResult: The result of crawling and processing
290
+ """
291
+ crawler_config = config
292
+ if not isinstance(url, str) or not url:
293
+ raise ValueError("Invalid URL, make sure the URL is a non-empty string")
294
+
295
+ async with self._lock or self.nullcontext():
296
+ try:
297
+ # Handle configuration
298
+ if crawler_config is not None:
299
+ # if any(param is not None for param in [
300
+ # word_count_threshold, extraction_strategy, chunking_strategy,
301
+ # content_filter, cache_mode, css_selector, screenshot, pdf
302
+ # ]):
303
+ # self.logger.warning(
304
+ # message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
305
+ # tag="WARNING"
306
+ # )
307
+ config = crawler_config
308
+ else:
309
+ # Merge all parameters into a single kwargs dict for config creation
310
+ config_kwargs = {
311
+ "word_count_threshold": word_count_threshold,
312
+ "extraction_strategy": extraction_strategy,
313
+ "chunking_strategy": chunking_strategy,
314
+ "content_filter": content_filter,
315
+ "cache_mode": cache_mode,
316
+ "bypass_cache": bypass_cache,
317
+ "disable_cache": disable_cache,
318
+ "no_cache_read": no_cache_read,
319
+ "no_cache_write": no_cache_write,
320
+ "css_selector": css_selector,
321
+ "screenshot": screenshot,
322
+ "pdf": pdf,
323
+ "verbose": verbose,
324
+ **kwargs
325
+ }
326
+ config = CrawlerRunConfig.from_kwargs(config_kwargs)
327
+
328
+ # Handle deprecated cache parameters
329
+ if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
330
+ if kwargs.get("warning", True):
331
+ warnings.warn(
332
+ "Cache control boolean flags are deprecated and will be removed in version 0.5.0. "
333
+ "Use 'cache_mode' parameter instead.",
334
+ DeprecationWarning,
335
+ stacklevel=2
336
+ )
337
+
338
+ # Convert legacy parameters if cache_mode not provided
339
+ if config.cache_mode is None:
340
+ config.cache_mode = _legacy_to_cache_mode(
341
+ disable_cache=disable_cache,
342
+ bypass_cache=bypass_cache,
343
+ no_cache_read=no_cache_read,
344
+ no_cache_write=no_cache_write
345
+ )
346
+
347
+ # Default to ENABLED if no cache mode specified
348
+ if config.cache_mode is None:
349
+ config.cache_mode = CacheMode.ENABLED
350
+
351
+ # Create cache context
352
+ cache_context = CacheContext(url, config.cache_mode, self.always_bypass_cache)
353
+
354
+ # Initialize processing variables
355
+ async_response: AsyncCrawlResponse = None
356
+ cached_result: CrawlResult = None
357
+ screenshot_data = None
358
+ pdf_data = None
359
+ extracted_content = None
360
+ start_time = time.perf_counter()
361
+
362
+ # Try to get cached result if appropriate
363
+ if cache_context.should_read():
364
+ cached_result = await async_db_manager.aget_cached_url(url)
365
+
366
+ if cached_result:
367
+ html = sanitize_input_encode(cached_result.html)
368
+ extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
369
+ extracted_content = None if not extracted_content or extracted_content == "[]" else extracted_content
370
+ # If a screenshot or PDF is requested but missing from the cache, invalidate the cached result
371
+ screenshot_data = cached_result.screenshot
372
+ pdf_data = cached_result.pdf
373
+ if (config.screenshot and not screenshot_data) or (config.pdf and not pdf_data):
374
+ cached_result = None
375
+
376
+ self.logger.url_status(
377
+ url=cache_context.display_url,
378
+ success=bool(html),
379
+ timing=time.perf_counter() - start_time,
380
+ tag="FETCH"
381
+ )
382
+
383
+ # Fetch fresh content if needed
384
+ if not cached_result or not html:
385
+ t1 = time.perf_counter()
386
+
387
+ if user_agent:
388
+ self.crawler_strategy.update_user_agent(user_agent)
389
+
390
+ # Pass config to crawl method
391
+ async_response = await self.crawler_strategy.crawl(
392
+ url,
393
+ config=config # Pass the entire config object
394
+ )
395
+
396
+ html = sanitize_input_encode(async_response.html)
397
+ screenshot_data = async_response.screenshot
398
+ pdf_data = async_response.pdf_data
399
+
400
+ t2 = time.perf_counter()
401
+ self.logger.url_status(
402
+ url=cache_context.display_url,
403
+ success=bool(html),
404
+ timing=t2 - t1,
405
+ tag="FETCH"
406
+ )
407
+
408
+ # Process the HTML content
409
+ crawl_result = await self.aprocess_html(
410
+ url=url,
411
+ html=html,
412
+ extracted_content=extracted_content,
413
+ config=config, # Pass the config object instead of individual parameters
414
+ screenshot=screenshot_data,
415
+ pdf_data=pdf_data,
416
+ verbose=config.verbose,
417
+ is_raw_html = True if url.startswith("raw:") else False,
418
+ **kwargs
419
+ )
420
+
421
+ crawl_result.status_code = async_response.status_code
422
+ crawl_result.response_headers = async_response.response_headers
423
+ crawl_result.downloaded_files = async_response.downloaded_files
424
+ crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate
425
+
426
+ # # Check and set values from async_response to crawl_result
427
+ # try:
428
+ # for key in vars(async_response):
429
+ # if hasattr(crawl_result, key):
430
+ # value = getattr(async_response, key, None)
431
+ # current_value = getattr(crawl_result, key, None)
432
+ # if value is not None and not current_value:
433
+ # try:
434
+ # setattr(crawl_result, key, value)
435
+ # except Exception as e:
436
+ # self.logger.warning(
437
+ # message=f"Failed to set attribute {key}: {str(e)}",
438
+ # tag="WARNING"
439
+ # )
440
+ # except Exception as e:
441
+ # self.logger.warning(
442
+ # message=f"Error copying response attributes: {str(e)}",
443
+ # tag="WARNING"
444
+ # )
445
+
446
+ crawl_result.success = bool(html)
447
+ crawl_result.session_id = getattr(config, 'session_id', None)
448
+
449
+ self.logger.success(
450
+ message="{url:.50}... | Status: {status} | Total: {timing}",
451
+ tag="COMPLETE",
452
+ params={
453
+ "url": cache_context.display_url,
454
+ "status": crawl_result.success,
455
+ "timing": f"{time.perf_counter() - start_time:.2f}s"
456
+ },
457
+ colors={
458
+ "status": Fore.GREEN if crawl_result.success else Fore.RED,
459
+ "timing": Fore.YELLOW
460
+ }
461
+ )
462
+
463
+ # Update cache if appropriate
464
+ if cache_context.should_write() and not bool(cached_result):
465
+ await async_db_manager.acache_url(crawl_result)
466
+
467
+ return crawl_result
468
+
469
+ else:
470
+ self.logger.success(
471
+ message="{url:.50}... | Status: {status} | Total: {timing}",
472
+ tag="COMPLETE",
473
+ params={
474
+ "url": cache_context.display_url,
475
+ "status": True,
476
+ "timing": f"{time.perf_counter() - start_time:.2f}s"
477
+ },
478
+ colors={
479
+ "status": Fore.GREEN,
480
+ "timing": Fore.YELLOW
481
+ }
482
+ )
483
+
484
+ cached_result.success = bool(html)
485
+ cached_result.session_id = getattr(config, 'session_id', None)
486
+ return cached_result
487
+
488
+ except Exception as e:
489
+ error_context = get_error_context(sys.exc_info())
490
+
491
+ error_message = (
492
+ f"Unexpected error in _crawl_web at line {error_context['line_no']} "
493
+ f"in {error_context['function']} ({error_context['filename']}):\n"
494
+ f"Error: {str(e)}\n\n"
495
+ f"Code context:\n{error_context['code_context']}"
496
+ )
497
+ # if not hasattr(e, "msg"):
498
+ # e.msg = str(e)
499
+
500
+ self.logger.error_status(
501
+ url=url,
502
+ error=create_box_message(error_message, type="error"),
503
+ tag="ERROR"
504
+ )
505
+
506
+ return CrawlResult(
507
+ url=url,
508
+ html="",
509
+ success=False,
510
+ error_message=error_message
511
+ )
512
+
513
+ async def aprocess_html(
514
+ self,
515
+ url: str,
516
+ html: str,
517
+ extracted_content: str,
518
+ config: CrawlerRunConfig,
519
+ screenshot: str,
520
+ pdf_data: str,
521
+ verbose: bool,
522
+ **kwargs,
523
+ ) -> CrawlResult:
524
+ """
525
+ Process HTML content using the provided configuration.
526
+
527
+ Args:
528
+ url: The URL being processed
529
+ html: Raw HTML content
530
+ extracted_content: Previously extracted content (if any)
531
+ config: Configuration object controlling processing behavior
532
+ screenshot: Screenshot data (if any)
533
+ pdf_data: PDF data (if any)
534
+ verbose: Whether to enable verbose logging
535
+ **kwargs: Additional parameters for backwards compatibility
536
+
537
+ Returns:
538
+ CrawlResult: Processed result containing extracted and formatted content
539
+ """
540
+ try:
541
+ _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
542
+ t1 = time.perf_counter()
543
+
544
+ # Initialize scraping strategy
545
+ scraping_strategy = WebScrapingStrategy(logger=self.logger)
546
+
547
+ # Process HTML content
548
+ params = {k:v for k, v in config.to_dict().items() if k not in ["url"]}
549
+ # Add keys from kwargs that don't already exist in params
550
+ params.update({k:v for k, v in kwargs.items() if k not in params.keys()})
551
+
552
+ result = scraping_strategy.scrap(
553
+ url,
554
+ html,
555
+ **params,
556
+ # word_count_threshold=config.word_count_threshold,
557
+ # css_selector=config.css_selector,
558
+ # only_text=config.only_text,
559
+ # image_description_min_word_threshold=config.image_description_min_word_threshold,
560
+ # content_filter=config.content_filter,
561
+ # **kwargs
562
+ )
563
+
564
+ if result is None:
565
+ raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")
566
+
567
+ except InvalidCSSSelectorError as e:
568
+ raise ValueError(str(e))
569
+ except Exception as e:
570
+ raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
571
+
572
+
573
+
574
+ # Extract results
575
+ cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
576
+ fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
577
+ fit_html = sanitize_input_encode(result.get("fit_html", ""))
578
+ media = result.get("media", [])
579
+ links = result.get("links", [])
580
+ metadata = result.get("metadata", {})
581
+
582
+ # Markdown Generation
583
+ markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
584
+
585
+ # Uncomment if by default we want to use PruningContentFilter
586
+ # if not config.content_filter and not markdown_generator.content_filter:
587
+ # markdown_generator.content_filter = PruningContentFilter()
588
+
589
+ markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
590
+ cleaned_html=cleaned_html,
591
+ base_url=url,
592
+ # html2text_options=kwargs.get('html2text', {})
593
+ )
594
+ markdown_v2 = markdown_result
595
+ markdown = sanitize_input_encode(markdown_result.raw_markdown)
596
+
597
+ # Log processing completion
598
+ self.logger.info(
599
+ message="Processed {url:.50}... | Time: {timing}ms",
600
+ tag="SCRAPE",
601
+ params={
602
+ "url": _url,
603
+ "timing": int((time.perf_counter() - t1) * 1000)
604
+ }
605
+ )
606
+
607
+ # Handle content extraction if needed
608
+ if (extracted_content is None and
609
+ config.extraction_strategy and
610
+ config.chunking_strategy and
611
+ not isinstance(config.extraction_strategy, NoExtractionStrategy)):
612
+
613
+ t1 = time.perf_counter()
614
+
615
+ # Choose content based on input_format
616
+ content_format = config.extraction_strategy.input_format
617
+ if content_format == "fit_markdown" and not markdown_result.fit_markdown:
618
+ self.logger.warning(
619
+ message="Fit markdown requested but not available. Falling back to raw markdown.",
620
+ tag="EXTRACT",
621
+ params={"url": _url}
622
+ )
623
+ content_format = "markdown"
624
+
625
+ content = {
626
+ "markdown": markdown,
627
+ "html": html,
628
+ "fit_markdown": markdown_result.raw_markdown
629
+ }.get(content_format, markdown)
630
+
631
+ # Use IdentityChunking for HTML input, otherwise use provided chunking strategy
632
+ chunking = IdentityChunking() if content_format == "html" else config.chunking_strategy
633
+ sections = chunking.chunk(content)
634
+ extracted_content = config.extraction_strategy.run(url, sections)
635
+ extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
636
+
637
+ # Log extraction completion
638
+ self.logger.info(
639
+ message="Completed for {url:.50}... | Time: {timing}s",
640
+ tag="EXTRACT",
641
+ params={
642
+ "url": _url,
643
+ "timing": time.perf_counter() - t1
644
+ }
645
+ )
646
+
647
+ # Handle screenshot and PDF data
648
+ screenshot_data = None if not screenshot else screenshot
649
+ pdf_data = None if not pdf_data else pdf_data
650
+
651
+ # Apply HTML formatting if requested
652
+ if config.prettiify:
653
+ cleaned_html = fast_format_html(cleaned_html)
654
+
655
+ # Return complete crawl result
656
+ return CrawlResult(
657
+ url=url,
658
+ html=html,
659
+ cleaned_html=cleaned_html,
660
+ markdown_v2=markdown_v2,
661
+ markdown=markdown,
662
+ fit_markdown=fit_markdown,
663
+ fit_html=fit_html,
664
+ media=media,
665
+ links=links,
666
+ metadata=metadata,
667
+ screenshot=screenshot_data,
668
+ pdf=pdf_data,
669
+ extracted_content=extracted_content,
670
+ success=True,
671
+ error_message="",
672
+ )
673
+
674
+ async def arun_many(
675
+ self,
676
+ urls: List[str],
677
+ config: Optional[CrawlerRunConfig] = None,
678
+ # Legacy parameters maintained for backwards compatibility
679
+ word_count_threshold=MIN_WORD_THRESHOLD,
680
+ extraction_strategy: ExtractionStrategy = None,
681
+ chunking_strategy: ChunkingStrategy = RegexChunking(),
682
+ content_filter: RelevantContentFilter = None,
683
+ cache_mode: Optional[CacheMode] = None,
684
+ bypass_cache: bool = False,
685
+ css_selector: str = None,
686
+ screenshot: bool = False,
687
+ pdf: bool = False,
688
+ user_agent: str = None,
689
+ verbose=True,
690
+ **kwargs,
691
+ ) -> List[CrawlResult]:
692
+ """
693
+ Runs the crawler for multiple URLs concurrently.
694
+
695
+ Migration Guide:
696
+ Old way (deprecated):
697
+ results = await crawler.arun_many(
698
+ urls,
699
+ word_count_threshold=200,
700
+ screenshot=True,
701
+ ...
702
+ )
703
+
704
+ New way (recommended):
705
+ config = CrawlerRunConfig(
706
+ word_count_threshold=200,
707
+ screenshot=True,
708
+ ...
709
+ )
710
+ results = await crawler.arun_many(urls, config=config)
711
+
712
+ Args:
713
+ urls: List of URLs to crawl
714
+ config: Configuration object controlling crawl behavior for all URLs
715
+ [other parameters maintained for backwards compatibility]
716
+
717
+ Returns:
718
+ List[CrawlResult]: Results for each URL
719
+ """
720
+ crawler_config = config
721
+ # Handle configuration
722
+ if crawler_config is not None:
723
+ if any(param is not None for param in [
724
+ word_count_threshold, extraction_strategy, chunking_strategy,
725
+ content_filter, cache_mode, css_selector, screenshot, pdf
726
+ ]):
727
+ self.logger.warning(
728
+ message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
729
+ tag="WARNING"
730
+ )
731
+ config = crawler_config
732
+ else:
733
+ # Merge all parameters into a single kwargs dict for config creation
734
+ config_kwargs = {
735
+ "word_count_threshold": word_count_threshold,
736
+ "extraction_strategy": extraction_strategy,
737
+ "chunking_strategy": chunking_strategy,
738
+ "content_filter": content_filter,
739
+ "cache_mode": cache_mode,
740
+ "bypass_cache": bypass_cache,
741
+ "css_selector": css_selector,
742
+ "screenshot": screenshot,
743
+ "pdf": pdf,
744
+ "verbose": verbose,
745
+ **kwargs
746
+ }
747
+ config = CrawlerRunConfig.from_kwargs(config_kwargs)
748
+
749
+ if bypass_cache:
750
+ if kwargs.get("warning", True):
751
+ warnings.warn(
752
+ "'bypass_cache' is deprecated and will be removed in version 0.5.0. "
753
+ "Use 'cache_mode=CacheMode.BYPASS' instead. "
754
+ "Pass warning=False to suppress this warning.",
755
+ DeprecationWarning,
756
+ stacklevel=2
757
+ )
758
+ if config.cache_mode is None:
759
+ config.cache_mode = CacheMode.BYPASS
760
+
761
+ semaphore_count = config.semaphore_count or 5
762
+ semaphore = asyncio.Semaphore(semaphore_count)
763
+
764
+ async def crawl_with_semaphore(url):
765
+ # Handle rate limiting per domain
766
+ domain = urlparse(url).netloc
767
+ current_time = time.time()
768
+
769
+ self.logger.debug(
770
+ message="Started task for {url:.50}...",
771
+ tag="PARALLEL",
772
+ params={"url": url}
773
+ )
774
+
775
+ # Get delay settings from config
776
+ mean_delay = config.mean_delay
777
+ max_range = config.max_range
778
+
779
+ # Apply rate limiting
780
+ if domain in self._domain_last_hit:
781
+ time_since_last = current_time - self._domain_last_hit[domain]
782
+ if time_since_last < mean_delay:
783
+ delay = mean_delay + random.uniform(0, max_range)
784
+ await asyncio.sleep(delay)
785
+
786
+ self._domain_last_hit[domain] = current_time
787
+
788
+ async with semaphore:
789
+ return await self.arun(
790
+ url,
791
+ config=config,  # Pass the entire config object
792
+ user_agent=user_agent # Maintain user_agent override capability
793
+ )
794
+
795
+ # Log start of concurrent crawling
796
+ self.logger.info(
797
+ message="Starting concurrent crawling for {count} URLs...",
798
+ tag="INIT",
799
+ params={"count": len(urls)}
800
+ )
801
+
802
+ # Execute concurrent crawls
803
+ start_time = time.perf_counter()
804
+ tasks = [crawl_with_semaphore(url) for url in urls]
805
+ results = await asyncio.gather(*tasks, return_exceptions=True)
806
+ end_time = time.perf_counter()
807
+
808
+ # Log completion
809
+ self.logger.success(
810
+ message="Concurrent crawling completed for {count} URLs | Total time: {timing}",
811
+ tag="COMPLETE",
812
+ params={
813
+ "count": len(urls),
814
+ "timing": f"{end_time - start_time:.2f}s"
815
+ },
816
+ colors={
817
+ "timing": Fore.YELLOW
818
+ }
819
+ )
820
+
821
+ return [result if not isinstance(result, Exception) else str(result) for result in results]
822
+
823
+ async def aclear_cache(self):
824
+ """Clear the cache database."""
825
+ await async_db_manager.cleanup()
826
+
827
+ async def aflush_cache(self):
828
+ """Flush the cache database."""
829
+ await async_db_manager.aflush_db()
830
+
831
+ async def aget_cache_size(self):
832
+ """Get the total number of cached items."""
833
+ return await async_db_manager.aget_total_count()
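A minimal usage sketch of the crawler added above, following the migration guide in arun's docstring. It assumes the package re-exports AsyncWebCrawler, CrawlerRunConfig and CacheMode from the top-level crawl4ai module (otherwise import them from their submodules); the URL is only a placeholder.

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode  # assumed top-level re-exports

async def main():
    # Configure the run once, instead of passing legacy keyword arguments to arun()
    run_config = CrawlerRunConfig(
        word_count_threshold=200,
        screenshot=True,
        cache_mode=CacheMode.BYPASS,  # skip the local cache for this run
    )
    # The context manager calls start()/close() (browser setup and teardown) for us
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=run_config)
        if result.success:
            print(result.markdown[:300])

asyncio.run(main())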
crawl4ai/cache_context.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+
3
+
4
+ class CacheMode(Enum):
5
+ """
6
+ Defines the caching behavior for web crawling operations.
7
+
8
+ Modes:
9
+ - ENABLED: Normal caching behavior (read and write)
10
+ - DISABLED: No caching at all
11
+ - READ_ONLY: Only read from cache, don't write
12
+ - WRITE_ONLY: Only write to cache, don't read
13
+ - BYPASS: Bypass cache for this operation
14
+ """
15
+ ENABLED = "enabled"
16
+ DISABLED = "disabled"
17
+ READ_ONLY = "read_only"
18
+ WRITE_ONLY = "write_only"
19
+ BYPASS = "bypass"
20
+
21
+
22
+ class CacheContext:
23
+ """
24
+ Encapsulates cache-related decisions and URL handling.
25
+
26
+ This class centralizes all cache-related logic and URL type checking,
27
+ making the caching behavior more predictable and maintainable.
28
+
29
+ Attributes:
30
+ url (str): The URL being processed.
31
+ cache_mode (CacheMode): The cache mode for the current operation.
32
+ always_bypass (bool): If True, bypasses caching for this operation.
33
+ is_cacheable (bool): True if the URL is cacheable, False otherwise.
34
+ is_web_url (bool): True if the URL is a web URL, False otherwise.
35
+ is_local_file (bool): True if the URL is a local file, False otherwise.
36
+ is_raw_html (bool): True if the URL is raw HTML, False otherwise.
37
+ _url_display (str): The display name for the URL (web, local file, or raw HTML).
38
+ """
39
+ def __init__(self, url: str, cache_mode: CacheMode, always_bypass: bool = False):
40
+ """
41
+ Initializes the CacheContext with the provided URL and cache mode.
42
+
43
+ Args:
44
+ url (str): The URL being processed.
45
+ cache_mode (CacheMode): The cache mode for the current operation.
46
+ always_bypass (bool): If True, bypasses caching for this operation.
47
+ """
48
+ self.url = url
49
+ self.cache_mode = cache_mode
50
+ self.always_bypass = always_bypass
51
+ self.is_cacheable = url.startswith(('http://', 'https://', 'file://'))
52
+ self.is_web_url = url.startswith(('http://', 'https://'))
53
+ self.is_local_file = url.startswith("file://")
54
+ self.is_raw_html = url.startswith("raw:")
55
+ self._url_display = url if not self.is_raw_html else "Raw HTML"
56
+
57
+ def should_read(self) -> bool:
58
+ """
59
+ Determines if cache should be read based on context.
60
+
61
+ How it works:
62
+ 1. If always_bypass is True or is_cacheable is False, return False.
63
+ 2. If cache_mode is ENABLED or READ_ONLY, return True.
64
+
65
+ Returns:
66
+ bool: True if cache should be read, False otherwise.
67
+ """
68
+ if self.always_bypass or not self.is_cacheable:
69
+ return False
70
+ return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY]
71
+
72
+ def should_write(self) -> bool:
73
+ """
74
+ Determines if cache should be written based on context.
75
+
76
+ How it works:
77
+ 1. If always_bypass is True or is_cacheable is False, return False.
78
+ 2. If cache_mode is ENABLED or WRITE_ONLY, return True.
79
+
80
+ Returns:
81
+ bool: True if cache should be written, False otherwise.
82
+ """
83
+ if self.always_bypass or not self.is_cacheable:
84
+ return False
85
+ return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY]
86
+
87
+ @property
88
+ def display_url(self) -> str:
89
+ """Returns the URL in display format."""
90
+ return self._url_display
91
+
92
+
93
+ def _legacy_to_cache_mode(
94
+ disable_cache: bool = False,
95
+ bypass_cache: bool = False,
96
+ no_cache_read: bool = False,
97
+ no_cache_write: bool = False
98
+ ) -> CacheMode:
99
+ """
100
+ Converts legacy cache parameters to the new CacheMode enum.
101
+
102
+ This is an internal function to help transition from the old boolean flags
103
+ to the new CacheMode system.
104
+ """
105
+ if disable_cache:
106
+ return CacheMode.DISABLED
107
+ if bypass_cache:
108
+ return CacheMode.BYPASS
109
+ if no_cache_read and no_cache_write:
110
+ return CacheMode.DISABLED
111
+ if no_cache_read:
112
+ return CacheMode.WRITE_ONLY
113
+ if no_cache_write:
114
+ return CacheMode.READ_ONLY
115
+ return CacheMode.ENABLED
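A short sketch of how the cache decisions above compose; the URLs are illustrative only.

from crawl4ai.cache_context import CacheContext, CacheMode, _legacy_to_cache_mode

# Legacy boolean flags map onto the enum: no_cache_write alone means read-only caching
mode = _legacy_to_cache_mode(no_cache_write=True)
print(mode)  # CacheMode.READ_ONLY

ctx = CacheContext("https://example.com", mode)
print(ctx.should_read())   # True  - READ_ONLY still reads from the cache
print(ctx.should_write())  # False - but never writes

# Raw HTML inputs are never cacheable, regardless of mode
raw_ctx = CacheContext("raw:<p>hello</p>", CacheMode.ENABLED)
print(raw_ctx.is_cacheable, raw_ctx.display_url)  # False "Raw HTML"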
crawl4ai/chunking_strategy.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ import re
3
+ from collections import Counter
4
+ import string
5
+ from .model_loader import load_nltk_punkt
6
+ from .utils import *
7
+
8
+ # Define the abstract base class for chunking strategies
9
+ class ChunkingStrategy(ABC):
10
+ """
11
+ Abstract base class for chunking strategies.
12
+ """
13
+
14
+ @abstractmethod
15
+ def chunk(self, text: str) -> list:
16
+ """
17
+ Abstract method to chunk the given text.
18
+
19
+ Args:
20
+ text (str): The text to chunk.
21
+
22
+ Returns:
23
+ list: A list of chunks.
24
+ """
25
+ pass
26
+
27
+ # Create an identity chunking strategy f(x) = [x]
28
+ class IdentityChunking(ChunkingStrategy):
29
+ """
30
+ Chunking strategy that returns the input text as a single chunk.
31
+ """
32
+ def chunk(self, text: str) -> list:
33
+ return [text]
34
+
35
+ # Regex-based chunking
36
+ class RegexChunking(ChunkingStrategy):
37
+ """
38
+ Chunking strategy that splits text based on regular expression patterns.
39
+ """
40
+ def __init__(self, patterns=None, **kwargs):
41
+ """
42
+ Initialize the RegexChunking object.
43
+
44
+ Args:
45
+ patterns (list): A list of regular expression patterns to split text.
46
+ """
47
+ if patterns is None:
48
+ patterns = [r'\n\n'] # Default split pattern
49
+ self.patterns = patterns
50
+
51
+ def chunk(self, text: str) -> list:
52
+ paragraphs = [text]
53
+ for pattern in self.patterns:
54
+ new_paragraphs = []
55
+ for paragraph in paragraphs:
56
+ new_paragraphs.extend(re.split(pattern, paragraph))
57
+ paragraphs = new_paragraphs
58
+ return paragraphs
59
+
60
+ # NLP-based sentence chunking
61
+ class NlpSentenceChunking(ChunkingStrategy):
62
+ """
63
+ Chunking strategy that splits text into sentences using NLTK's sentence tokenizer.
64
+ """
65
+ def __init__(self, **kwargs):
66
+ """
67
+ Initialize the NlpSentenceChunking object.
68
+ """
69
+ load_nltk_punkt()
70
+
71
+
72
+ def chunk(self, text: str) -> list:
73
+ # Improved regex for sentence splitting
74
+ # sentence_endings = re.compile(
75
+ # r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z][A-Z]\.)(?<![A-Za-z]\.)(?<=\.|\?|\!|\n)\s'
76
+ # )
77
+ # sentences = sentence_endings.split(text)
78
+ # sens = [sent.strip() for sent in sentences if sent]
79
+ from nltk.tokenize import sent_tokenize
80
+ sentences = sent_tokenize(text)
81
+ sens = [sent.strip() for sent in sentences]
82
+
83
+ return list(set(sens))
84
+
85
+ # Topic-based segmentation using TextTiling
86
+ class TopicSegmentationChunking(ChunkingStrategy):
87
+ """
88
+ Chunking strategy that segments text into topics using NLTK's TextTilingTokenizer.
89
+
90
+ How it works:
91
+ 1. Segment the text into topics using TextTilingTokenizer
92
+ 2. Extract keywords for each topic segment
93
+ """
94
+
95
+ def __init__(self, num_keywords=3, **kwargs):
96
+ """
97
+ Initialize the TopicSegmentationChunking object.
98
+
99
+ Args:
100
+ num_keywords (int): The number of keywords to extract for each topic segment.
101
+ """
102
+ import nltk as nl
103
+ self.tokenizer = nl.tokenize.TextTilingTokenizer()
104
+ self.num_keywords = num_keywords
105
+
106
+ def chunk(self, text: str) -> list:
107
+ # Use the TextTilingTokenizer to segment the text
108
+ segmented_topics = self.tokenizer.tokenize(text)
109
+ return segmented_topics
110
+
111
+ def extract_keywords(self, text: str) -> list:
112
+ # Tokenize and remove stopwords and punctuation
113
+ import nltk as nl
114
+ tokens = nl.tokenize.word_tokenize(text)
115
+ tokens = [token.lower() for token in tokens if token not in nl.corpus.stopwords.words('english') and token not in string.punctuation]
116
+
117
+ # Calculate frequency distribution
118
+ freq_dist = Counter(tokens)
119
+ keywords = [word for word, freq in freq_dist.most_common(self.num_keywords)]
120
+ return keywords
121
+
122
+ def chunk_with_topics(self, text: str) -> list:
123
+ # Segment the text into topics
124
+ segments = self.chunk(text)
125
+ # Extract keywords for each topic segment
126
+ segments_with_topics = [(segment, self.extract_keywords(segment)) for segment in segments]
127
+ return segments_with_topics
128
+
129
+ # Fixed-length word chunks
130
+ class FixedLengthWordChunking(ChunkingStrategy):
131
+ """
132
+ Chunking strategy that splits text into fixed-length word chunks.
133
+
134
+ How it works:
135
+ 1. Split the text into words
136
+ 2. Create chunks of fixed length
137
+ 3. Return the list of chunks
138
+ """
139
+ def __init__(self, chunk_size=100, **kwargs):
140
+ """
141
+ Initialize the fixed-length word chunking strategy with the given chunk size.
142
+
143
+ Args:
144
+ chunk_size (int): The size of each chunk in words.
145
+ """
146
+ self.chunk_size = chunk_size
147
+
148
+ def chunk(self, text: str) -> list:
149
+ words = text.split()
150
+ return [' '.join(words[i:i + self.chunk_size]) for i in range(0, len(words), self.chunk_size)]
151
+
152
+ # Sliding window chunking
153
+ class SlidingWindowChunking(ChunkingStrategy):
154
+ """
155
+ Chunking strategy that splits text into overlapping word chunks.
156
+
157
+ How it works:
158
+ 1. Split the text into words
159
+ 2. Slide a window of window_size words forward by step words
160
+ 3. Return the list of chunks
161
+ """
162
+ def __init__(self, window_size=100, step=50, **kwargs):
163
+ """
164
+ Initialize the sliding window chunking strategy with the given window size and
165
+ step size.
166
+
167
+ Args:
168
+ window_size (int): The size of the sliding window in words.
169
+ step (int): The step size for sliding the window in words.
170
+ """
171
+ self.window_size = window_size
172
+ self.step = step
173
+
174
+ def chunk(self, text: str) -> list:
175
+ words = text.split()
176
+ chunks = []
177
+
178
+ if len(words) <= self.window_size:
179
+ return [text]
180
+
181
+ for i in range(0, len(words) - self.window_size + 1, self.step):
182
+ chunk = ' '.join(words[i:i + self.window_size])
183
+ chunks.append(chunk)
184
+
185
+ # Handle the last chunk if it doesn't align perfectly
186
+ if i + self.window_size < len(words):
187
+ chunks.append(' '.join(words[-self.window_size:]))
188
+
189
+ return chunks
190
+
191
+ class OverlappingWindowChunking(ChunkingStrategy):
192
+ """
193
+ Chunking strategy that splits text into overlapping word chunks.
194
+
195
+ How it works:
196
+ 1. Split the text into words using whitespace
197
+ 2. Create chunks of fixed length equal to the window size
198
+ 3. Slide the window by the overlap size
199
+ 4. Return the list of chunks
200
+ """
201
+ def __init__(self, window_size=1000, overlap=100, **kwargs):
202
+ """
203
+ Initialize the overlapping window chunking strategy with the given window size and
204
+ overlap size.
205
+
206
+ Args:
207
+ window_size (int): The size of the window in words.
208
+ overlap (int): The size of the overlap between consecutive chunks in words.
209
+ """
210
+ self.window_size = window_size
211
+ self.overlap = overlap
212
+
213
+ def chunk(self, text: str) -> list:
214
+ words = text.split()
215
+ chunks = []
216
+
217
+ if len(words) <= self.window_size:
218
+ return [text]
219
+
220
+ start = 0
221
+ while start < len(words):
222
+ end = start + self.window_size
223
+ chunk = ' '.join(words[start:end])
224
+ chunks.append(chunk)
225
+
226
+ if end >= len(words):
227
+ break
228
+
229
+ start = end - self.overlap
230
+
231
+ return chunks
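A small sketch exercising two of the strategies above on throwaway text; the sample string and the window sizes are arbitrary.

from crawl4ai.chunking_strategy import RegexChunking, OverlappingWindowChunking

text = (
    "Crawlers fetch pages and turn them into text.\n\n"
    "Chunkers then split that text into pieces small enough for an LLM to digest."
)

# Default RegexChunking splits on blank lines
print(RegexChunking().chunk(text))

# Overlapping windows of 10 words that share 3 words with the previous chunk
chunker = OverlappingWindowChunking(window_size=10, overlap=3)
for chunk in chunker.chunk(text):
    print(len(chunk.split()), chunk)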
crawl4ai/cli.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import click
2
+ import sys
3
+ import asyncio
4
+ from typing import List
5
+ from .docs_manager import DocsManager
6
+ from .async_logger import AsyncLogger
7
+
8
+ logger = AsyncLogger(verbose=True)
9
+ docs_manager = DocsManager(logger)
10
+
11
+ def print_table(headers: List[str], rows: List[List[str]], padding: int = 2):
12
+ """Print formatted table with headers and rows"""
13
+ widths = [max(len(str(cell)) for cell in col) for col in zip(headers, *rows)]
14
+ border = '+' + '+'.join('-' * (w + 2 * padding) for w in widths) + '+'
15
+
16
+ def format_row(row):
17
+ return '|' + '|'.join(f"{' ' * padding}{str(cell):<{w}}{' ' * padding}"
18
+ for cell, w in zip(row, widths)) + '|'
19
+
20
+ click.echo(border)
21
+ click.echo(format_row(headers))
22
+ click.echo(border)
23
+ for row in rows:
24
+ click.echo(format_row(row))
25
+ click.echo(border)
26
+
27
+ @click.group()
28
+ def cli():
29
+ """Crawl4AI Command Line Interface"""
30
+ pass
31
+
32
+ @cli.group()
33
+ def docs():
34
+ """Documentation operations"""
35
+ pass
36
+
37
+ @docs.command()
38
+ @click.argument('sections', nargs=-1)
39
+ @click.option('--mode', type=click.Choice(['extended', 'condensed']), default='extended')
40
+ def combine(sections: tuple, mode: str):
41
+ """Combine documentation sections"""
42
+ try:
43
+ asyncio.run(docs_manager.ensure_docs_exist())
44
+ click.echo(docs_manager.generate(sections, mode))
45
+ except Exception as e:
46
+ logger.error(str(e), tag="ERROR")
47
+ sys.exit(1)
48
+
49
+ @docs.command()
50
+ @click.argument('query')
51
+ @click.option('--top-k', '-k', default=5)
52
+ @click.option('--build-index', is_flag=True, help='Build index if missing')
53
+ def search(query: str, top_k: int, build_index: bool):
54
+ """Search documentation"""
55
+ try:
56
+ result = docs_manager.search(query, top_k)
57
+ if result == "No search index available. Call build_search_index() first.":
58
+ if build_index or click.confirm('No search index found. Build it now?'):
59
+ asyncio.run(docs_manager.llm_text.generate_index_files())
60
+ result = docs_manager.search(query, top_k)
61
+ click.echo(result)
62
+ except Exception as e:
63
+ click.echo(f"Error: {str(e)}", err=True)
64
+ sys.exit(1)
65
+
66
+ @docs.command()
67
+ def update():
68
+ """Update docs from GitHub"""
69
+ try:
70
+ asyncio.run(docs_manager.fetch_docs())
71
+ click.echo("Documentation updated successfully")
72
+ except Exception as e:
73
+ click.echo(f"Error: {str(e)}", err=True)
74
+ sys.exit(1)
75
+
76
+ @docs.command()
77
+ @click.option('--force-facts', is_flag=True, help='Force regenerate fact files')
78
+ @click.option('--clear-cache', is_flag=True, help='Clear BM25 cache')
79
+ def index(force_facts: bool, clear_cache: bool):
80
+ """Build or rebuild search indexes"""
81
+ try:
82
+ asyncio.run(docs_manager.ensure_docs_exist())
83
+ asyncio.run(docs_manager.llm_text.generate_index_files(
84
+ force_generate_facts=force_facts,
85
+ clear_bm25_cache=clear_cache
86
+ ))
87
+ click.echo("Search indexes built successfully")
88
+ except Exception as e:
89
+ click.echo(f"Error: {str(e)}", err=True)
90
+ sys.exit(1)
91
+
92
+ # Add docs list command
93
+ @docs.command()
94
+ def list():
95
+ """List available documentation sections"""
96
+ try:
97
+ sections = docs_manager.list()
98
+ print_table(["Sections"], [[section] for section in sections])
99
+
100
+ except Exception as e:
101
+ click.echo(f"Error: {str(e)}", err=True)
102
+ sys.exit(1)
103
+
104
+ if __name__ == '__main__':
105
+ cli()
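The CLI above is a standard click group, so it can be exercised programmatically with click's test runner. This is only a sketch: it assumes the documentation files have already been fetched locally, since importing crawl4ai.cli constructs a DocsManager at import time.

from click.testing import CliRunner
from crawl4ai.cli import cli

runner = CliRunner()
# Equivalent to invoking the `docs list` subcommand from the shell entry point
result = runner.invoke(cli, ["docs", "list"])
print(result.exit_code)
print(result.output)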
crawl4ai/config.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+
4
+ load_dotenv() # Load environment variables from .env file
5
+
6
+ # Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
7
+ DEFAULT_PROVIDER = "openai/gpt-4o-mini"
8
+ MODEL_REPO_BRANCH = "new-release-0.0.2"
9
+ # Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
10
+ PROVIDER_MODELS = {
11
+ "ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token
12
+ "groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"),
13
+ "groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"),
14
+ "openai/gpt-4o-mini": os.getenv("OPENAI_API_KEY"),
15
+ "openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
16
+ "openai/o1-mini": os.getenv("OPENAI_API_KEY"),
17
+ "openai/o1-preview": os.getenv("OPENAI_API_KEY"),
18
+ "anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
19
+ "anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
20
+ "anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
21
+ "anthropic/claude-3-5-sonnet-20240620": os.getenv("ANTHROPIC_API_KEY"),
22
+ }
23
+
24
+ # Chunk token threshold
25
+ CHUNK_TOKEN_THRESHOLD = 2 ** 11 # 2048 tokens
26
+ OVERLAP_RATE = 0.1
27
+ WORD_TOKEN_RATE = 1.3
28
+
29
+ # Threshold for the minimum number of words in an HTML tag for it to be considered
30
+ MIN_WORD_THRESHOLD = 1
31
+ IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1
32
+
33
+ IMPORTANT_ATTRS = ['src', 'href', 'alt', 'title', 'width', 'height']
34
+ ONLY_TEXT_ELIGIBLE_TAGS = ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']
35
+ SOCIAL_MEDIA_DOMAINS = [
36
+ 'facebook.com',
37
+ 'twitter.com',
38
+ 'x.com',
39
+ 'linkedin.com',
40
+ 'instagram.com',
41
+ 'pinterest.com',
42
+ 'tiktok.com',
43
+ 'snapchat.com',
44
+ 'reddit.com',
45
+ ]
46
+
47
+ # Threshold for image extraction - range is 1 to 6.
48
+ # Images are scored with a point-based system to filter by usefulness; points are assigned
49
+ # to each image based on the following aspects:
50
+ # If either height or width exceeds 150px
51
+ # If image size is greater than 10Kb
52
+ # If alt property is set
53
+ # If image format is in jpg, png or webp
54
+ # If image is in the first half of the total images extracted from the page
55
+ IMAGE_SCORE_THRESHOLD = 2
56
+
57
+ MAX_METRICS_HISTORY = 1000
58
+
59
+ NEED_MIGRATION = True
60
+ URL_LOG_SHORTEN_LENGTH = 30
61
+ SHOW_DEPRECATION_WARNINGS = True
62
+ SCREENSHOT_HEIGHT_TRESHOLD = 10000
63
+ PAGE_TIMEOUT=60000
64
+ DOWNLOAD_PAGE_TIMEOUT=60000
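A quick illustration of how the chunking constants above relate to each other. The arithmetic is only an example of their intended meaning; how the extraction strategy actually combines them is not shown in this file.

from crawl4ai import config

# 2048-token chunks with a 10% overlap
overlap_tokens = int(config.CHUNK_TOKEN_THRESHOLD * config.OVERLAP_RATE)
# Rough word budget per chunk, assuming ~1.3 tokens per word
words_per_chunk = int(config.CHUNK_TOKEN_THRESHOLD / config.WORD_TOKEN_RATE)
print(overlap_tokens, words_per_chunk)  # 204 1575

# Provider keys are resolved from the .env file at import time and may be None
print(config.PROVIDER_MODELS["openai/gpt-4o-mini"] is not None)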
crawl4ai/content_filter_strategy.py ADDED
@@ -0,0 +1,627 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from bs4 import BeautifulSoup, Tag
3
+ from typing import List, Tuple, Dict
4
+ from rank_bm25 import BM25Okapi
5
+ from time import perf_counter
6
+ from collections import deque
7
+ from bs4 import BeautifulSoup, NavigableString, Tag, Comment
8
+ from .utils import clean_tokens
9
+ from abc import ABC, abstractmethod
10
+ import math
11
+ from snowballstemmer import stemmer
12
+ class RelevantContentFilter(ABC):
13
+ """Abstract base class for content filtering strategies"""
14
+ def __init__(self, user_query: str = None):
15
+ self.user_query = user_query
16
+ self.included_tags = {
17
+ # Primary structure
18
+ 'article', 'main', 'section', 'div',
19
+ # List structures
20
+ 'ul', 'ol', 'li', 'dl', 'dt', 'dd',
21
+ # Text content
22
+ 'p', 'span', 'blockquote', 'pre', 'code',
23
+ # Headers
24
+ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
25
+ # Tables
26
+ 'table', 'thead', 'tbody', 'tr', 'td', 'th',
27
+ # Other semantic elements
28
+ 'figure', 'figcaption', 'details', 'summary',
29
+ # Text formatting
30
+ 'em', 'strong', 'b', 'i', 'mark', 'small',
31
+ # Rich content
32
+ 'time', 'address', 'cite', 'q'
33
+ }
34
+ self.excluded_tags = {
35
+ 'nav', 'footer', 'header', 'aside', 'script',
36
+ 'style', 'form', 'iframe', 'noscript'
37
+ }
38
+ self.header_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
39
+ self.negative_patterns = re.compile(
40
+ r'nav|footer|header|sidebar|ads|comment|promo|advert|social|share',
41
+ re.I
42
+ )
43
+ self.min_word_count = 2
44
+
45
+ @abstractmethod
46
+ def filter_content(self, html: str) -> List[str]:
47
+ """Abstract method to be implemented by specific filtering strategies"""
48
+ pass
49
+
50
+ def extract_page_query(self, soup: BeautifulSoup, body: Tag) -> str:
51
+ """Common method to extract page metadata with fallbacks"""
52
+ if self.user_query:
53
+ return self.user_query
54
+
55
+ query_parts = []
56
+
57
+ # Title
58
+ try:
59
+ title = soup.title.string
60
+ if title:
61
+ query_parts.append(title)
62
+ except Exception:
63
+ pass
64
+
65
+ if soup.find('h1'):
66
+ query_parts.append(soup.find('h1').get_text())
67
+
68
+ # Meta tags
69
+ temp = ""
70
+ for meta_name in ['keywords', 'description']:
71
+ meta = soup.find('meta', attrs={'name': meta_name})
72
+ if meta and meta.get('content'):
73
+ query_parts.append(meta['content'])
74
+ temp += meta['content']
75
+
76
+ # If still empty, grab first significant paragraph
77
+ if not temp:
78
+ # Find the first <p> tag whose text contains more than 150 characters
79
+ for p in body.find_all('p'):
80
+ if len(p.get_text()) > 150:
81
+ query_parts.append(p.get_text()[:150])
82
+ break
83
+
84
+ return ' '.join(filter(None, query_parts))
85
+
86
+ def extract_text_chunks(self, body: Tag, min_word_threshold: int = None) -> List[Tuple[str, str]]:
87
+ """
88
+ Extracts text chunks from a BeautifulSoup body element while preserving order.
89
+ Returns list of tuples (text, tag_name) for classification.
90
+
91
+ Args:
92
+ body: BeautifulSoup Tag object representing the body element
93
+
94
+ Returns:
95
+ List of (text, tag_name) tuples
96
+ """
97
+ # Tags to ignore - inline elements that shouldn't break text flow
98
+ INLINE_TAGS = {
99
+ 'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'br', 'button', 'cite', 'code',
100
+ 'dfn', 'em', 'i', 'img', 'input', 'kbd', 'label', 'map', 'object', 'q',
101
+ 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup',
102
+ 'textarea', 'time', 'tt', 'var'
103
+ }
104
+
105
+ # Tags that typically contain meaningful headers
106
+ HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header'}
107
+
108
+ chunks = []
109
+ current_text = []
110
+ chunk_index = 0
111
+
112
+ def should_break_chunk(tag: Tag) -> bool:
113
+ """Determine if a tag should cause a break in the current text chunk"""
114
+ return (
115
+ tag.name not in INLINE_TAGS
116
+ and not (tag.name == 'p' and len(current_text) == 0)
117
+ )
118
+
119
+ # Use deque for efficient push/pop operations
120
+ stack = deque([(body, False)])
121
+
122
+ while stack:
123
+ element, visited = stack.pop()
124
+
125
+ if visited:
126
+ # End of block element - flush accumulated text
127
+ if current_text and should_break_chunk(element):
128
+ text = ' '.join(''.join(current_text).split())
129
+ if text:
130
+ tag_type = 'header' if element.name in HEADER_TAGS else 'content'
131
+ chunks.append((chunk_index, text, tag_type, element))
132
+ chunk_index += 1
133
+ current_text = []
134
+ continue
135
+
136
+ if isinstance(element, NavigableString):
137
+ if str(element).strip():
138
+ current_text.append(str(element).strip())
139
+ continue
140
+
141
+ # Pre-allocate children to avoid multiple list operations
142
+ children = list(element.children)
143
+ if not children:
144
+ continue
145
+
146
+ # Mark block for revisit after processing children
147
+ stack.append((element, True))
148
+
149
+ # Add children in reverse order for correct processing
150
+ for child in reversed(children):
151
+ if isinstance(child, (Tag, NavigableString)):
152
+ stack.append((child, False))
153
+
154
+ # Handle any remaining text
155
+ if current_text:
156
+ text = ' '.join(''.join(current_text).split())
157
+ if text:
158
+ chunks.append((chunk_index, text, 'content', body))
159
+
160
+ if min_word_threshold:
161
+ chunks = [chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold]
162
+
163
+ return chunks
164
+
165
+ def _deprecated_extract_text_chunks(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]:
166
+ """Common method for extracting text chunks"""
167
+ _text_cache = {}
168
+ def fast_text(element: Tag) -> str:
169
+ elem_id = id(element)
170
+ if elem_id in _text_cache:
171
+ return _text_cache[elem_id]
172
+ texts = []
173
+ for content in element.contents:
174
+ if isinstance(content, str):
175
+ text = content.strip()
176
+ if text:
177
+ texts.append(text)
178
+ result = ' '.join(texts)
179
+ _text_cache[elem_id] = result
180
+ return result
181
+
182
+ candidates = []
183
+ index = 0
184
+
185
+ def dfs(element):
186
+ nonlocal index
187
+ if isinstance(element, Tag):
188
+ if element.name in self.included_tags:
189
+ if not self.is_excluded(element):
190
+ text = fast_text(element)
191
+ word_count = len(text.split())
192
+
193
+ # Headers pass through with adjusted minimum
194
+ if element.name in self.header_tags:
195
+ if word_count >= 3: # Minimal sanity check for headers
196
+ candidates.append((index, text, element))
197
+ index += 1
198
+ # Regular content uses standard minimum
199
+ elif word_count >= self.min_word_count:
200
+ candidates.append((index, text, element))
201
+ index += 1
202
+
203
+ for child in element.children:
204
+ dfs(child)
205
+
206
+ dfs(soup.body if soup.body else soup)
207
+ return candidates
208
+
209
+ def is_excluded(self, tag: Tag) -> bool:
210
+ """Common method for exclusion logic"""
211
+ if tag.name in self.excluded_tags:
212
+ return True
213
+ class_id = ' '.join(filter(None, [
214
+ ' '.join(tag.get('class', [])),
215
+ tag.get('id', '')
216
+ ]))
217
+ return bool(self.negative_patterns.search(class_id))
218
+
219
+ def clean_element(self, tag: Tag) -> str:
220
+ """Common method for cleaning HTML elements with minimal overhead"""
221
+ if not tag or not isinstance(tag, Tag):
222
+ return ""
223
+
224
+ unwanted_tags = {'script', 'style', 'aside', 'form', 'iframe', 'noscript'}
225
+ unwanted_attrs = {'style', 'onclick', 'onmouseover', 'align', 'bgcolor', 'class', 'id'}
226
+
227
+ # Use string builder pattern for better performance
228
+ builder = []
229
+
230
+ def render_tag(elem):
231
+ if not isinstance(elem, Tag):
232
+ if isinstance(elem, str):
233
+ builder.append(elem.strip())
234
+ return
235
+
236
+ if elem.name in unwanted_tags:
237
+ return
238
+
239
+ # Start tag
240
+ builder.append(f'<{elem.name}')
241
+
242
+ # Add cleaned attributes
243
+ attrs = {k: v for k, v in elem.attrs.items() if k not in unwanted_attrs}
244
+ for key, value in attrs.items():
245
+ builder.append(f' {key}="{value}"')
246
+
247
+ builder.append('>')
248
+
249
+ # Process children
250
+ for child in elem.children:
251
+ render_tag(child)
252
+
253
+ # Close tag
254
+ builder.append(f'</{elem.name}>')
255
+
256
+ try:
257
+ render_tag(tag)
258
+ return ''.join(builder)
259
+ except Exception:
260
+ return str(tag) # Fallback to original if anything fails
261
+
262
+ class BM25ContentFilter(RelevantContentFilter):
263
+ """
264
+ Content filtering using BM25 algorithm with priority tag handling.
265
+
266
+ How it works:
267
+ 1. Extracts page metadata with fallbacks.
268
+ 2. Extracts text chunks from the body element.
269
+ 3. Tokenizes the corpus and query.
270
+ 4. Applies BM25 algorithm to calculate scores for each chunk.
271
+ 5. Filters out chunks below the threshold.
272
+ 6. Sorts chunks by score in descending order.
273
+ 7. Returns the top N chunks.
274
+
275
+ Attributes:
276
+ user_query (str): User query for filtering (optional).
277
+ bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
278
+ language (str): Language for stemming (default: 'english').
279
+
280
+ Methods:
281
+ filter_content(self, html: str, min_word_threshold: int = None)
282
+ """
283
+ def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'):
284
+ """
285
+ Initializes the BM25ContentFilter class, if not provided, falls back to page metadata.
286
+
287
+ Note:
288
+ If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph.
289
+
290
+ Args:
291
+ user_query (str): User query for filtering (optional).
292
+ bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
293
+ language (str): Language for stemming (default: 'english').
294
+ """
295
+ super().__init__(user_query=user_query)
296
+ self.bm25_threshold = bm25_threshold
297
+ self.priority_tags = {
298
+ 'h1': 5.0,
299
+ 'h2': 4.0,
300
+ 'h3': 3.0,
301
+ 'title': 4.0,
302
+ 'strong': 2.0,
303
+ 'b': 1.5,
304
+ 'em': 1.5,
305
+ 'blockquote': 2.0,
306
+ 'code': 2.0,
307
+ 'pre': 1.5,
308
+ 'th': 1.5, # Table headers
309
+ }
310
+ self.stemmer = stemmer(language)
311
+
312
+ def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
313
+ """
314
+ Implements content filtering using BM25 algorithm with priority tag handling.
315
+
316
+ Note:
317
+ This method implements the filtering logic for the BM25ContentFilter class.
318
+ It takes HTML content as input and returns a list of filtered text chunks.
319
+
320
+ Args:
321
+ html (str): HTML content to be filtered.
322
+ min_word_threshold (int): Minimum word threshold for filtering (optional).
323
+
324
+ Returns:
325
+ List[str]: List of filtered text chunks.
326
+ """
327
+ if not html or not isinstance(html, str):
328
+ return []
329
+
330
+ soup = BeautifulSoup(html, 'lxml')
331
+
332
+ # Check if body is present
333
+ if not soup.body:
334
+ # Wrap in body tag if missing
335
+ soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')
336
+ body = soup.find('body')
337
+
338
+ query = self.extract_page_query(soup, body)
339
+
340
+ if not query:
341
+ return []
342
+ # return [self.clean_element(soup)]
343
+
344
+ candidates = self.extract_text_chunks(body, min_word_threshold)
345
+
346
+ if not candidates:
347
+ return []
348
+
349
+ # Tokenize corpus
350
+ # tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in candidates]
351
+ # tokenized_query = query.lower().split()
352
+
353
+ # tokenized_corpus = [[ps.stem(word) for word in chunk.lower().split()]
354
+ # for _, chunk, _, _ in candidates]
355
+ # tokenized_query = [ps.stem(word) for word in query.lower().split()]
356
+
357
+ tokenized_corpus = [[self.stemmer.stemWord(word) for word in chunk.lower().split()]
358
+ for _, chunk, _, _ in candidates]
359
+ tokenized_query = [self.stemmer.stemWord(word) for word in query.lower().split()]
360
+
361
+ # tokenized_corpus = [[self.stemmer.stemWord(word) for word in tokenize_text(chunk.lower())]
362
+ # for _, chunk, _, _ in candidates]
363
+ # tokenized_query = [self.stemmer.stemWord(word) for word in tokenize_text(query.lower())]
364
+
365
+ # Clean from stop words and noise
366
+ tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus]
367
+ tokenized_query = clean_tokens(tokenized_query)
368
+
369
+ bm25 = BM25Okapi(tokenized_corpus)
370
+ scores = bm25.get_scores(tokenized_query)
371
+
372
+ # Adjust scores with tag weights
373
+ adjusted_candidates = []
374
+ for score, (index, chunk, tag_type, tag) in zip(scores, candidates):
375
+ tag_weight = self.priority_tags.get(tag.name, 1.0)
376
+ adjusted_score = score * tag_weight
377
+ adjusted_candidates.append((adjusted_score, index, chunk, tag))
378
+
379
+ # Filter candidates by threshold
380
+ selected_candidates = [
381
+ (index, chunk, tag) for adjusted_score, index, chunk, tag in adjusted_candidates
382
+ if adjusted_score >= self.bm25_threshold
383
+ ]
384
+
385
+ if not selected_candidates:
386
+ return []
387
+
388
+ # Sort selected candidates by original document order
389
+ selected_candidates.sort(key=lambda x: x[0])
390
+
391
+ return [self.clean_element(tag) for _, _, tag in selected_candidates]
392
+
393
+ class PruningContentFilter(RelevantContentFilter):
394
+ """
395
+ Content filtering using pruning algorithm with dynamic threshold.
396
+
397
+ How it works:
398
+ 1. Extracts page metadata with fallbacks.
399
+ 2. Extracts text chunks from the body element.
400
+ 3. Applies pruning algorithm to calculate scores for each chunk.
401
+ 4. Filters out chunks below the threshold.
402
+ 5. Sorts chunks by score in descending order.
403
+ 6. Returns the top N chunks.
404
+
405
+ Attributes:
406
+ user_query (str): User query for filtering (optional), if not provided, falls back to page metadata.
407
+ min_word_threshold (int): Minimum word threshold for filtering (optional).
408
+ threshold_type (str): Threshold type for dynamic threshold (default: 'fixed').
409
+ threshold (float): Fixed threshold value (default: 0.48).
410
+
411
+ Methods:
412
+ filter_content(self, html: str, min_word_threshold: int = None):
413
+ """
414
+ def __init__(self, user_query: str = None, min_word_threshold: int = None,
415
+ threshold_type: str = 'fixed', threshold: float = 0.48):
416
+ """
417
+ Initializes the PruningContentFilter class, if not provided, falls back to page metadata.
418
+
419
+ Note:
420
+ If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph.
421
+
422
+ Args:
423
+ user_query (str): User query for filtering (optional).
424
+ min_word_threshold (int): Minimum word threshold for filtering (optional).
425
+ threshold_type (str): Threshold type for dynamic threshold (default: 'fixed').
426
+ threshold (float): Fixed threshold value (default: 0.48).
427
+ """
428
+ super().__init__(None)
429
+ self.min_word_threshold = min_word_threshold
430
+ self.threshold_type = threshold_type
431
+ self.threshold = threshold
432
+
433
+ # Add tag importance for dynamic threshold
434
+ self.tag_importance = {
435
+ 'article': 1.5,
436
+ 'main': 1.4,
437
+ 'section': 1.3,
438
+ 'p': 1.2,
439
+ 'h1': 1.4,
440
+ 'h2': 1.3,
441
+ 'h3': 1.2,
442
+ 'div': 0.7,
443
+ 'span': 0.6
444
+ }
445
+
446
+ # Metric configuration
447
+ self.metric_config = {
448
+ 'text_density': True,
449
+ 'link_density': True,
450
+ 'tag_weight': True,
451
+ 'class_id_weight': True,
452
+ 'text_length': True,
453
+ }
454
+
455
+ self.metric_weights = {
456
+ 'text_density': 0.4,
457
+ 'link_density': 0.2,
458
+ 'tag_weight': 0.2,
459
+ 'class_id_weight': 0.1,
460
+ 'text_length': 0.1,
461
+ }
462
+
463
+ self.tag_weights = {
464
+ 'div': 0.5,
465
+ 'p': 1.0,
466
+ 'article': 1.5,
467
+ 'section': 1.0,
468
+ 'span': 0.3,
469
+ 'li': 0.5,
470
+ 'ul': 0.5,
471
+ 'ol': 0.5,
472
+ 'h1': 1.2,
473
+ 'h2': 1.1,
474
+ 'h3': 1.0,
475
+ 'h4': 0.9,
476
+ 'h5': 0.8,
477
+ 'h6': 0.7,
478
+ }
479
+
480
+ def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
481
+ """
482
+ Implements content filtering using pruning algorithm with dynamic threshold.
483
+
484
+ Note:
485
+ This method implements the filtering logic for the PruningContentFilter class.
486
+ It takes HTML content as input and returns a list of filtered text chunks.
487
+
488
+ Args:
489
+ html (str): HTML content to be filtered.
490
+ min_word_threshold (int): Minimum word threshold for filtering (optional).
491
+
492
+ Returns:
493
+ List[str]: List of filtered text chunks.
494
+ """
495
+ if not html or not isinstance(html, str):
496
+ return []
497
+
498
+ soup = BeautifulSoup(html, 'lxml')
499
+ if not soup.body:
500
+ soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')
501
+
502
+ # Remove comments and unwanted tags
503
+ self._remove_comments(soup)
504
+ self._remove_unwanted_tags(soup)
505
+
506
+ # Prune tree starting from body
507
+ body = soup.find('body')
508
+ self._prune_tree(body)
509
+
510
+ # Extract remaining content as list of HTML strings
511
+ content_blocks = []
512
+ for element in body.children:
513
+ if isinstance(element, str) or not hasattr(element, 'name'):
514
+ continue
515
+ if len(element.get_text(strip=True)) > 0:
516
+ content_blocks.append(str(element))
517
+
518
+ return content_blocks
519
+
520
+ def _remove_comments(self, soup):
521
+ """Removes HTML comments"""
522
+ for element in soup(text=lambda text: isinstance(text, Comment)):
523
+ element.extract()
524
+
525
+ def _remove_unwanted_tags(self, soup):
526
+ """Removes unwanted tags"""
527
+ for tag in self.excluded_tags:
528
+ for element in soup.find_all(tag):
529
+ element.decompose()
530
+
531
+ def _prune_tree(self, node):
532
+ """
533
+ Prunes the tree starting from the given node.
534
+
535
+ Args:
536
+ node (Tag): The node from which the pruning starts.
537
+ """
538
+ if not node or not hasattr(node, 'name') or node.name is None:
539
+ return
540
+
541
+ text_len = len(node.get_text(strip=True))
542
+ tag_len = len(node.encode_contents().decode('utf-8'))
543
+ link_text_len = sum(len(s.strip()) for s in (a.string for a in node.find_all('a', recursive=False)) if s)
544
+
545
+ metrics = {
546
+ 'node': node,
547
+ 'tag_name': node.name,
548
+ 'text_len': text_len,
549
+ 'tag_len': tag_len,
550
+ 'link_text_len': link_text_len
551
+ }
552
+
553
+ score = self._compute_composite_score(metrics, text_len, tag_len, link_text_len)
554
+
555
+ if self.threshold_type == 'fixed':
556
+ should_remove = score < self.threshold
557
+ else: # dynamic
558
+ tag_importance = self.tag_importance.get(node.name, 0.7)
559
+ text_ratio = text_len / tag_len if tag_len > 0 else 0
560
+ link_ratio = link_text_len / text_len if text_len > 0 else 1
561
+
562
+ threshold = self.threshold # base threshold
563
+ if tag_importance > 1:
564
+ threshold *= 0.8
565
+ if text_ratio > 0.4:
566
+ threshold *= 0.9
567
+ if link_ratio > 0.6:
568
+ threshold *= 1.2
569
+
570
+ should_remove = score < threshold
571
+
572
+ if should_remove:
573
+ node.decompose()
574
+ else:
575
+ children = [child for child in node.children if hasattr(child, 'name')]
576
+ for child in children:
577
+ self._prune_tree(child)
578
+
579
+ def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len):
580
+ """Computes the composite score"""
581
+ if self.min_word_threshold:
582
+ # Get raw text from metrics node - avoid extra processing
583
+ text = metrics['node'].get_text(strip=True)
584
+ word_count = text.count(' ') + 1
585
+ if word_count < self.min_word_threshold:
586
+ return -1.0 # Guaranteed removal
587
+ score = 0.0
588
+ total_weight = 0.0
589
+
590
+ if self.metric_config['text_density']:
591
+ density = text_len / tag_len if tag_len > 0 else 0
592
+ score += self.metric_weights['text_density'] * density
593
+ total_weight += self.metric_weights['text_density']
594
+
595
+ if self.metric_config['link_density']:
596
+ density = 1 - (link_text_len / text_len if text_len > 0 else 0)
597
+ score += self.metric_weights['link_density'] * density
598
+ total_weight += self.metric_weights['link_density']
599
+
600
+ if self.metric_config['tag_weight']:
601
+ tag_score = self.tag_weights.get(metrics['tag_name'], 0.5)
602
+ score += self.metric_weights['tag_weight'] * tag_score
603
+ total_weight += self.metric_weights['tag_weight']
604
+
605
+ if self.metric_config['class_id_weight']:
606
+ class_score = self._compute_class_id_weight(metrics['node'])
607
+ score += self.metric_weights['class_id_weight'] * max(0, class_score)
608
+ total_weight += self.metric_weights['class_id_weight']
609
+
610
+ if self.metric_config['text_length']:
611
+ score += self.metric_weights['text_length'] * math.log(text_len + 1)
612
+ total_weight += self.metric_weights['text_length']
613
+
614
+ return score / total_weight if total_weight > 0 else 0
615
+
616
+ def _compute_class_id_weight(self, node):
617
+ """Computes the class ID weight"""
618
+ class_id_score = 0
619
+ if 'class' in node.attrs:
620
+ classes = ' '.join(node['class'])
621
+ if self.negative_patterns.match(classes):
622
+ class_id_score -= 0.5
623
+ if 'id' in node.attrs:
624
+ element_id = node['id']
625
+ if self.negative_patterns.match(element_id):
626
+ class_id_score -= 0.5
627
+ return class_id_score
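For orientation, here is a minimal, untested usage sketch of the pruning filter added above. The sample HTML and argument values are illustrative; only the constructor arguments and the filter_content signature visible in this diff are assumed, and BeautifulSoup with the lxml parser must be installed for the call inside filter_content.

from crawl4ai.content_filter_strategy import PruningContentFilter

sample_html = """
<body>
  <article>
    <h1>Release notes</h1>
    <p>A reasonably long paragraph of real content that should survive pruning
    because its text density is high and its link density is low.</p>
  </article>
  <div class="sidebar"><a href="/a">nav</a> <a href="/b">nav</a></div>
</body>
"""

# 'dynamic' adjusts the base threshold per node using tag importance,
# text ratio and link ratio, as implemented in _prune_tree above.
prune_filter = PruningContentFilter(threshold=0.48, threshold_type="dynamic", min_word_threshold=5)
blocks = prune_filter.filter_content(sample_html)  # list of surviving HTML fragments
for block in blocks:
    print(block)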
crawl4ai/content_scraping_strategy.py ADDED
@@ -0,0 +1,723 @@
1
+ import re # Point 1: Pre-Compile Regular Expressions
2
+ import time
3
+ from abc import ABC, abstractmethod
4
+ from typing import Dict, Any, Optional
5
+ from bs4 import BeautifulSoup
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ import asyncio, requests, re, os
8
+ from .config import *
9
+ from bs4 import element, NavigableString, Comment
10
+ from bs4 import PageElement, Tag
11
+ from urllib.parse import urljoin
12
+ from requests.exceptions import InvalidSchema
13
+ # from .content_cleaning_strategy import ContentCleaningStrategy
14
+ from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter
15
+ from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
16
+ from .models import MarkdownGenerationResult
17
+ from .utils import (
18
+ extract_metadata,
19
+ normalize_url,
20
+ is_external_url,
21
+ get_base_domain,
22
+ )
23
+
24
+
25
+ # Pre-compile regular expressions for Open Graph and Twitter metadata
26
+ OG_REGEX = re.compile(r'^og:')
27
+ TWITTER_REGEX = re.compile(r'^twitter:')
28
+ DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")
29
+
30
+ # Function to parse image height/width value and units
31
+ def parse_dimension(dimension):
32
+ if dimension:
33
+ # match = re.match(r"(\d+)(\D*)", dimension)
34
+ match = DIMENSION_REGEX.match(dimension)
35
+ if match:
36
+ number = int(match.group(1))
37
+ unit = match.group(2) or 'px' # Default unit is 'px' if not specified
38
+ return number, unit
39
+ return None, None
40
+
41
+ # Fetch image file metadata to extract size and extension
42
+ def fetch_image_file_size(img, base_url):
43
+ # If src is a relative path, construct the full URL; otherwise it may already be an absolute/CDN URL
44
+ img_url = urljoin(base_url,img.get('src'))
45
+ try:
46
+ response = requests.head(img_url)
47
+ if response.status_code == 200:
48
+ return response.headers.get('Content-Length',None)
49
+ else:
50
+ print(f"Failed to retrieve file size for {img_url}")
51
+ return None
52
+ except InvalidSchema:
53
+ # Non-HTTP sources (e.g. data: URIs) cannot be fetched with requests.head
54
+ return None
56
+
57
+ class ContentScrapingStrategy(ABC):
58
+ @abstractmethod
59
+ def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
60
+ pass
61
+
62
+ @abstractmethod
63
+ async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
64
+ pass
65
+
66
+ class WebScrapingStrategy(ContentScrapingStrategy):
67
+ """
68
+ Class for web content scraping. Perhaps the most important class.
69
+
70
+ How it works:
71
+ 1. Extract content from HTML using BeautifulSoup.
72
+ 2. Clean the extracted content using a content cleaning strategy.
73
+ 3. Filter the cleaned content using a content filtering strategy.
74
+ 4. Generate markdown content from the filtered content.
75
+ 5. Return the markdown content.
76
+ """
77
+
78
+ def __init__(self, logger=None):
79
+ self.logger = logger
80
+
81
+ def _log(self, level, message, tag="SCRAPE", **kwargs):
82
+ """Helper method to safely use logger."""
83
+ if self.logger:
84
+ log_method = getattr(self.logger, level)
85
+ log_method(message=message, tag=tag, **kwargs)
86
+
87
+ def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
88
+ """
89
+ Main entry point for content scraping.
90
+
91
+ Args:
92
+ url (str): The URL of the page to scrape.
93
+ html (str): The HTML content of the page.
94
+ **kwargs: Additional keyword arguments.
95
+
96
+ Returns:
97
+ Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
98
+
99
+ - 'markdown': The generated markdown content. Currently a str; it will soon become a MarkdownGenerationResult accessed via 'markdown.raw_markdown'.
100
+ - 'fit_markdown': The generated markdown with only relevant content kept. This key will soon be removed in favor of 'markdown.fit_markdown'.
101
+ - 'fit_html': The HTML with only relevant content kept. This key will soon be removed in favor of 'markdown.fit_html'.
102
+ - 'markdown_v2': A temporary key holding the filtered markdown; it will soon be removed and replaced by 'markdown'.
103
+ """
104
+ return self._scrap(url, html, is_async=False, **kwargs)
105
+
106
+ async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
107
+ """
108
+ Main entry point for asynchronous content scraping.
109
+
110
+ Args:
111
+ url (str): The URL of the page to scrape.
112
+ html (str): The HTML content of the page.
113
+ **kwargs: Additional keyword arguments.
114
+
115
+ Returns:
116
+ Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
117
+
118
+ - 'markdown': The generated markdown content. Currently a str; it will soon become a MarkdownGenerationResult accessed via 'markdown.raw_markdown'.
119
+ - 'fit_markdown': The generated markdown with only relevant content kept. This key will soon be removed in favor of 'markdown.fit_markdown'.
120
+ - 'fit_html': The HTML with only relevant content kept. This key will soon be removed in favor of 'markdown.fit_html'.
121
+ - 'markdown_v2': A temporary key holding the filtered markdown; it will soon be removed and replaced by 'markdown'.
122
+ """
123
+ return await asyncio.to_thread(self._scrap, url, html, **kwargs)
124
+
125
+ def flatten_nested_elements(self, node):
126
+ """
127
+ Flatten nested elements in an HTML tree.
128
+
129
+ Args:
130
+ node (Tag): The root node of the HTML tree.
131
+
132
+ Returns:
133
+ Tag: The flattened HTML tree.
134
+ """
135
+ if isinstance(node, NavigableString):
136
+ return node
137
+ if len(node.contents) == 1 and isinstance(node.contents[0], Tag) and node.contents[0].name == node.name:
138
+ return self.flatten_nested_elements(node.contents[0])
139
+ node.contents = [self.flatten_nested_elements(child) for child in node.contents]
140
+ return node
141
+
142
+ def find_closest_parent_with_useful_text(self, tag, **kwargs):
143
+ """
144
+ Find the closest parent with useful text.
145
+
146
+ Args:
147
+ tag (Tag): The starting tag to search from.
148
+ **kwargs: Additional keyword arguments.
149
+
150
+ Returns:
151
+ Tag: The closest parent with useful text, or None if not found.
152
+ """
153
+ image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
154
+ current_tag = tag
155
+ while current_tag:
156
+ current_tag = current_tag.parent
157
+ # Get the text content of the parent tag
158
+ if current_tag:
159
+ text_content = current_tag.get_text(separator=' ',strip=True)
160
+ # Check if the text content has at least word_count_threshold
161
+ if len(text_content.split()) >= image_description_min_word_threshold:
162
+ return text_content
163
+ return None
164
+
165
+ def remove_unwanted_attributes(self, element, important_attrs, keep_data_attributes=False):
166
+ """
167
+ Remove unwanted attributes from an HTML element.
168
+
169
+ Args:
170
+ element (Tag): The HTML element to remove attributes from.
171
+ important_attrs (list): List of important attributes to keep.
172
+ keep_data_attributes (bool): Whether to keep data attributes.
173
+
174
+ Returns:
175
+ None
176
+ """
177
+ attrs_to_remove = []
178
+ for attr in element.attrs:
179
+ if attr not in important_attrs:
180
+ if keep_data_attributes:
181
+ if not attr.startswith('data-'):
182
+ attrs_to_remove.append(attr)
183
+ else:
184
+ attrs_to_remove.append(attr)
185
+
186
+ for attr in attrs_to_remove:
187
+ del element[attr]
188
+
189
+ def process_image(self, img, url, index, total_images, **kwargs):
190
+ """
191
+ Process an image element.
192
+
193
+ How it works:
194
+ 1. Check whether the image is visibly displayed and not nested inside undesired HTML elements.
195
+ 2. Score the image for its usefulness.
196
+ 3. Fetch image file metadata to determine its size and extension.
197
+ 4. Generate a dictionary with the processed image information.
198
+ 5. Return the processed image information.
199
+
200
+ Args:
201
+ img (Tag): The image element to process.
202
+ url (str): The URL of the page containing the image.
203
+ index (int): The index of the image in the list of images.
204
+ total_images (int): The total number of images in the list.
205
+ **kwargs: Additional keyword arguments.
206
+
207
+ Returns:
208
+ dict: A dictionary containing the processed image information.
209
+ """
210
+ parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w')
211
+ if ' ' in u else None}
212
+ for u in [f"http{p}" for p in s.split("http") if p]]
213
+
214
+ # Constants for checks
215
+ classes_to_check = frozenset(['button', 'icon', 'logo'])
216
+ tags_to_check = frozenset(['button', 'input'])
217
+ image_formats = frozenset(['jpg', 'jpeg', 'png', 'webp', 'avif', 'gif'])
218
+
219
+ # Pre-fetch commonly used attributes
220
+ style = img.get('style', '')
221
+ alt = img.get('alt', '')
222
+ src = img.get('src', '')
223
+ data_src = img.get('data-src', '')
224
+ srcset = img.get('srcset', '')
225
+ data_srcset = img.get('data-srcset', '')
226
+ width = img.get('width')
227
+ height = img.get('height')
228
+ parent = img.parent
229
+ parent_classes = parent.get('class', [])
230
+
231
+ # Quick validation checks
232
+ if ('display:none' in style or
233
+ parent.name in tags_to_check or
234
+ any(c in cls for c in parent_classes for cls in classes_to_check) or
235
+ any(c in src for c in classes_to_check) or
236
+ any(c in alt for c in classes_to_check)):
237
+ return None
238
+
239
+ # Quick score calculation
240
+ score = 0
241
+ if width and width.isdigit():
242
+ width_val = int(width)
243
+ score += 1 if width_val > 150 else 0
244
+ if height and height.isdigit():
245
+ height_val = int(height)
246
+ score += 1 if height_val > 150 else 0
247
+ if alt:
248
+ score += 1
249
+ score += index/total_images < 0.5
250
+
251
+ # image_format = ''
252
+ # if "data:image/" in src:
253
+ # image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0]
254
+ # else:
255
+ # image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0]
256
+
257
+ # if image_format in ('jpg', 'png', 'webp', 'avif'):
258
+ # score += 1
259
+
260
+
261
+ # Check for image format in all possible sources
262
+ def has_image_format(url):
263
+ return any(fmt in url.lower() for fmt in image_formats)
264
+
265
+ # Score for having proper image sources
266
+ if any(has_image_format(url) for url in [src, data_src, srcset, data_srcset]):
267
+ score += 1
268
+ if srcset or data_srcset:
269
+ score += 1
270
+ if img.find_parent('picture'):
271
+ score += 1
272
+
273
+ # Detect format from any available source
274
+ detected_format = None
275
+ for url in [src, data_src, srcset, data_srcset]:
276
+ if url:
277
+ format_matches = [fmt for fmt in image_formats if fmt in url.lower()]
278
+ if format_matches:
279
+ detected_format = format_matches[0]
280
+ break
281
+
282
+ if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD):
283
+ return None
284
+
285
+ # Use set for deduplication
286
+ unique_urls = set()
287
+ image_variants = []
288
+
289
+ # Generate a unique group ID for this set of variants
290
+ group_id = index
291
+
292
+ # Base image info template
293
+ image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
294
+ base_info = {
295
+ 'alt': alt,
296
+ 'desc': self.find_closest_parent_with_useful_text(img, **kwargs),
297
+ 'score': score,
298
+ 'type': 'image',
299
+ 'group_id': group_id, # Group ID for this set of variants
300
+ 'format': detected_format,
301
+ }
302
+
303
+ # Inline function for adding variants
304
+ def add_variant(src, width=None):
305
+ if src and not src.startswith('data:') and src not in unique_urls:
306
+ unique_urls.add(src)
307
+ image_variants.append({**base_info, 'src': src, 'width': width})
308
+
309
+ # Process all sources
310
+ add_variant(src)
311
+ add_variant(data_src)
312
+
313
+ # Handle srcset and data-srcset in one pass
314
+ for attr in ('srcset', 'data-srcset'):
315
+ if value := img.get(attr):
316
+ for source in parse_srcset(value):
317
+ add_variant(source['url'], source['width'])
318
+
319
+ # Quick picture element check
320
+ if picture := img.find_parent('picture'):
321
+ for source in picture.find_all('source'):
322
+ if srcset := source.get('srcset'):
323
+ for src in parse_srcset(srcset):
324
+ add_variant(src['url'], src['width'])
325
+
326
+ # Framework-specific attributes in one pass
327
+ for attr, value in img.attrs.items():
328
+ if attr.startswith('data-') and ('src' in attr or 'srcset' in attr) and 'http' in value:
329
+ add_variant(value)
330
+
331
+ return image_variants if image_variants else None
332
+
333
+ def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:
334
+ """
335
+ Process an HTML element.
336
+
337
+ How it works:
338
+ 1. Check if the element is an image, video, or audio.
339
+ 2. Extract the element's attributes and content.
340
+ 3. Process the element based on its type.
341
+ 4. Return the processed element information.
342
+
343
+ Args:
344
+ url (str): The URL of the page containing the element.
345
+ element (Tag): The HTML element to process.
346
+ **kwargs: Additional keyword arguments.
347
+
348
+ Returns:
349
+ dict: A dictionary containing the processed element information.
350
+ """
351
+ media = {'images': [], 'videos': [], 'audios': []}
352
+ internal_links_dict = {}
353
+ external_links_dict = {}
354
+ self._process_element(
355
+ url,
356
+ element,
357
+ media,
358
+ internal_links_dict,
359
+ external_links_dict,
360
+ **kwargs
361
+ )
362
+ return {
363
+ 'media': media,
364
+ 'internal_links_dict': internal_links_dict,
365
+ 'external_links_dict': external_links_dict
366
+ }
367
+
368
+ def _process_element(self, url, element: PageElement, media: Dict[str, Any], internal_links_dict: Dict[str, Any], external_links_dict: Dict[str, Any], **kwargs) -> bool:
369
+ """
370
+ Process an HTML element.
371
+ """
372
+ try:
373
+ if isinstance(element, NavigableString):
374
+ if isinstance(element, Comment):
375
+ element.extract()
376
+ return False
377
+
378
+ # if element.name == 'img':
379
+ # process_image(element, url, 0, 1)
380
+ # return True
381
+ base_domain = kwargs.get("base_domain", get_base_domain(url))
382
+
383
+ if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
384
+ element.decompose()
385
+ return False
386
+
387
+ keep_element = False
388
+
389
+ exclude_domains = kwargs.get('exclude_domains', [])
390
+ # exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS))
391
+ # exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', [])
392
+ # exclude_social_media_domains = list(set(exclude_social_media_domains))
393
+
394
+ try:
395
+ if element.name == 'a' and element.get('href'):
396
+ href = element.get('href', '').strip()
397
+ if not href: # Skip empty hrefs
398
+ return False
399
+
400
+ url_base = url.split('/')[2]
401
+
402
+ # Normalize the URL
403
+ try:
404
+ normalized_href = normalize_url(href, url)
405
+ except ValueError as e:
406
+ # logging.warning(f"Invalid URL format: {href}, Error: {str(e)}")
407
+ return False
408
+
409
+ link_data = {
410
+ 'href': normalized_href,
411
+ 'text': element.get_text().strip(),
412
+ 'title': element.get('title', '').strip(),
413
+ 'base_domain': base_domain
414
+ }
415
+
416
+ is_external = is_external_url(normalized_href, base_domain)
417
+
418
+ keep_element = True
419
+
420
+ # Handle external link exclusions
421
+ if is_external:
422
+ link_base_domain = get_base_domain(normalized_href)
423
+ link_data['base_domain'] = link_base_domain
424
+ if kwargs.get('exclude_external_links', False):
425
+ element.decompose()
426
+ return False
427
+ # elif kwargs.get('exclude_social_media_links', False):
428
+ # if link_base_domain in exclude_social_media_domains:
429
+ # element.decompose()
430
+ # return False
431
+ # if any(domain in normalized_href.lower() for domain in exclude_social_media_domains):
432
+ # element.decompose()
433
+ # return False
434
+ elif exclude_domains:
435
+ if link_base_domain in exclude_domains:
436
+ element.decompose()
437
+ return False
438
+ # if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])):
439
+ # element.decompose()
440
+ # return False
441
+
442
+ if is_external:
443
+ if normalized_href not in external_links_dict:
444
+ external_links_dict[normalized_href] = link_data
445
+ else:
446
+ if normalized_href not in internal_links_dict:
447
+ internal_links_dict[normalized_href] = link_data
448
+
449
+
450
+ except Exception as e:
451
+ raise Exception(f"Error processing links: {str(e)}")
452
+
453
+ try:
454
+ if element.name == 'img':
455
+ potential_sources = ['src', 'data-src', 'srcset', 'data-lazy-src', 'data-original']
456
+ src = element.get('src', '')
457
+ while not src and potential_sources:
458
+ src = element.get(potential_sources.pop(0), '')
459
+ if not src:
460
+ element.decompose()
461
+ return False
462
+
463
+ # If it is srcset pick up the first image
464
+ if 'srcset' in element.attrs:
465
+ src = element.attrs['srcset'].split(',')[0].split(' ')[0]
466
+
467
+ # If the image src is internal, keep the element and skip the external-image checks below
468
+ if not is_external_url(src, base_domain):
469
+ return True
470
+
471
+ image_src_base_domain = get_base_domain(src)
472
+
473
+ # Check flag if we should remove external images
474
+ if kwargs.get('exclude_external_images', False):
475
+ element.decompose()
476
+ return False
477
+ # src_url_base = src.split('/')[2]
478
+ # url_base = url.split('/')[2]
479
+ # if url_base not in src_url_base:
480
+ # element.decompose()
481
+ # return False
482
+
483
+ # if kwargs.get('exclude_social_media_links', False):
484
+ # if image_src_base_domain in exclude_social_media_domains:
485
+ # element.decompose()
486
+ # return False
487
+ # src_url_base = src.split('/')[2]
488
+ # url_base = url.split('/')[2]
489
+ # if any(domain in src for domain in exclude_social_media_domains):
490
+ # element.decompose()
491
+ # return False
492
+
493
+ # Handle exclude domains
494
+ if exclude_domains:
495
+ if image_src_base_domain in exclude_domains:
496
+ element.decompose()
497
+ return False
498
+ # if any(domain in src for domain in kwargs.get('exclude_domains', [])):
499
+ # element.decompose()
500
+ # return False
501
+
502
+ return True # Always keep image elements
503
+ except Exception as e:
504
+ raise "Error processing images"
505
+
506
+
507
+ # Check if flag to remove all forms is set
508
+ if kwargs.get('remove_forms', False) and element.name == 'form':
509
+ element.decompose()
510
+ return False
511
+
512
+ if element.name in ['video', 'audio']:
513
+ media[f"{element.name}s"].append({
514
+ 'src': element.get('src'),
515
+ 'alt': element.get('alt'),
516
+ 'type': element.name,
517
+ 'description': self.find_closest_parent_with_useful_text(element, **kwargs)
518
+ })
519
+ source_tags = element.find_all('source')
520
+ for source_tag in source_tags:
521
+ media[f"{element.name}s"].append({
522
+ 'src': source_tag.get('src'),
523
+ 'alt': element.get('alt'),
524
+ 'type': element.name,
525
+ 'description': self.find_closest_parent_with_useful_text(element, **kwargs)
526
+ })
527
+ return True # Always keep video and audio elements
528
+
529
+ if element.name in ONLY_TEXT_ELIGIBLE_TAGS:
530
+ if kwargs.get('only_text', False):
531
+ element.replace_with(element.get_text())
532
+
533
+ try:
534
+ self.remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False))
535
+ except Exception as e:
536
+ # print('Error removing unwanted attributes:', str(e))
537
+ self._log('error',
538
+ message="Error removing unwanted attributes: {error}",
539
+ tag="SCRAPE",
540
+ params={"error": str(e)}
541
+ )
542
+ # Process children
543
+ for child in list(element.children):
544
+ if isinstance(child, NavigableString) and not isinstance(child, Comment):
545
+ if len(child.strip()) > 0:
546
+ keep_element = True
547
+ else:
548
+ if self._process_element(url, child, media, internal_links_dict, external_links_dict, **kwargs):
549
+ keep_element = True
550
+
551
+
552
+ # Check word count
553
+ word_count_threshold = kwargs.get('word_count_threshold', MIN_WORD_THRESHOLD)
554
+ if not keep_element:
555
+ word_count = len(element.get_text(strip=True).split())
556
+ keep_element = word_count >= word_count_threshold
557
+
558
+ if not keep_element:
559
+ element.decompose()
560
+
561
+ return keep_element
562
+ except Exception as e:
563
+ # print('Error processing element:', str(e))
564
+ self._log('error',
565
+ message="Error processing element: {error}",
566
+ tag="SCRAPE",
567
+ params={"error": str(e)}
568
+ )
569
+ return False
570
+
571
+ def _scrap(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
572
+ """
573
+ Extract content from HTML using BeautifulSoup.
574
+
575
+ Args:
576
+ url (str): The URL of the page to scrape.
577
+ html (str): The HTML content of the page to scrape.
578
+ word_count_threshold (int): The minimum word count threshold for content extraction.
579
+ css_selector (str): The CSS selector to use for content extraction.
580
+ **kwargs: Additional keyword arguments.
581
+
582
+ Returns:
583
+ dict: A dictionary containing the extracted content.
584
+ """
585
+ success = True
586
+ if not html:
587
+ return None
588
+
589
+ parser_type = kwargs.get('parser', 'lxml')
590
+ soup = BeautifulSoup(html, parser_type)
591
+ body = soup.body
592
+ base_domain = get_base_domain(url)
593
+
594
+ try:
595
+ meta = extract_metadata("", soup)
596
+ except Exception as e:
597
+ self._log('error',
598
+ message="Error extracting metadata: {error}",
599
+ tag="SCRAPE",
600
+ params={"error": str(e)}
601
+ )
602
+ meta = {}
603
+
604
+ # Handle tag-based removal first - faster than CSS selection
605
+ excluded_tags = set(kwargs.get('excluded_tags', []) or [])
606
+ if excluded_tags:
607
+ for element in body.find_all(lambda tag: tag.name in excluded_tags):
608
+ element.extract()
609
+
610
+ # Handle CSS selector-based removal
611
+ excluded_selector = kwargs.get('excluded_selector', '')
612
+ if excluded_selector:
613
+ is_single_selector = ',' not in excluded_selector and ' ' not in excluded_selector
614
+ if is_single_selector:
615
+ while element := body.select_one(excluded_selector):
616
+ element.extract()
617
+ else:
618
+ for element in body.select(excluded_selector):
619
+ element.extract()
620
+
621
+ if css_selector:
622
+ selected_elements = body.select(css_selector)
623
+ if not selected_elements:
624
+ return {
625
+ 'markdown': '',
626
+ 'cleaned_html': '',
627
+ 'success': True,
628
+ 'media': {'images': [], 'videos': [], 'audios': []},
629
+ 'links': {'internal': [], 'external': []},
630
+ 'metadata': {},
631
+ 'message': f"No elements found for CSS selector: {css_selector}"
632
+ }
633
+ # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
634
+ body = soup.new_tag('div')
635
+ for el in selected_elements:
636
+ body.append(el)
637
+
638
+ kwargs['exclude_social_media_domains'] = set(kwargs.get('exclude_social_media_domains', []) + SOCIAL_MEDIA_DOMAINS)
639
+ kwargs['exclude_domains'] = set(kwargs.get('exclude_domains', []))
640
+ if kwargs.get('exclude_social_media_links', False):
641
+ kwargs['exclude_domains'] = kwargs['exclude_domains'].union(kwargs['exclude_social_media_domains'])
642
+
643
+ result_obj = self.process_element(
644
+ url,
645
+ body,
646
+ word_count_threshold = word_count_threshold,
647
+ base_domain=base_domain,
648
+ **kwargs
649
+ )
650
+
651
+ links = {'internal': [], 'external': []}
652
+ media = result_obj['media']
653
+ internal_links_dict = result_obj['internal_links_dict']
654
+ external_links_dict = result_obj['external_links_dict']
655
+
656
+ # Update the links dictionary with unique links
657
+ links['internal'] = list(internal_links_dict.values())
658
+ links['external'] = list(external_links_dict.values())
659
+
660
+ # # Process images using ThreadPoolExecutor
661
+ imgs = body.find_all('img')
662
+
663
+ media['images'] = [
664
+ img for result in (self.process_image(img, url, i, len(imgs))
665
+ for i, img in enumerate(imgs))
666
+ if result is not None
667
+ for img in result
668
+ ]
669
+
670
+ body = self.flatten_nested_elements(body)
671
+ base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
672
+ for img in imgs:
673
+ src = img.get('src', '')
674
+ if base64_pattern.match(src):
675
+ # Replace base64 data with empty string
676
+ img['src'] = base64_pattern.sub('', src)
677
+
678
+ str_body = ""
679
+ try:
680
+ str_body = body.encode_contents().decode('utf-8')
681
+ except Exception as e:
682
+ # Reset body to the original HTML
683
+ success = False
684
+ body = BeautifulSoup(html, 'html.parser')
685
+
686
+ # Create a new div with a special ID
687
+ error_div = body.new_tag('div', id='crawl4ai_error_message')
688
+ error_div.string = '''
689
+ Crawl4AI Error: This page is not fully supported.
690
+
691
+ Possible reasons:
692
+ 1. The page may have restrictions that prevent crawling.
693
+ 2. The page might not be fully loaded.
694
+
695
+ Suggestions:
696
+ - Try calling the crawl function with these parameters:
697
+ magic=True,
698
+ - Set headless=False to visualize what's happening on the page.
699
+
700
+ If the issue persists, please check the page's structure and any potential anti-crawling measures.
701
+ '''
702
+
703
+ # Append the error div to the body
704
+ body.body.append(error_div)
705
+ str_body = body.encode_contents().decode('utf-8')
706
+
707
+ print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.")
708
+ self._log('error',
709
+ message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.",
710
+ tag="SCRAPE"
711
+ )
712
+
713
+ cleaned_html = str_body.replace('\n\n', '\n').replace('  ', ' ')
714
+
715
+
716
+ return {
717
+ # **markdown_content,
718
+ 'cleaned_html': cleaned_html,
719
+ 'success': success,
720
+ 'media': media,
721
+ 'links': links,
722
+ 'metadata': meta
723
+ }
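As a hedged illustration of how the scraping strategy above is meant to be called, here is a minimal sketch. The URL and HTML are placeholders; only the scrap signature, the kwargs read inside _scrap/_process_element, and the returned keys shown in this diff are assumed.

from crawl4ai.content_scraping_strategy import WebScrapingStrategy

scraper = WebScrapingStrategy()  # an optional logger can be passed
result = scraper.scrap(
    url="https://example.com/post",
    html="<html><body><article><p>Some page text with enough words to keep.</p></article></body></html>",
    word_count_threshold=5,       # minimum words for an element to survive
    exclude_external_links=True,  # drop <a> tags pointing off the base domain
)
print(result["success"])
print(result["cleaned_html"])
print(result["links"]["internal"], result["media"]["images"])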
crawl4ai/crawler_strategy.py ADDED
@@ -0,0 +1,360 @@
1
+ from abc import ABC, abstractmethod
2
+ from selenium import webdriver
3
+ from selenium.webdriver.chrome.service import Service
4
+ from selenium.webdriver.common.by import By
5
+ from selenium.webdriver.support.ui import WebDriverWait
6
+ from selenium.webdriver.support import expected_conditions as EC
7
+ from selenium.webdriver.chrome.options import Options
8
+ from selenium.common.exceptions import InvalidArgumentException, WebDriverException
9
+ # from selenium.webdriver.chrome.service import Service as ChromeService
10
+ # from webdriver_manager.chrome import ChromeDriverManager
11
+ # from urllib3.exceptions import MaxRetryError
12
+
13
+ from .config import *
14
+ import logging, time
15
+ import base64
16
+ from PIL import Image, ImageDraw, ImageFont
17
+ from io import BytesIO
18
+ from typing import List, Callable
19
+ import requests
20
+ import os
21
+ from pathlib import Path
22
+ from .utils import *
23
+
24
+ logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
25
+ logger.setLevel(logging.WARNING)
26
+
27
+ logger_driver = logging.getLogger('selenium.webdriver.common.service')
28
+ logger_driver.setLevel(logging.WARNING)
29
+
30
+ urllib3_logger = logging.getLogger('urllib3.connectionpool')
31
+ urllib3_logger.setLevel(logging.WARNING)
32
+
33
+ # Disable http.client logging
34
+ http_client_logger = logging.getLogger('http.client')
35
+ http_client_logger.setLevel(logging.WARNING)
36
+
37
+ # Disable driver_finder and service logging
38
+ driver_finder_logger = logging.getLogger('selenium.webdriver.common.driver_finder')
39
+ driver_finder_logger.setLevel(logging.WARNING)
40
+
41
+
42
+
43
+
44
+ class CrawlerStrategy(ABC):
45
+ @abstractmethod
46
+ def crawl(self, url: str, **kwargs) -> str:
47
+ pass
48
+
49
+ @abstractmethod
50
+ def take_screenshot(self, save_path: str):
51
+ pass
52
+
53
+ @abstractmethod
54
+ def update_user_agent(self, user_agent: str):
55
+ pass
56
+
57
+ @abstractmethod
58
+ def set_hook(self, hook_type: str, hook: Callable):
59
+ pass
60
+
61
+ class CloudCrawlerStrategy(CrawlerStrategy):
62
+ def __init__(self, use_cached_html = False):
63
+ super().__init__()
64
+ self.use_cached_html = use_cached_html
65
+
66
+ def crawl(self, url: str) -> str:
67
+ data = {
68
+ "urls": [url],
69
+ "include_raw_html": True,
70
+ "forced": True,
71
+ "extract_blocks": False,
72
+ }
73
+
74
+ response = requests.post("http://crawl4ai.uccode.io/crawl", json=data)
75
+ response = response.json()
76
+ html = response["results"][0]["html"]
77
+ return sanitize_input_encode(html)
78
+
79
+ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
80
+ def __init__(self, use_cached_html=False, js_code=None, **kwargs):
81
+ super().__init__()
82
+ print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
83
+ self.options = Options()
84
+ self.options.headless = True
85
+ if kwargs.get("proxy"):
86
+ self.options.add_argument("--proxy-server={}".format(kwargs.get("proxy")))
87
+ if kwargs.get("user_agent"):
88
+ self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
89
+ else:
90
+ user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
91
+ self.options.add_argument(f"--user-agent={user_agent}")
92
+ self.options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
93
+
94
+ self.options.headless = kwargs.get("headless", True)
95
+ if self.options.headless:
96
+ self.options.add_argument("--headless")
97
+
98
+ self.options.add_argument("--disable-gpu")
99
+ self.options.add_argument("--window-size=1920,1080")
100
+ self.options.add_argument("--no-sandbox")
101
+ self.options.add_argument("--disable-dev-shm-usage")
102
+ self.options.add_argument("--disable-blink-features=AutomationControlled")
103
+
104
+ # self.options.add_argument("--disable-dev-shm-usage")
105
+ self.options.add_argument("--disable-gpu")
106
+ # self.options.add_argument("--disable-extensions")
107
+ # self.options.add_argument("--disable-infobars")
108
+ # self.options.add_argument("--disable-logging")
109
+ # self.options.add_argument("--disable-popup-blocking")
110
+ # self.options.add_argument("--disable-translate")
111
+ # self.options.add_argument("--disable-default-apps")
112
+ # self.options.add_argument("--disable-background-networking")
113
+ # self.options.add_argument("--disable-sync")
114
+ # self.options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess")
115
+ # self.options.add_argument("--disable-browser-side-navigation")
116
+ # self.options.add_argument("--dns-prefetch-disable")
117
+ # self.options.add_argument("--disable-web-security")
118
+ self.options.add_argument("--log-level=3")
119
+ self.use_cached_html = use_cached_html
121
+ self.js_code = js_code
122
+ self.verbose = kwargs.get("verbose", False)
123
+
124
+ # Hooks
125
+ self.hooks = {
126
+ 'on_driver_created': None,
127
+ 'on_user_agent_updated': None,
128
+ 'before_get_url': None,
129
+ 'after_get_url': None,
130
+ 'before_return_html': None
131
+ }
132
+
133
+ # chromedriver_autoinstaller.install()
134
+ # import chromedriver_autoinstaller
135
+ # crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
136
+ # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)
137
+ # chromedriver_path = chromedriver_autoinstaller.install()
138
+ # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver()
139
+ # self.service = Service(chromedriver_autoinstaller.install())
140
+
141
+
142
+ # chromedriver_path = ChromeDriverManager().install()
143
+ # self.service = Service(chromedriver_path)
144
+ # self.service.log_path = "NUL"
145
+ # self.driver = webdriver.Chrome(service=self.service, options=self.options)
146
+
147
+ # Use selenium-manager (built into Selenium 4.10.0+)
148
+ self.service = Service()
149
+ self.driver = webdriver.Chrome(options=self.options)
150
+
151
+ self.driver = self.execute_hook('on_driver_created', self.driver)
152
+
153
+ if kwargs.get("cookies"):
154
+ for cookie in kwargs.get("cookies"):
155
+ self.driver.add_cookie(cookie)
156
+
157
+
158
+
159
+ def set_hook(self, hook_type: str, hook: Callable):
160
+ if hook_type in self.hooks:
161
+ self.hooks[hook_type] = hook
162
+ else:
163
+ raise ValueError(f"Invalid hook type: {hook_type}")
164
+
165
+ def execute_hook(self, hook_type: str, *args):
166
+ hook = self.hooks.get(hook_type)
167
+ if hook:
168
+ result = hook(*args)
169
+ if result is not None:
170
+ if isinstance(result, webdriver.Chrome):
171
+ return result
172
+ else:
173
+ raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.")
174
+ # If the hook returns None or there is no hook, return self.driver
175
+ return self.driver
176
+
177
+ def update_user_agent(self, user_agent: str):
178
+ self.options.add_argument(f"user-agent={user_agent}")
179
+ self.driver.quit()
180
+ self.driver = webdriver.Chrome(service=self.service, options=self.options)
181
+ self.driver = self.execute_hook('on_user_agent_updated', self.driver)
182
+
183
+ def set_custom_headers(self, headers: dict):
184
+ # Enable Network domain for sending headers
185
+ self.driver.execute_cdp_cmd('Network.enable', {})
186
+ # Set extra HTTP headers
187
+ self.driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers})
188
+
189
+ def _ensure_page_load(self, max_checks=6, check_interval=0.01):
190
+ initial_length = len(self.driver.page_source)
191
+
192
+ for ix in range(max_checks):
193
+ # print(f"Checking page load: {ix}")
194
+ time.sleep(check_interval)
195
+ current_length = len(self.driver.page_source)
196
+
197
+ if current_length != initial_length:
198
+ break
199
+
200
+ return self.driver.page_source
201
+
202
+ def crawl(self, url: str, **kwargs) -> str:
203
+ # Create md5 hash of the URL
204
+ import hashlib
205
+ url_hash = hashlib.md5(url.encode()).hexdigest()
206
+
207
+ if self.use_cached_html:
208
+ cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
209
+ if os.path.exists(cache_file_path):
210
+ with open(cache_file_path, "r") as f:
211
+ return sanitize_input_encode(f.read())
212
+
213
+ try:
214
+ self.driver = self.execute_hook('before_get_url', self.driver)
215
+ if self.verbose:
216
+ print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
217
+ self.driver.get(url) #<html><head></head><body></body></html>
218
+
219
+ WebDriverWait(self.driver, 20).until(
220
+ lambda d: d.execute_script('return document.readyState') == 'complete'
221
+ )
222
+ WebDriverWait(self.driver, 10).until(
223
+ EC.presence_of_all_elements_located((By.TAG_NAME, "body"))
224
+ )
225
+
226
+ self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
227
+
228
+ self.driver = self.execute_hook('after_get_url', self.driver)
229
+ html = sanitize_input_encode(self._ensure_page_load()) # self.driver.page_source
230
+ can_not_be_done_headless = False # Look at my creativity for naming variables
231
+
232
+ # TODO: Very ugly approach, but promise to change it!
233
+ if kwargs.get('bypass_headless', False) or html == "<html><head></head><body></body></html>":
234
+ print("[LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode...")
235
+ can_not_be_done_headless = True
236
+ options = Options()
237
+ options.headless = False
238
+ # set window size very small
239
+ options.add_argument("--window-size=5,5")
240
+ driver = webdriver.Chrome(service=self.service, options=options)
241
+ driver.get(url)
242
+ self.driver = self.execute_hook('after_get_url', driver)
243
+ html = sanitize_input_encode(driver.page_source)
244
+ driver.quit()
245
+
246
+ # Execute JS code if provided
247
+ self.js_code = kwargs.get("js_code", self.js_code)
248
+ if self.js_code and type(self.js_code) == str:
249
+ self.driver.execute_script(self.js_code)
250
+ # Optionally, wait for some condition after executing the JS code
251
+ WebDriverWait(self.driver, 10).until(
252
+ lambda driver: driver.execute_script("return document.readyState") == "complete"
253
+ )
254
+ elif self.js_code and type(self.js_code) == list:
255
+ for js in self.js_code:
256
+ self.driver.execute_script(js)
257
+ WebDriverWait(self.driver, 10).until(
258
+ lambda driver: driver.execute_script("return document.readyState") == "complete"
259
+ )
260
+
261
+ # Optionally, wait for some condition after executing the JS code : Contributed by (https://github.com/jonymusky)
262
+ wait_for = kwargs.get('wait_for', False)
263
+ if wait_for:
264
+ if callable(wait_for):
265
+ print("[LOG] 🔄 Waiting for condition...")
266
+ WebDriverWait(self.driver, 20).until(wait_for)
267
+ else:
268
+ print("[LOG] 🔄 Waiting for condition...")
269
+ WebDriverWait(self.driver, 20).until(
270
+ EC.presence_of_element_located((By.CSS_SELECTOR, wait_for))
271
+ )
272
+
273
+ if not can_not_be_done_headless:
274
+ html = sanitize_input_encode(self.driver.page_source)
275
+ self.driver = self.execute_hook('before_return_html', self.driver, html)
276
+
277
+ # Store in cache
278
+ cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
279
+ with open(cache_file_path, "w", encoding="utf-8") as f:
280
+ f.write(html)
281
+
282
+ if self.verbose:
283
+ print(f"[LOG] ✅ Crawled {url} successfully!")
284
+
285
+ return html
286
+ except InvalidArgumentException as e:
287
+ if not hasattr(e, 'msg'):
288
+ e.msg = sanitize_input_encode(str(e))
289
+ raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
290
+ except WebDriverException as e:
291
+ # If e does not have a msg attribute, create it and set it to str(e)
292
+ if not hasattr(e, 'msg'):
293
+ e.msg = sanitize_input_encode(str(e))
294
+ raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
295
+ except Exception as e:
296
+ if not hasattr(e, 'msg'):
297
+ e.msg = sanitize_input_encode(str(e))
298
+ raise Exception(f"Failed to crawl {url}: {e.msg}")
299
+
300
+ def take_screenshot(self) -> str:
301
+ try:
302
+ # Get the dimensions of the page
303
+ total_width = self.driver.execute_script("return document.body.scrollWidth")
304
+ total_height = self.driver.execute_script("return document.body.scrollHeight")
305
+
306
+ # Set the window size to the dimensions of the page
307
+ self.driver.set_window_size(total_width, total_height)
308
+
309
+ # Take screenshot
310
+ screenshot = self.driver.get_screenshot_as_png()
311
+
312
+ # Open the screenshot with PIL
313
+ image = Image.open(BytesIO(screenshot))
314
+
315
+ # Convert image to RGB mode (this will handle both RGB and RGBA images)
316
+ rgb_image = image.convert('RGB')
317
+
318
+ # Convert to JPEG and compress
319
+ buffered = BytesIO()
320
+ rgb_image.save(buffered, format="JPEG", quality=85)
321
+ img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
322
+
323
+ if self.verbose:
324
+ print(f"[LOG] 📸 Screenshot taken and converted to base64")
325
+
326
+ return img_base64
327
+ except Exception as e:
328
+ error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")
329
+ print(error_message)
330
+
331
+ # Generate an image with black background
332
+ img = Image.new('RGB', (800, 600), color='black')
333
+ draw = ImageDraw.Draw(img)
334
+
335
+ # Load a font
336
+ try:
337
+ font = ImageFont.truetype("arial.ttf", 40)
338
+ except IOError:
339
+ font = ImageFont.load_default()
340
+
341
+ # Define text color and wrap the text
342
+ text_color = (255, 255, 255)
343
+ max_width = 780
344
+ wrapped_text = wrap_text(draw, error_message, font, max_width)
345
+
346
+ # Calculate text position
347
+ text_position = (10, 10)
348
+
349
+ # Draw the text on the image
350
+ draw.text(text_position, wrapped_text, fill=text_color, font=font)
351
+
352
+ # Convert to base64
353
+ buffered = BytesIO()
354
+ img.save(buffered, format="JPEG")
355
+ img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
356
+
357
+ return img_base64
358
+
359
+ def quit(self):
360
+ self.driver.quit()
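A minimal usage sketch for the Selenium-based strategy above, assuming a local Chrome installation and Selenium 4.10+ so that selenium-manager can resolve the driver, as noted in __init__. The URL and hook are illustrative only.

from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy

strategy = LocalSeleniumCrawlerStrategy(use_cached_html=False, verbose=True)
# Hooks must return a webdriver.Chrome instance or None (see execute_hook above).
strategy.set_hook("before_get_url", lambda driver: driver)
try:
    html = strategy.crawl("https://example.com", wait_for="body")  # wait_for: CSS selector or callable
    screenshot_b64 = strategy.take_screenshot()                    # base64-encoded JPEG
finally:
    strategy.quit()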
crawl4ai/database.py ADDED
@@ -0,0 +1,135 @@
1
+ import os
2
+ from pathlib import Path
3
+ import sqlite3
4
+ from typing import Optional, Tuple
5
+
6
+ DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
7
+ os.makedirs(DB_PATH, exist_ok=True)
8
+ DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
9
+
10
+ def init_db():
11
+ global DB_PATH
12
+ conn = sqlite3.connect(DB_PATH)
13
+ cursor = conn.cursor()
14
+ cursor.execute('''
15
+ CREATE TABLE IF NOT EXISTS crawled_data (
16
+ url TEXT PRIMARY KEY,
17
+ html TEXT,
18
+ cleaned_html TEXT,
19
+ markdown TEXT,
20
+ extracted_content TEXT,
21
+ success BOOLEAN,
22
+ media TEXT DEFAULT "{}",
23
+ links TEXT DEFAULT "{}",
24
+ metadata TEXT DEFAULT "{}",
25
+ screenshot TEXT DEFAULT ""
26
+ )
27
+ ''')
28
+ conn.commit()
29
+ conn.close()
30
+
31
+ def alter_db_add_screenshot(new_column: str = "media"):
32
+ check_db_path()
33
+ try:
34
+ conn = sqlite3.connect(DB_PATH)
35
+ cursor = conn.cursor()
36
+ cursor.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
37
+ conn.commit()
38
+ conn.close()
39
+ except Exception as e:
40
+ print(f"Error altering database to add screenshot column: {e}")
41
+
42
+ def check_db_path():
43
+ if not DB_PATH:
44
+ raise ValueError("Database path is not set or is empty.")
45
+
46
+ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
47
+ check_db_path()
48
+ try:
49
+ conn = sqlite3.connect(DB_PATH)
50
+ cursor = conn.cursor()
51
+ cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', (url,))
52
+ result = cursor.fetchone()
53
+ conn.close()
54
+ return result
55
+ except Exception as e:
56
+ print(f"Error retrieving cached URL: {e}")
57
+ return None
58
+
59
+ def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", metadata : str = "{}", screenshot: str = ""):
60
+ check_db_path()
61
+ try:
62
+ conn = sqlite3.connect(DB_PATH)
63
+ cursor = conn.cursor()
64
+ cursor.execute('''
65
+ INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
66
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
67
+ ON CONFLICT(url) DO UPDATE SET
68
+ html = excluded.html,
69
+ cleaned_html = excluded.cleaned_html,
70
+ markdown = excluded.markdown,
71
+ extracted_content = excluded.extracted_content,
72
+ success = excluded.success,
73
+ media = excluded.media,
74
+ links = excluded.links,
75
+ metadata = excluded.metadata,
76
+ screenshot = excluded.screenshot
77
+ ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
78
+ conn.commit()
79
+ conn.close()
80
+ except Exception as e:
81
+ print(f"Error caching URL: {e}")
82
+
83
+ def get_total_count() -> int:
84
+ check_db_path()
85
+ try:
86
+ conn = sqlite3.connect(DB_PATH)
87
+ cursor = conn.cursor()
88
+ cursor.execute('SELECT COUNT(*) FROM crawled_data')
89
+ result = cursor.fetchone()
90
+ conn.close()
91
+ return result[0]
92
+ except Exception as e:
93
+ print(f"Error getting total count: {e}")
94
+ return 0
95
+
96
+ def clear_db():
97
+ check_db_path()
98
+ try:
99
+ conn = sqlite3.connect(DB_PATH)
100
+ cursor = conn.cursor()
101
+ cursor.execute('DELETE FROM crawled_data')
102
+ conn.commit()
103
+ conn.close()
104
+ except Exception as e:
105
+ print(f"Error clearing database: {e}")
106
+
107
+ def flush_db():
108
+ check_db_path()
109
+ try:
110
+ conn = sqlite3.connect(DB_PATH)
111
+ cursor = conn.cursor()
112
+ cursor.execute('DROP TABLE crawled_data')
113
+ conn.commit()
114
+ conn.close()
115
+ except Exception as e:
116
+ print(f"Error flushing database: {e}")
117
+
118
+ def update_existing_records(new_column: str = "media", default_value: str = "{}"):
119
+ check_db_path()
120
+ try:
121
+ conn = sqlite3.connect(DB_PATH)
122
+ cursor = conn.cursor()
123
+ cursor.execute(f'UPDATE crawled_data SET {new_column} = "{default_value}" WHERE screenshot IS NULL')
124
+ conn.commit()
125
+ conn.close()
126
+ except Exception as e:
127
+ print(f"Error updating existing records: {e}")
128
+
129
+ if __name__ == "__main__":
130
+ # Delete the existing database file
131
+ if os.path.exists(DB_PATH):
132
+ os.remove(DB_PATH)
133
+ init_db()
134
+ # alter_db_add_screenshot("COL_NAME")
135
+
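The SQLite helpers above can be exercised roughly as follows. This is an untested sketch; the database file lives under ~/.crawl4ai/crawl4ai.db unless CRAWL4_AI_BASE_DIRECTORY points elsewhere, and the URL and content values are placeholders.

from crawl4ai import database

database.init_db()
database.cache_url(
    url="https://example.com",
    html="<html>...</html>",
    cleaned_html="<body>...</body>",
    markdown="# Example",
    extracted_content="[]",
    success=True,
)
row = database.get_cached_url("https://example.com")  # tuple of columns, or None if not cached
print(database.get_total_count(), row is not None)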
crawl4ai/docs_manager.py ADDED
@@ -0,0 +1,67 @@
1
+ import requests
2
+ import shutil
3
+ from pathlib import Path
4
+ from crawl4ai.async_logger import AsyncLogger
5
+ from crawl4ai.llmtxt import AsyncLLMTextManager
6
+
7
+ class DocsManager:
8
+ def __init__(self, logger=None):
9
+ self.docs_dir = Path.home() / ".crawl4ai" / "docs"
10
+ self.local_docs = Path(__file__).parent.parent / "docs" / "llm.txt"
11
+ self.docs_dir.mkdir(parents=True, exist_ok=True)
12
+ self.logger = logger or AsyncLogger(verbose=True)
13
+ self.llm_text = AsyncLLMTextManager(self.docs_dir, self.logger)
14
+
15
+ async def ensure_docs_exist(self):
16
+ """Fetch docs if not present"""
17
+ if not any(self.docs_dir.iterdir()):
18
+ await self.fetch_docs()
19
+
20
+ async def fetch_docs(self) -> bool:
21
+ """Copy from local docs or download from GitHub"""
22
+ try:
23
+ # Try local first
24
+ if self.local_docs.exists() and (any(self.local_docs.glob("*.md")) or any(self.local_docs.glob("*.tokens"))):
25
+ # Empty the local docs directory
26
+ for file_path in self.docs_dir.glob("*.md"):
27
+ file_path.unlink()
28
+ # for file_path in self.docs_dir.glob("*.tokens"):
29
+ # file_path.unlink()
30
+ for file_path in self.local_docs.glob("*.md"):
31
+ shutil.copy2(file_path, self.docs_dir / file_path.name)
32
+ # for file_path in self.local_docs.glob("*.tokens"):
33
+ # shutil.copy2(file_path, self.docs_dir / file_path.name)
34
+ return True
35
+
36
+ # Fallback to GitHub
37
+ response = requests.get(
38
+ "https://api.github.com/repos/unclecode/crawl4ai/contents/docs/llm.txt",
39
+ headers={'Accept': 'application/vnd.github.v3+json'}
40
+ )
41
+ response.raise_for_status()
42
+
43
+ for item in response.json():
44
+ if item['type'] == 'file' and item['name'].endswith('.md'):
45
+ content = requests.get(item['download_url']).text
46
+ with open(self.docs_dir / item['name'], 'w', encoding='utf-8') as f:
47
+ f.write(content)
48
+ return True
49
+
50
+ except Exception as e:
51
+ self.logger.error(f"Failed to fetch docs: {str(e)}")
52
+ raise
53
+
54
+ def list(self) -> list[str]:
55
+ """List available topics"""
56
+ names = [file_path.stem for file_path in self.docs_dir.glob("*.md")]
57
+ # Remove [0-9]+_ prefix
58
+ names = [name.split("_", 1)[1] if name[0].isdigit() else name for name in names]
59
+ # Exclude names ending with .xs.md and .q.md
60
+ names = [name for name in names if not name.endswith(".xs") and not name.endswith(".q")]
61
+ return names
62
+
63
+ def generate(self, sections, mode="extended"):
64
+ return self.llm_text.generate(sections, mode)
65
+
66
+ def search(self, query: str, top_k: int = 5):
67
+ return self.llm_text.search(query, top_k)
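A minimal sketch of driving DocsManager above; fetch_docs/ensure_docs_exist are async, while list, generate and search are synchronous wrappers around AsyncLLMTextManager. The query string is illustrative, and search assumes the llm.txt index files have already been generated.

import asyncio
from crawl4ai.docs_manager import DocsManager

async def main():
    docs = DocsManager()
    await docs.ensure_docs_exist()   # copies local docs or pulls them from GitHub
    print(docs.list())               # available topic names
    print(docs.search("browser configuration", top_k=3))

asyncio.run(main())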
crawl4ai/extraction_strategy.bak.py ADDED
@@ -0,0 +1,1440 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, List, Dict, Optional, Union
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
+ import json, time
5
+ # from optimum.intel import IPEXModel
6
+ from .prompts import *
7
+ from .config import *
8
+ from .utils import *
9
+ from .models import *
10
+ from functools import partial
11
+ from .model_loader import *
12
+ import math
13
+ import numpy as np
14
+ import re
15
+ from bs4 import BeautifulSoup
16
+ from lxml import html, etree
17
+ from dataclasses import dataclass
18
+
19
+ class ExtractionStrategy(ABC):
20
+ """
21
+ Abstract base class for all extraction strategies.
22
+ """
23
+
24
+ def __init__(self, input_format: str = "markdown", **kwargs):
25
+ """
26
+ Initialize the extraction strategy.
27
+
28
+ Args:
29
+ input_format: Content format to use for extraction.
30
+ Options: "markdown" (default), "html", "fit_markdown"
31
+ **kwargs: Additional keyword arguments
32
+ """
33
+ self.input_format = input_format
34
+ self.DEL = "<|DEL|>"
35
+ self.name = self.__class__.__name__
36
+ self.verbose = kwargs.get("verbose", False)
37
+
38
+ @abstractmethod
39
+ def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
40
+ """
41
+ Extract meaningful blocks or chunks from the given HTML.
42
+
43
+ :param url: The URL of the webpage.
44
+ :param html: The HTML content of the webpage.
45
+ :return: A list of extracted blocks or chunks.
46
+ """
47
+ pass
48
+
49
+ def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
50
+ """
51
+ Process sections of text in parallel by default.
52
+
53
+ :param url: The URL of the webpage.
54
+ :param sections: List of sections (strings) to process.
55
+ :return: A list of processed JSON blocks.
56
+ """
57
+ extracted_content = []
58
+ with ThreadPoolExecutor() as executor:
59
+ futures = [executor.submit(self.extract, url, section, **kwargs) for section in sections]
60
+ for future in as_completed(futures):
61
+ extracted_content.extend(future.result())
62
+ return extracted_content
63
+
64
+ class NoExtractionStrategy(ExtractionStrategy):
65
+ """
66
+ A strategy that does not extract any meaningful content from the HTML. It simply returns the entire HTML as a single block.
67
+ """
68
+ def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
69
+ """
70
+ Extract meaningful blocks or chunks from the given HTML.
71
+ """
72
+ return [{"index": 0, "content": html}]
73
+
74
+ def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
75
+ return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]
76
+
77
+ #######################################################
78
+ # Strategies using LLM-based extraction for text data #
79
+ #######################################################
80
+ class LLMExtractionStrategy(ExtractionStrategy):
81
+ """
82
+ A strategy that uses an LLM to extract meaningful content from the HTML.
83
+
84
+ Attributes:
85
+ provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
86
+ api_token: The API token for the provider.
87
+ instruction: The instruction to use for the LLM model.
88
+ schema: Pydantic model schema for structured data.
89
+ extraction_type: "block" or "schema".
90
+ chunk_token_threshold: Maximum tokens per chunk.
91
+ overlap_rate: Overlap between chunks.
92
+ word_token_rate: Word to token conversion rate.
93
+ apply_chunking: Whether to apply chunking.
94
+ base_url: The base URL for the API request.
95
+ api_base: The base URL for the API request.
96
+ extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
97
+ verbose: Whether to print verbose output.
98
+ usages: List of individual token usages.
99
+ total_usage: Accumulated token usage.
100
+ """
101
+
102
+ def __init__(self,
103
+ provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None,
104
+ instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs):
105
+ """
106
+ Initialize the strategy with clustering parameters.
107
+
108
+ Args:
109
+ provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
110
+ api_token: The API token for the provider.
111
+ instruction: The instruction to use for the LLM model.
112
+ schema: Pydantic model schema for structured data.
113
+ extraction_type: "block" or "schema".
114
+ chunk_token_threshold: Maximum tokens per chunk.
115
+ overlap_rate: Overlap between chunks.
116
+ word_token_rate: Word to token conversion rate.
117
+ apply_chunking: Whether to apply chunking.
118
+ base_url: The base URL for the API request.
119
+ api_base: The base URL for the API request.
120
+ extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
121
+ verbose: Whether to print verbose output.
122
+ usages: List of individual token usages.
123
+ total_usage: Accumulated token usage.
124
+
125
+ """
126
+ super().__init__(**kwargs)
127
+ self.provider = provider
128
+ self.api_token = api_token or PROVIDER_MODELS.get(provider, "no-token") or os.getenv("OPENAI_API_KEY")
129
+ self.instruction = instruction
130
+ self.extract_type = extraction_type
131
+ self.schema = schema
132
+ if schema:
133
+ self.extract_type = "schema"
134
+
135
+ self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD)
136
+ self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
137
+ self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
138
+ self.apply_chunking = kwargs.get("apply_chunking", True)
139
+ self.base_url = kwargs.get("base_url", None)
140
+ self.api_base = kwargs.get("api_base", kwargs.get("base_url", None))
141
+ self.extra_args = kwargs.get("extra_args", {})
142
+ if not self.apply_chunking:
143
+ self.chunk_token_threshold = 1e9
144
+
145
+ self.verbose = kwargs.get("verbose", False)
146
+ self.usages = [] # Store individual usages
147
+ self.total_usage = TokenUsage() # Accumulated usage
148
+
149
+ if not self.api_token:
150
+ raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.")
151
+
152
+
153
+ def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]:
154
+ """
155
+ Extract meaningful blocks or chunks from the given HTML using an LLM.
156
+
157
+ How it works:
158
+ 1. Construct a prompt with variables.
159
+ 2. Make a request to the LLM using the prompt.
160
+ 3. Parse the response and extract blocks or chunks.
161
+
162
+ Args:
163
+ url: The URL of the webpage.
164
+ ix: Index of the block.
165
+ html: The HTML content of the webpage.
166
+
167
+ Returns:
168
+ A list of extracted blocks or chunks.
169
+ """
170
+ if self.verbose:
171
+ # print("[LOG] Extracting blocks from URL:", url)
172
+ print(f"[LOG] Call LLM for {url} - block index: {ix}")
173
+
174
+ variable_values = {
175
+ "URL": url,
176
+ "HTML": escape_json_string(sanitize_html(html)),
177
+ }
178
+
179
+ prompt_with_variables = PROMPT_EXTRACT_BLOCKS
180
+ if self.instruction:
181
+ variable_values["REQUEST"] = self.instruction
182
+ prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
183
+
184
+ if self.extract_type == "schema" and self.schema:
185
+ variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
186
+ prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
187
+
188
+ for variable in variable_values:
189
+ prompt_with_variables = prompt_with_variables.replace(
190
+ "{" + variable + "}", variable_values[variable]
191
+ )
192
+
193
+ response = perform_completion_with_backoff(
194
+ self.provider,
195
+ prompt_with_variables,
196
+ self.api_token,
197
+ base_url=self.api_base or self.base_url,
198
+ extra_args = self.extra_args
199
+ ) # , json_response=self.extract_type == "schema")
200
+ # Track usage
201
+ usage = TokenUsage(
202
+ completion_tokens=response.usage.completion_tokens,
203
+ prompt_tokens=response.usage.prompt_tokens,
204
+ total_tokens=response.usage.total_tokens,
205
+ completion_tokens_details=response.usage.completion_tokens_details.__dict__ if response.usage.completion_tokens_details else {},
206
+ prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ if response.usage.prompt_tokens_details else {}
207
+ )
208
+ self.usages.append(usage)
209
+
210
+ # Update totals
211
+ self.total_usage.completion_tokens += usage.completion_tokens
212
+ self.total_usage.prompt_tokens += usage.prompt_tokens
213
+ self.total_usage.total_tokens += usage.total_tokens
214
+
215
+ try:
216
+ blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
217
+ blocks = json.loads(blocks)
218
+ for block in blocks:
219
+ block['error'] = False
220
+ except Exception as e:
221
+ parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
222
+ blocks = parsed
223
+ if unparsed:
224
+ blocks.append({
225
+ "index": 0,
226
+ "error": True,
227
+ "tags": ["error"],
228
+ "content": unparsed
229
+ })
230
+
231
+ if self.verbose:
232
+ print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix)
233
+ return blocks
234
+
235
+ def _merge(self, documents, chunk_token_threshold, overlap):
236
+ """
237
+ Merge documents into sections based on chunk_token_threshold and overlap.
238
+ """
239
+ chunks = []
240
+ sections = []
241
+ total_tokens = 0
242
+
243
+ # Calculate the total tokens across all documents
244
+ for document in documents:
245
+ total_tokens += len(document.split(' ')) * self.word_token_rate
246
+
247
+ # Calculate the number of sections needed
248
+ num_sections = math.floor(total_tokens / chunk_token_threshold)
249
+ if num_sections < 1:
250
+ num_sections = 1 # Ensure there is at least one section
251
+ adjusted_chunk_threshold = total_tokens / num_sections
252
+
253
+ total_token_so_far = 0
254
+ current_chunk = []
255
+
256
+ for document in documents:
257
+ tokens = document.split(' ')
258
+ token_count = len(tokens) * self.word_token_rate
259
+
260
+ if total_token_so_far + token_count <= adjusted_chunk_threshold:
261
+ current_chunk.extend(tokens)
262
+ total_token_so_far += token_count
263
+ else:
264
+ # Ensure to handle the last section properly
265
+ if len(sections) == num_sections - 1:
266
+ current_chunk.extend(tokens)
267
+ continue
268
+
269
+ # Add overlap if specified
270
+ if overlap > 0 and current_chunk:
271
+ overlap_tokens = current_chunk[-overlap:]
272
+ current_chunk.extend(overlap_tokens)
273
+
274
+ sections.append(' '.join(current_chunk))
275
+ current_chunk = tokens
276
+ total_token_so_far = token_count
277
+
278
+ # Add the last chunk
279
+ if current_chunk:
280
+ sections.append(' '.join(current_chunk))
281
+
282
+ return sections
283
+
284
+
285
+ def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
286
+ """
287
+ Process merged sections: sequentially with a short delay for rate-limited providers (currently "groq/"), otherwise in parallel using a thread pool.
288
+
289
+ Args:
290
+ url: The URL of the webpage.
291
+ sections: List of sections (strings) to process.
292
+
293
+ Returns:
294
+ A list of extracted blocks or chunks.
295
+ """
296
+
297
+ merged_sections = self._merge(
298
+ sections, self.chunk_token_threshold,
299
+ overlap= int(self.chunk_token_threshold * self.overlap_rate)
300
+ )
301
+ extracted_content = []
302
+ if self.provider.startswith("groq/"):
303
+ # Sequential processing with a delay
304
+ for ix, section in enumerate(merged_sections):
305
+ extract_func = partial(self.extract, url)
306
+ extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
307
+ time.sleep(0.5) # 500 ms delay between each processing
308
+ else:
309
+ # Parallel processing using ThreadPoolExecutor
310
+ # extract_func = partial(self.extract, url)
311
+ # for ix, section in enumerate(merged_sections):
312
+ # extracted_content.append(extract_func(ix, section))
313
+
314
+ with ThreadPoolExecutor(max_workers=4) as executor:
315
+ extract_func = partial(self.extract, url)
316
+ futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)]
317
+
318
+ for future in as_completed(futures):
319
+ try:
320
+ extracted_content.extend(future.result())
321
+ except Exception as e:
322
+ if self.verbose:
323
+ print(f"Error in thread execution: {e}")
324
+ # Add error information to extracted_content
325
+ extracted_content.append({
326
+ "index": 0,
327
+ "error": True,
328
+ "tags": ["error"],
329
+ "content": str(e)
330
+ })
331
+
332
+
333
+ return extracted_content
334
+
335
+
336
+ def show_usage(self) -> None:
337
+ """Print a detailed token usage report showing total and per-request usage."""
338
+ print("\n=== Token Usage Summary ===")
339
+ print(f"{'Type':<15} {'Count':>12}")
340
+ print("-" * 30)
341
+ print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}")
342
+ print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}")
343
+ print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}")
344
+
345
+ print("\n=== Usage History ===")
346
+ print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}")
347
+ print("-" * 48)
348
+ for i, usage in enumerate(self.usages, 1):
349
+ print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} {usage.total_tokens:>12,}")
350
+
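A short sketch of driving LLMExtractionStrategy directly rather than through a crawler; the provider string, API key, and the page_markdown variable are placeholders, not values taken from this diff:

    strategy = LLMExtractionStrategy(
        provider="openai/gpt-4o-mini",        # assumed <provider_name>/<model_name> value
        api_token="YOUR_OPENAI_API",
        instruction="List every product name and price on the page.",
        chunk_token_threshold=2048,
        overlap_rate=0.1,
        verbose=True,
    )
    blocks = strategy.run("https://example.com/shop", [page_markdown])  # page_markdown assumed to exist
    strategy.show_usage()                     # prints the per-request token accounting kept above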
351
+ #######################################################
352
+ # Strategies using clustering for text data extraction #
353
+ #######################################################
354
+
355
+ class CosineStrategy(ExtractionStrategy):
356
+ """
357
+ Extract meaningful blocks or chunks from the given HTML using cosine similarity.
358
+
359
+ How it works:
360
+ 1. Pre-filter documents using embeddings and semantic_filter.
361
+ 2. Perform clustering using cosine similarity.
362
+ 3. Organize texts by their cluster labels, retaining order.
363
+ 4. Filter clusters by word count.
364
+ 5. Extract meaningful blocks or chunks from the filtered clusters.
365
+
366
+ Attributes:
367
+ semantic_filter (str): A keyword filter for document filtering.
368
+ word_count_threshold (int): Minimum number of words per cluster.
369
+ max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
370
+ linkage_method (str): The linkage method for hierarchical clustering.
371
+ top_k (int): Number of top categories to extract.
372
+ model_name (str): The name of the sentence-transformers model.
373
+ sim_threshold (float): The similarity threshold for clustering.
374
+ """
375
+ def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs):
376
+ """
377
+ Initialize the strategy with clustering parameters.
378
+
379
+ Args:
380
+ semantic_filter (str): A keyword filter for document filtering.
381
+ word_count_threshold (int): Minimum number of words per cluster.
382
+ max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
383
+ linkage_method (str): The linkage method for hierarchical clustering.
384
+ top_k (int): Number of top categories to extract.
385
+ """
386
+ super().__init__(**kwargs)
387
+
388
+ import numpy as np
389
+
390
+ self.semantic_filter = semantic_filter
391
+ self.word_count_threshold = word_count_threshold
392
+ self.max_dist = max_dist
393
+ self.linkage_method = linkage_method
394
+ self.top_k = top_k
395
+ self.sim_threshold = sim_threshold
396
+ self.timer = time.time()
397
+ self.verbose = kwargs.get("verbose", False)
398
+
399
+ self.buffer_embeddings = np.array([])
400
+ self.get_embedding_method = "direct"
401
+
402
+ self.device = get_device()
403
+ # import torch
404
+ # self.device = torch.device('cpu')
405
+
406
+ self.default_batch_size = calculate_batch_size(self.device)
407
+
408
+ if self.verbose:
409
+ print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
410
+
411
+ # if False and self.device.type == "cpu":
412
+ # self.model = load_onnx_all_MiniLM_l6_v2()
413
+ # self.tokenizer = self.model.tokenizer
414
+ # self.get_embedding_method = "direct"
415
+ # else:
416
+
417
+ self.tokenizer, self.model = load_HF_embedding_model(model_name)
418
+ self.model.to(self.device)
419
+ self.model.eval()
420
+
421
+ self.get_embedding_method = "batch"
422
+
423
+ self.buffer_embeddings = np.array([])
424
+
425
+ # if model_name == "bert-base-uncased":
426
+ # self.tokenizer, self.model = load_bert_base_uncased()
427
+ # self.model.eval() # Ensure the model is in evaluation mode
428
+ # self.get_embedding_method = "batch"
429
+ # elif model_name == "BAAI/bge-small-en-v1.5":
430
+ # self.tokenizer, self.model = load_bge_small_en_v1_5()
431
+ # self.model.eval() # Ensure the model is in evaluation mode
432
+ # self.get_embedding_method = "batch"
433
+ # elif model_name == "sentence-transformers/all-MiniLM-L6-v2":
434
+ # self.model = load_onnx_all_MiniLM_l6_v2()
435
+ # self.tokenizer = self.model.tokenizer
436
+ # self.get_embedding_method = "direct"
437
+
438
+
439
+ if self.verbose:
440
+ print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.")
441
+
442
+ self.nlp, _ = load_text_multilabel_classifier()
443
+ # self.default_batch_size = 16 if self.device.type == 'cpu' else 64
444
+
445
+ if self.verbose:
446
+ print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
447
+
448
+ def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, at_least_k: int = 20) -> List[str]:
449
+ """
450
+ Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
451
+
452
+ Args:
453
+ documents (List[str]): A list of document texts.
454
+ semantic_filter (str): A keyword filter for document filtering.
455
+ at_least_k (int): The minimum number of documents to return.
456
+
457
+ Returns:
458
+ List[str]: A list of filtered and sorted document texts.
459
+ """
460
+
461
+ if not semantic_filter:
462
+ return documents
463
+
464
+ if len(documents) < at_least_k:
465
+ at_least_k = len(documents) // 2
466
+
467
+ from sklearn.metrics.pairwise import cosine_similarity
468
+
469
+ # Compute embedding for the keyword filter
470
+ query_embedding = self.get_embeddings([semantic_filter])[0]
471
+
472
+ # Compute embeddings for the documents
473
+ document_embeddings = self.get_embeddings(documents)
474
+
475
+ # Calculate cosine similarity between the query embedding and document embeddings
476
+ similarities = cosine_similarity([query_embedding], document_embeddings).flatten()
477
+
478
+ # Filter documents based on the similarity threshold
479
+ filtered_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim >= self.sim_threshold]
480
+
481
+ # If the number of filtered documents is less than at_least_k, sort remaining documents by similarity
482
+ if len(filtered_docs) < at_least_k:
483
+ remaining_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim < self.sim_threshold]
484
+ remaining_docs.sort(key=lambda x: x[1], reverse=True)
485
+ filtered_docs.extend(remaining_docs[:at_least_k - len(filtered_docs)])
486
+
487
+ # Extract the document texts from the tuples
488
+ filtered_docs = [doc for doc, _ in filtered_docs]
489
+
490
+ return filtered_docs[:at_least_k]
491
+
492
+ def get_embeddings(self, sentences: List[str], batch_size=None, bypass_buffer=False):
493
+ """
494
+ Get BERT embeddings for a list of sentences.
495
+
496
+ Args:
497
+ sentences (List[str]): A list of text chunks (sentences).
498
+
499
+ Returns:
500
+ NumPy array of embeddings.
501
+ """
502
+ # if self.buffer_embeddings.any() and not bypass_buffer:
503
+ # return self.buffer_embeddings
504
+
505
+ if self.device.type in [ "cpu", "gpu", "cuda", "mps"]:
506
+ import torch
507
+ # Tokenize sentences and convert to tensor
508
+ if batch_size is None:
509
+ batch_size = self.default_batch_size
510
+
511
+ all_embeddings = []
512
+ for i in range(0, len(sentences), batch_size):
513
+ batch_sentences = sentences[i:i + batch_size]
514
+ encoded_input = self.tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')
515
+ encoded_input = {key: tensor.to(self.device) for key, tensor in encoded_input.items()}
516
+
517
+ # Ensure no gradients are calculated
518
+ with torch.no_grad():
519
+ model_output = self.model(**encoded_input)
520
+
521
+ # Get embeddings from the last hidden state (mean pooling)
522
+ embeddings = model_output.last_hidden_state.mean(dim=1).cpu().numpy()
523
+ all_embeddings.append(embeddings)
524
+
525
+ self.buffer_embeddings = np.vstack(all_embeddings)
526
+ elif self.device.type == "cpu":
527
+ # self.buffer_embeddings = self.model(sentences)
528
+ if batch_size is None:
529
+ batch_size = self.default_batch_size
530
+
531
+ all_embeddings = []
532
+ for i in range(0, len(sentences), batch_size):
533
+ batch_sentences = sentences[i:i + batch_size]
534
+ embeddings = self.model(batch_sentences)
535
+ all_embeddings.append(embeddings)
536
+
537
+ self.buffer_embeddings = np.vstack(all_embeddings)
538
+ return self.buffer_embeddings
539
+
540
+ def hierarchical_clustering(self, sentences: List[str], embeddings = None):
541
+ """
542
+ Perform hierarchical clustering on sentences and return cluster labels.
543
+
544
+ Args:
545
+ sentences (List[str]): A list of text chunks (sentences).
546
+
547
+ Returns:
548
+ NumPy array of cluster labels.
549
+ """
550
+ # Get embeddings
551
+ from scipy.cluster.hierarchy import linkage, fcluster
552
+ from scipy.spatial.distance import pdist
553
+ self.timer = time.time()
554
+ embeddings = self.get_embeddings(sentences, bypass_buffer=True)
555
+ # print(f"[LOG] 🚀 Embeddings computed in {time.time() - self.timer:.2f} seconds")
556
+ # Compute pairwise cosine distances
557
+ distance_matrix = pdist(embeddings, 'cosine')
558
+ # Perform agglomerative clustering respecting order
559
+ linked = linkage(distance_matrix, method=self.linkage_method)
560
+ # Form flat clusters
561
+ labels = fcluster(linked, self.max_dist, criterion='distance')
562
+ return labels
563
+
564
+ def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]) -> Dict[int, List[str]]:
565
+ """
566
+ Filter clusters to remove those with a word count below the threshold.
567
+
568
+ Args:
569
+ clusters (Dict[int, List[str]]): Dictionary of clusters.
570
+
571
+ Returns:
572
+ Dict[int, List[str]]: Filtered dictionary of clusters.
573
+ """
574
+ filtered_clusters = {}
575
+ for cluster_id, texts in clusters.items():
576
+ # Concatenate texts for analysis
577
+ full_text = " ".join(texts)
578
+ # Count words
579
+ word_count = len(full_text.split())
580
+
581
+ # Keep clusters with word count above the threshold
582
+ if word_count >= self.word_count_threshold:
583
+ filtered_clusters[cluster_id] = texts
584
+
585
+ return filtered_clusters
586
+
587
+ def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
588
+ """
589
+ Extract clusters from HTML content using hierarchical clustering.
590
+
591
+ Args:
592
+ url (str): The URL of the webpage.
593
+ html (str): The HTML content of the webpage.
594
+
595
+ Returns:
596
+ List[Dict[str, Any]]: A list of processed JSON blocks.
597
+ """
598
+ # Assume `html` is a list of text chunks for this strategy
599
+ t = time.time()
600
+ text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed
601
+
602
+ # Pre-filter documents using embeddings and semantic_filter
603
+ text_chunks = self.filter_documents_embeddings(text_chunks, self.semantic_filter)
604
+
605
+ if not text_chunks:
606
+ return []
607
+
608
+ # Perform clustering
609
+ labels = self.hierarchical_clustering(text_chunks)
610
+ # print(f"[LOG] 🚀 Clustering done in {time.time() - t:.2f} seconds")
611
+
612
+ # Organize texts by their cluster labels, retaining order
613
+ t = time.time()
614
+ clusters = {}
615
+ for index, label in enumerate(labels):
616
+ clusters.setdefault(label, []).append(text_chunks[index])
617
+
618
+ # Filter clusters by word count
619
+ filtered_clusters = self.filter_clusters_by_word_count(clusters)
620
+
621
+ # Convert filtered clusters to a sorted list of dictionaries
622
+ cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
623
+
624
+ if self.verbose:
625
+ print(f"[LOG] 🚀 Assign tags using {self.device}")
626
+
627
+ if self.device.type in ["gpu", "cuda", "mps", "cpu"]:
628
+ labels = self.nlp([cluster['content'] for cluster in cluster_list])
629
+
630
+ for cluster, label in zip(cluster_list, labels):
631
+ cluster['tags'] = label
632
+ # elif self.device.type == "cpu":
633
+ # # Process the text with the loaded model
634
+ # texts = [cluster['content'] for cluster in cluster_list]
635
+ # # Batch process texts
636
+ # docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"])
637
+
638
+ # for doc, cluster in zip(docs, cluster_list):
639
+ # tok_k = self.top_k
640
+ # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
641
+ # cluster['tags'] = [cat for cat, _ in top_categories]
642
+
643
+ # for cluster in cluster_list:
644
+ # doc = self.nlp(cluster['content'])
645
+ # tok_k = self.top_k
646
+ # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
647
+ # cluster['tags'] = [cat for cat, _ in top_categories]
648
+
649
+ if self.verbose:
650
+ print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
651
+
652
+ return cluster_list
653
+
654
+ def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
655
+ """
656
+ Process sections using hierarchical clustering.
657
+
658
+ Args:
659
+ url (str): The URL of the webpage.
660
+ sections (List[str]): List of sections (strings) to process.
661
+
662
+ Returns:
663
+ """
664
+ # This strategy processes all sections together
665
+
666
+ return self.extract(url, self.DEL.join(sections), **kwargs)
667
+
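A hedged sketch of CosineStrategy on a few hand-written sections; the constructor loads the sentence-transformers model and the multilabel classifier on first use, so this is illustrative rather than a quick test:

    strategy = CosineStrategy(
        semantic_filter="pricing plans",
        word_count_threshold=5,
        sim_threshold=0.3,
        verbose=True,
    )
    sections = [
        "The Basic plan costs $9 per month and includes 10 projects.",
        "Our team was founded in 2019 in Berlin.",
        "The Pro plan costs $29 per month with unlimited projects.",
    ]
    clusters = strategy.run("https://example.com/pricing", sections)
    for cluster in clusters:
        print(cluster["index"], cluster["tags"], cluster["content"][:60])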
668
+ #######################################################
669
+ # New extraction strategies for JSON-based extraction #
670
+ #######################################################
671
+
672
+ class JsonElementExtractionStrategy(ExtractionStrategy):
673
+ """
674
+ Abstract base class for extracting structured JSON from HTML content.
675
+
676
+ How it works:
677
+ 1. Parses HTML content using the `_parse_html` method.
678
+ 2. Uses a schema to define base selectors, fields, and transformations.
679
+ 3. Extracts data hierarchically, supporting nested fields and lists.
680
+ 4. Handles computed fields with expressions or functions.
681
+
682
+ Attributes:
683
+ DEL (str): Delimiter used to combine HTML sections. Defaults to '\n'.
684
+ schema (Dict[str, Any]): The schema defining the extraction rules.
685
+ verbose (bool): Enables verbose logging for debugging purposes.
686
+
687
+ Methods:
688
+ extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content.
689
+ _extract_item(element, fields): Extracts fields from a single element.
690
+ _extract_single_field(element, field): Extracts a single field based on its type.
691
+ _apply_transform(value, transform): Applies a transformation to a value.
692
+ _compute_field(item, field): Computes a field value using an expression or function.
693
+ run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy.
694
+
695
+ Abstract Methods:
696
+ _parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml).
697
+ _get_base_elements(parsed_html, selector): Retrieves base elements using a selector.
698
+ _get_elements(element, selector): Retrieves child elements using a selector.
699
+ _get_element_text(element): Extracts text content from an element.
700
+ _get_element_html(element): Extracts raw HTML from an element.
701
+ _get_element_attribute(element, attribute): Extracts an attribute's value from an element.
702
+ """
703
+
704
+
705
+ DEL = '\n'
706
+
707
+ def __init__(self, schema: Dict[str, Any], **kwargs):
708
+ """
709
+ Initialize the JSON element extraction strategy with a schema.
710
+
711
+ Args:
712
+ schema (Dict[str, Any]): The schema defining the extraction rules.
713
+ """
714
+ super().__init__(**kwargs)
715
+ self.schema = schema
716
+ self.verbose = kwargs.get('verbose', False)
717
+
718
+ def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
719
+ """
720
+ Extract structured data from HTML content.
721
+
722
+ How it works:
723
+ 1. Parses the HTML content using the `_parse_html` method.
724
+ 2. Identifies base elements using the schema's base selector.
725
+ 3. Extracts fields from each base element using `_extract_item`.
726
+
727
+ Args:
728
+ url (str): The URL of the page being processed.
729
+ html_content (str): The raw HTML content to parse and extract.
730
+ *q: Additional positional arguments.
731
+ **kwargs: Additional keyword arguments for custom extraction.
732
+
733
+ Returns:
734
+ List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary.
735
+ """
736
+
737
+ parsed_html = self._parse_html(html_content)
738
+ base_elements = self._get_base_elements(parsed_html, self.schema['baseSelector'])
739
+
740
+ results = []
741
+ for element in base_elements:
742
+ # Extract base element attributes
743
+ item = {}
744
+ if 'baseFields' in self.schema:
745
+ for field in self.schema['baseFields']:
746
+ value = self._extract_single_field(element, field)
747
+ if value is not None:
748
+ item[field['name']] = value
749
+
750
+ # Extract child fields
751
+ field_data = self._extract_item(element, self.schema['fields'])
752
+ item.update(field_data)
753
+
754
+ if item:
755
+ results.append(item)
756
+
757
+ return results
758
+
759
+ @abstractmethod
760
+ def _parse_html(self, html_content: str):
761
+ """Parse HTML content into appropriate format"""
762
+ pass
763
+
764
+ @abstractmethod
765
+ def _get_base_elements(self, parsed_html, selector: str):
766
+ """Get all base elements using the selector"""
767
+ pass
768
+
769
+ @abstractmethod
770
+ def _get_elements(self, element, selector: str):
771
+ """Get child elements using the selector"""
772
+ pass
773
+
774
+ def _extract_field(self, element, field):
775
+ try:
776
+ if field['type'] == 'nested':
777
+ nested_elements = self._get_elements(element, field['selector'])
778
+ nested_element = nested_elements[0] if nested_elements else None
779
+ return self._extract_item(nested_element, field['fields']) if nested_element else {}
780
+
781
+ if field['type'] == 'list':
782
+ elements = self._get_elements(element, field['selector'])
783
+ return [self._extract_list_item(el, field['fields']) for el in elements]
784
+
785
+ if field['type'] == 'nested_list':
786
+ elements = self._get_elements(element, field['selector'])
787
+ return [self._extract_item(el, field['fields']) for el in elements]
788
+
789
+ return self._extract_single_field(element, field)
790
+ except Exception as e:
791
+ if self.verbose:
792
+ print(f"Error extracting field {field['name']}: {str(e)}")
793
+ return field.get('default')
794
+
795
+ def _extract_single_field(self, element, field):
796
+ """
797
+ Extract a single field based on its type.
798
+
799
+ How it works:
800
+ 1. Selects the target element using the field's selector.
801
+ 2. Extracts the field value based on its type (e.g., text, attribute, regex).
802
+ 3. Applies transformations if defined in the schema.
803
+
804
+ Args:
805
+ element: The base element to extract the field from.
806
+ field (Dict[str, Any]): The field definition in the schema.
807
+
808
+ Returns:
809
+ Any: The extracted field value.
810
+ """
811
+
812
+ if 'selector' in field:
813
+ selected = self._get_elements(element, field['selector'])
814
+ if not selected:
815
+ return field.get('default')
816
+ selected = selected[0]
817
+ else:
818
+ selected = element
819
+
820
+ value = None
821
+ if field['type'] == 'text':
822
+ value = self._get_element_text(selected)
823
+ elif field['type'] == 'attribute':
824
+ value = self._get_element_attribute(selected, field['attribute'])
825
+ elif field['type'] == 'html':
826
+ value = self._get_element_html(selected)
827
+ elif field['type'] == 'regex':
828
+ text = self._get_element_text(selected)
829
+ match = re.search(field['pattern'], text)
830
+ value = match.group(1) if match else None
831
+
832
+ if 'transform' in field:
833
+ value = self._apply_transform(value, field['transform'])
834
+
835
+ return value if value is not None else field.get('default')
836
+
837
+ def _extract_list_item(self, element, fields):
838
+ item = {}
839
+ for field in fields:
840
+ value = self._extract_single_field(element, field)
841
+ if value is not None:
842
+ item[field['name']] = value
843
+ return item
844
+
845
+ def _extract_item(self, element, fields):
846
+ """
847
+ Extracts fields from a given element.
848
+
849
+ How it works:
850
+ 1. Iterates through the fields defined in the schema.
851
+ 2. Handles computed, single, and nested field types.
852
+ 3. Updates the item dictionary with extracted field values.
853
+
854
+ Args:
855
+ element: The base element to extract fields from.
856
+ fields (List[Dict[str, Any]]): The list of fields to extract.
857
+
858
+ Returns:
859
+ Dict[str, Any]: A dictionary representing the extracted item.
860
+ """
861
+
862
+ item = {}
863
+ for field in fields:
864
+ if field['type'] == 'computed':
865
+ value = self._compute_field(item, field)
866
+ else:
867
+ value = self._extract_field(element, field)
868
+ if value is not None:
869
+ item[field['name']] = value
870
+ return item
871
+
872
+ def _apply_transform(self, value, transform):
873
+ """
874
+ Apply a transformation to a value.
875
+
876
+ How it works:
877
+ 1. Checks the transformation type (e.g., `lowercase`, `strip`).
878
+ 2. Applies the transformation to the value.
879
+ 3. Returns the transformed value.
880
+
881
+ Args:
882
+ value (str): The value to transform.
883
+ transform (str): The type of transformation to apply.
884
+
885
+ Returns:
886
+ str: The transformed value.
887
+ """
888
+
889
+ if transform == 'lowercase':
890
+ return value.lower()
891
+ elif transform == 'uppercase':
892
+ return value.upper()
893
+ elif transform == 'strip':
894
+ return value.strip()
895
+ return value
896
+
897
+ def _compute_field(self, item, field):
898
+ try:
899
+ if 'expression' in field:
900
+ return eval(field['expression'], {}, item)
901
+ elif 'function' in field:
902
+ return field['function'](item)
903
+ except Exception as e:
904
+ if self.verbose:
905
+ print(f"Error computing field {field['name']}: {str(e)}")
906
+ return field.get('default')
907
+
908
+ def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
909
+ """
910
+ Run the extraction strategy on a combined HTML content.
911
+
912
+ How it works:
913
+ 1. Combines multiple HTML sections using the `DEL` delimiter.
914
+ 2. Calls the `extract` method with the combined HTML.
915
+
916
+ Args:
917
+ url (str): The URL of the page being processed.
918
+ sections (List[str]): A list of HTML sections.
919
+ *q: Additional positional arguments.
920
+ **kwargs: Additional keyword arguments for custom extraction.
921
+
922
+ Returns:
923
+ List[Dict[str, Any]]: A list of extracted items.
924
+ """
925
+
926
+ combined_html = self.DEL.join(sections)
927
+ return self.extract(url, combined_html, **kwargs)
928
+
929
+ @abstractmethod
930
+ def _get_element_text(self, element) -> str:
931
+ """Get text content from element"""
932
+ pass
933
+
934
+ @abstractmethod
935
+ def _get_element_html(self, element) -> str:
936
+ """Get HTML content from element"""
937
+ pass
938
+
939
+ @abstractmethod
940
+ def _get_element_attribute(self, element, attribute: str):
941
+ """Get attribute value from element"""
942
+ pass
943
+
944
+ class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
945
+ """
946
+ Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.
947
+
948
+ How it works:
949
+ 1. Parses HTML content with BeautifulSoup.
950
+ 2. Selects elements using CSS selectors defined in the schema.
951
+ 3. Extracts field data and applies transformations as defined.
952
+
953
+ Attributes:
954
+ schema (Dict[str, Any]): The schema defining the extraction rules.
955
+ verbose (bool): Enables verbose logging for debugging purposes.
956
+
957
+ Methods:
958
+ _parse_html(html_content): Parses HTML content into a BeautifulSoup object.
959
+ _get_base_elements(parsed_html, selector): Selects base elements using a CSS selector.
960
+ _get_elements(element, selector): Selects child elements using a CSS selector.
961
+ _get_element_text(element): Extracts text content from a BeautifulSoup element.
962
+ _get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element.
963
+ _get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element.
964
+ """
965
+
966
+ def __init__(self, schema: Dict[str, Any], **kwargs):
967
+ kwargs['input_format'] = 'html' # Force HTML input
968
+ super().__init__(schema, **kwargs)
969
+
970
+ def _parse_html(self, html_content: str):
971
+ return BeautifulSoup(html_content, 'html.parser')
972
+
973
+ def _get_base_elements(self, parsed_html, selector: str):
974
+ return parsed_html.select(selector)
975
+
976
+ def _get_elements(self, element, selector: str):
977
+ selected = element.select_one(selector)
978
+ return [selected] if selected else []
979
+
980
+ def _get_element_text(self, element) -> str:
981
+ return element.get_text(strip=True)
982
+
983
+ def _get_element_html(self, element) -> str:
984
+ return str(element)
985
+
986
+ def _get_element_attribute(self, element, attribute: str):
987
+ return element.get(attribute)
988
+
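A self-contained sketch of the schema contract consumed above; the markup and selectors are invented to exercise the text, attribute, regex, and transform field types:

    schema = {
        "baseSelector": "div.product",
        "fields": [
            {"name": "title", "selector": "h2", "type": "text", "transform": "strip"},
            {"name": "url", "selector": "a", "type": "attribute", "attribute": "href"},
            {"name": "price", "selector": ".price", "type": "regex", "pattern": r"\$([\d.]+)"},
        ],
    }
    html_doc = '<div class="product"><h2> Widget </h2><a href="/w">more</a><span class="price">$9.99</span></div>'
    items = JsonCssExtractionStrategy(schema, verbose=True).run("https://example.com", [html_doc])
    # -> [{'title': 'Widget', 'url': '/w', 'price': '9.99'}]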
989
+ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
990
+ """
991
+ Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors.
992
+
993
+ How it works:
994
+ 1. Parses HTML content into an lxml tree.
995
+ 2. Selects elements using XPath expressions.
996
+ 3. Converts CSS selectors to XPath when needed.
997
+
998
+ Attributes:
999
+ schema (Dict[str, Any]): The schema defining the extraction rules.
1000
+ verbose (bool): Enables verbose logging for debugging purposes.
1001
+
1002
+ Methods:
1003
+ _parse_html(html_content): Parses HTML content into an lxml tree.
1004
+ _get_base_elements(parsed_html, selector): Selects base elements using an XPath selector.
1005
+ _css_to_xpath(css_selector): Converts a CSS selector to an XPath expression.
1006
+ _get_elements(element, selector): Selects child elements using an XPath selector.
1007
+ _get_element_text(element): Extracts text content from an lxml element.
1008
+ _get_element_html(element): Extracts the raw HTML content of an lxml element.
1009
+ _get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element.
1010
+ """
1011
+
1012
+ def __init__(self, schema: Dict[str, Any], **kwargs):
1013
+ kwargs['input_format'] = 'html' # Force HTML input
1014
+ super().__init__(schema, **kwargs)
1015
+
1016
+ def _parse_html(self, html_content: str):
1017
+ return html.fromstring(html_content)
1018
+
1019
+ def _get_base_elements(self, parsed_html, selector: str):
1020
+ return parsed_html.xpath(selector)
1021
+
1022
+ def _css_to_xpath(self, css_selector: str) -> str:
1023
+ """Convert CSS selector to XPath if needed"""
1024
+ if '/' in css_selector: # Already an XPath
1025
+ return css_selector
1026
+ return self._basic_css_to_xpath(css_selector)
1027
+
1028
+ def _basic_css_to_xpath(self, css_selector: str) -> str:
1029
+ """Basic CSS to XPath conversion for common cases"""
1030
+ if ' > ' in css_selector:
1031
+ parts = css_selector.split(' > ')
1032
+ return '//' + '/'.join(parts)
1033
+ if ' ' in css_selector:
1034
+ parts = css_selector.split(' ')
1035
+ return '//' + '//'.join(parts)
1036
+ return '//' + css_selector
1037
+
1038
+ def _get_elements(self, element, selector: str):
1039
+ xpath = self._css_to_xpath(selector)
1040
+ if not xpath.startswith('.'):
1041
+ xpath = '.' + xpath
1042
+ return element.xpath(xpath)
1043
+
1044
+ def _get_element_text(self, element) -> str:
1045
+ return ''.join(element.xpath('.//text()')).strip()
1046
+
1047
+ def _get_element_html(self, element) -> str:
1048
+ return etree.tostring(element, encoding='unicode')
1049
+
1050
+ def _get_element_attribute(self, element, attribute: str):
1051
+ return element.get(attribute)
1052
+
1053
+
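The same idea with XPath selectors; note that selectors containing '/' are passed through untouched, while the basic CSS fallback above only handles plain tag names. The markup here is again invented:

    xpath_schema = {
        "baseSelector": "//li[@class='row']",
        "fields": [
            {"name": "label", "selector": ".//span", "type": "text"},
            {"name": "link", "selector": ".//a", "type": "attribute", "attribute": "href"},
        ],
    }
    rows = JsonXPathExtractionStrategy(xpath_schema).run(
        "https://example.com",
        ['<ul><li class="row"><span>Docs</span><a href="/docs">open</a></li></ul>'],
    )
    # -> [{'label': 'Docs', 'link': '/docs'}]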
1054
+ #######################################################
1055
+ # Strategies based on the extraction of specific types#
1056
+ #######################################################
1057
+
1058
+ class TopicExtractionStrategy(ExtractionStrategy):
1059
+ def __init__(self, num_keywords: int = 3, **kwargs):
1060
+ """
1061
+ Initialize the topic extraction strategy with parameters for topic segmentation.
1062
+
1063
+ :param num_keywords: Number of keywords to represent each topic segment.
1064
+ """
1065
+ import nltk
1066
+ super().__init__(**kwargs)
1067
+ self.num_keywords = num_keywords
1068
+ self.tokenizer = nltk.TextTilingTokenizer()
1069
+
1070
+ def extract_keywords(self, text: str) -> List[str]:
1071
+ """
1072
+ Extract keywords from a given text segment using simple frequency analysis.
1073
+
1074
+ :param text: The text segment from which to extract keywords.
1075
+ :return: A list of keyword strings.
1076
+ """
1077
+ import nltk
1078
+ # Tokenize the text and compute word frequency
1079
+ words = nltk.word_tokenize(text)
1080
+ freq_dist = nltk.FreqDist(words)
1081
+ # Get the most common words as keywords
1082
+ keywords = [word for (word, _) in freq_dist.most_common(self.num_keywords)]
1083
+ return keywords
1084
+
1085
+ def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
1086
+ """
1087
+ Extract topics from HTML content using TextTiling for segmentation and keyword extraction.
1088
+
1089
+ :param url: The URL of the webpage.
1090
+ :param html: The HTML content of the webpage.
1091
+ :param provider: The provider to be used for extraction (not used here).
1092
+ :param api_token: Optional API token for the provider (not used here).
1093
+ :return: A list of dictionaries representing the topics.
1094
+ """
1095
+ # Segment the text on the DEL delimiter (the TextTiling tokenizer initialized above is not applied here)
1096
+ segmented_topics = html.split(self.DEL) # Split by lines or paragraphs as needed
1097
+
1098
+ # Prepare the output as a list of dictionaries
1099
+ topic_list = []
1100
+ for i, segment in enumerate(segmented_topics):
1101
+ # Extract keywords for each segment
1102
+ keywords = self.extract_keywords(segment)
1103
+ topic_list.append({
1104
+ "index": i,
1105
+ "content": segment,
1106
+ "keywords": keywords
1107
+ })
1108
+
1109
+ return topic_list
1110
+
1111
+ def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
1112
+ """
1113
+ Process sections using topic segmentation and keyword extraction.
1114
+
1115
+ :param url: The URL of the webpage.
1116
+ :param sections: List of sections (strings) to process.
1117
+ :param provider: The provider to be used for extraction (not used here).
1118
+ :param api_token: Optional API token for the provider (not used here).
1119
+ :return: A list of processed JSON blocks.
1120
+ """
1121
+ # Concatenate sections into a single text for coherent topic segmentation
1122
+
1123
+
1124
+ return self.extract(url, self.DEL.join(sections), **kwargs)
1125
+
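A sketch of the topic strategy; it needs nltk with the punkt and stopwords data installed, and the input sections are invented:

    topics = TopicExtractionStrategy(num_keywords=3).run(
        "https://example.com/energy",
        [
            "Solar panels convert sunlight into electricity using photovoltaic cells.",
            "Wind turbines capture kinetic energy from moving air to drive generators.",
        ],
    )
    for topic in topics:
        print(topic["index"], topic["keywords"])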
1126
+ class ContentSummarizationStrategy(ExtractionStrategy):
1127
+ def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6", **kwargs):
1128
+ """
1129
+ Initialize the content summarization strategy with a specific model.
1130
+
1131
+ :param model_name: The model to use for summarization.
1132
+ """
1133
+ super().__init__(**kwargs)
1134
+ from transformers import pipeline
1135
+ self.summarizer = pipeline("summarization", model=model_name)
1136
+
1137
+ def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> Dict[str, Any]:
1138
+ """
1139
+ Summarize a single section of text.
1140
+
1141
+ :param url: The URL of the webpage.
1142
+ :param text: A section of text to summarize.
1143
+ :param provider: The provider to be used for extraction (not used here).
1144
+ :param api_token: Optional API token for the provider (not used here).
1145
+ :return: A dictionary with the summary.
1146
+ """
1147
+ try:
1148
+ summary = self.summarizer(text, max_length=130, min_length=30, do_sample=False)
1149
+ return {"summary": summary[0]['summary_text']}
1150
+ except Exception as e:
1151
+ print(f"Error summarizing text: {e}")
1152
+ return {"summary": text} # Fallback to original text if summarization fails
1153
+
1154
+ def run(self, url: str, sections: List[str], provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]:
1155
+ """
1156
+ Process each section in parallel to produce summaries.
1157
+
1158
+ :param url: The URL of the webpage.
1159
+ :param sections: List of sections (strings) to summarize.
1160
+ :param provider: The provider to be used for extraction (not used here).
1161
+ :param api_token: Optional API token for the provider (not used here).
1162
+ :return: A list of dictionaries with summaries for each section.
1163
+ """
1164
+ # Use a ThreadPoolExecutor to summarize in parallel
1165
+ summaries = []
1166
+ with ThreadPoolExecutor() as executor:
1167
+ # Create a future for each section's summarization
1168
+ future_to_section = {executor.submit(self.extract, url, section, provider, api_token): i for i, section in enumerate(sections)}
1169
+ for future in as_completed(future_to_section):
1170
+ section_index = future_to_section[future]
1171
+ try:
1172
+ summary_result = future.result()
1173
+ summaries.append((section_index, summary_result))
1174
+ except Exception as e:
1175
+ print(f"Error processing section {section_index}: {e}")
1176
+ summaries.append((section_index, {"summary": sections[section_index]})) # Fallback to original text
1177
+
1178
+ # Sort summaries by the original section index to maintain order
1179
+ summaries.sort(key=lambda x: x[0])
1180
+ return [summary for _, summary in summaries]
1181
+
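A sketch only: constructing the strategy downloads the distilbart summarization model via transformers, and article_text stands in for a long document string obtained elsewhere:

    summaries = ContentSummarizationStrategy().run("https://example.com/article", [article_text])
    print(summaries[0]["summary"])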
1182
+ #######################################################
1183
+ # Deprecated strategies
1184
+ #######################################################
1185
+
1186
+ class _JsonCssExtractionStrategy(ExtractionStrategy):
1187
+ def __init__(self, schema: Dict[str, Any], **kwargs):
1188
+ kwargs['input_format'] = 'html' # Force HTML input
1189
+ super().__init__(**kwargs)
1190
+ self.schema = schema
1191
+
1192
+ def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
1193
+ soup = BeautifulSoup(html, 'html.parser')
1194
+ base_elements = soup.select(self.schema['baseSelector'])
1195
+
1196
+ results = []
1197
+ for element in base_elements:
1198
+ # Extract base element attributes first
1199
+ item = {}
1200
+ if 'baseFields' in self.schema:
1201
+ for field in self.schema['baseFields']:
1202
+ value = self._extract_single_field(element, field)
1203
+ if value is not None:
1204
+ item[field['name']] = value
1205
+
1206
+ # Then extract child fields
1207
+ field_data = self._extract_item(element, self.schema['fields'])
1208
+ item.update(field_data)
1209
+
1210
+ results.append(item)
1211
+
1212
+ return results
1213
+
1214
+ def _extract_field(self, element, field):
1215
+ try:
1216
+ if field['type'] == 'nested':
1217
+ nested_element = element.select_one(field['selector'])
1218
+ return self._extract_item(nested_element, field['fields']) if nested_element else {}
1219
+
1220
+ if field['type'] == 'list':
1221
+ elements = element.select(field['selector'])
1222
+ return [self._extract_list_item(el, field['fields']) for el in elements]
1223
+
1224
+ if field['type'] == 'nested_list':
1225
+ elements = element.select(field['selector'])
1226
+ return [self._extract_item(el, field['fields']) for el in elements]
1227
+
1228
+ return self._extract_single_field(element, field)
1229
+ except Exception as e:
1230
+ if self.verbose:
1231
+ print(f"Error extracting field {field['name']}: {str(e)}")
1232
+ return field.get('default')
1233
+
1234
+ def _extract_list_item(self, element, fields):
1235
+ item = {}
1236
+ for field in fields:
1237
+ value = self._extract_single_field(element, field)
1238
+ if value is not None:
1239
+ item[field['name']] = value
1240
+ return item
1241
+
1242
+ def _extract_single_field(self, element, field):
1243
+ if 'selector' in field:
1244
+ selected = element.select_one(field['selector'])
1245
+ if not selected:
1246
+ return field.get('default')
1247
+ else:
1248
+ selected = element
1249
+
1250
+ value = None
1251
+ if field['type'] == 'text':
1252
+ value = selected.get_text(strip=True)
1253
+ elif field['type'] == 'attribute':
1254
+ value = selected.get(field['attribute'])
1255
+ elif field['type'] == 'html':
1256
+ value = str(selected)
1257
+ elif field['type'] == 'regex':
1258
+ text = selected.get_text(strip=True)
1259
+ match = re.search(field['pattern'], text)
1260
+ value = match.group(1) if match else None
1261
+
1262
+ if 'transform' in field:
1263
+ value = self._apply_transform(value, field['transform'])
1264
+
1265
+ return value if value is not None else field.get('default')
1266
+
1267
+ def _extract_item(self, element, fields):
1268
+ item = {}
1269
+ for field in fields:
1270
+ if field['type'] == 'computed':
1271
+ value = self._compute_field(item, field)
1272
+ else:
1273
+ value = self._extract_field(element, field)
1274
+ if value is not None:
1275
+ item[field['name']] = value
1276
+ return item
1277
+
1278
+ def _apply_transform(self, value, transform):
1279
+ if transform == 'lowercase':
1280
+ return value.lower()
1281
+ elif transform == 'uppercase':
1282
+ return value.upper()
1283
+ elif transform == 'strip':
1284
+ return value.strip()
1285
+ return value
1286
+
1287
+ def _compute_field(self, item, field):
1288
+ try:
1289
+ if 'expression' in field:
1290
+ return eval(field['expression'], {}, item)
1291
+ elif 'function' in field:
1292
+ return field['function'](item)
1293
+ except Exception as e:
1294
+ if self.verbose:
1295
+ print(f"Error computing field {field['name']}: {str(e)}")
1296
+ return field.get('default')
1297
+
1298
+ def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
1299
+ combined_html = self.DEL.join(sections)
1300
+ return self.extract(url, combined_html, **kwargs)
1301
+ class _JsonXPathExtractionStrategy(ExtractionStrategy):
1302
+ def __init__(self, schema: Dict[str, Any], **kwargs):
1303
+ kwargs['input_format'] = 'html' # Force HTML input
1304
+ super().__init__(**kwargs)
1305
+ self.schema = schema
1306
+
1307
+ def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
1308
+ tree = html.fromstring(html_content)
1309
+ base_xpath = self.schema['baseSelector']
1310
+ base_elements = tree.xpath(base_xpath)
1311
+
1312
+ results = []
1313
+ for element in base_elements:
1314
+ # Extract base element attributes first
1315
+ item = {}
1316
+ if 'baseFields' in self.schema:
1317
+ for field in self.schema['baseFields']:
1318
+ value = self._extract_single_field(element, field)
1319
+ if value is not None:
1320
+ item[field['name']] = value
1321
+
1322
+ # Then extract child fields
1323
+ field_data = self._extract_item(element, self.schema['fields'])
1324
+ item.update(field_data)
1325
+
1326
+ results.append(item)
1327
+
1328
+ return results
1329
+
1330
+ def _css_to_xpath(self, css_selector: str) -> str:
1331
+ """Convert CSS selector to XPath if needed"""
1332
+ if '/' in css_selector: # Already an XPath
1333
+ return css_selector
1334
+ else:
1335
+ # Fallback to basic conversion for common cases
1336
+ return self._basic_css_to_xpath(css_selector)
1337
+
1338
+ def _basic_css_to_xpath(self, css_selector: str) -> str:
1339
+ """Basic CSS to XPath conversion for common cases"""
1340
+ # Handle basic cases
1341
+ if ' > ' in css_selector:
1342
+ parts = css_selector.split(' > ')
1343
+ return '//' + '/'.join(parts)
1344
+ if ' ' in css_selector:
1345
+ parts = css_selector.split(' ')
1346
+ return '//' + '//'.join(parts)
1347
+ return '//' + css_selector
1348
+
1349
+ def _extract_field(self, element, field):
1350
+ try:
1351
+ if field['type'] == 'nested':
1352
+ xpath = self._css_to_xpath(field['selector'])
1353
+ nested_element = element.xpath(xpath)[0] if element.xpath(xpath) else None
1354
+ return self._extract_item(nested_element, field['fields']) if nested_element is not None else {}
1355
+
1356
+ if field['type'] == 'list':
1357
+ xpath = self._css_to_xpath(field['selector'])
1358
+ elements = element.xpath(xpath)
1359
+ return [self._extract_list_item(el, field['fields']) for el in elements]
1360
+
1361
+ if field['type'] == 'nested_list':
1362
+ xpath = self._css_to_xpath(field['selector'])
1363
+ elements = element.xpath(xpath)
1364
+ return [self._extract_item(el, field['fields']) for el in elements]
1365
+
1366
+ return self._extract_single_field(element, field)
1367
+ except Exception as e:
1368
+ if self.verbose:
1369
+ print(f"Error extracting field {field['name']}: {str(e)}")
1370
+ return field.get('default')
1371
+
1372
+ def _extract_list_item(self, element, fields):
1373
+ item = {}
1374
+ for field in fields:
1375
+ value = self._extract_single_field(element, field)
1376
+ if value is not None:
1377
+ item[field['name']] = value
1378
+ return item
1379
+
1380
+ def _extract_single_field(self, element, field):
1381
+ if 'selector' in field:
1382
+ xpath = self._css_to_xpath(field['selector'])
1383
+ selected = element.xpath(xpath)
1384
+ if not selected:
1385
+ return field.get('default')
1386
+ selected = selected[0]
1387
+ else:
1388
+ selected = element
1389
+
1390
+ value = None
1391
+ if field['type'] == 'text':
1392
+ value = ''.join(selected.xpath('.//text()')).strip()
1393
+ elif field['type'] == 'attribute':
1394
+ value = selected.get(field['attribute'])
1395
+ elif field['type'] == 'html':
1396
+ value = etree.tostring(selected, encoding='unicode')
1397
+ elif field['type'] == 'regex':
1398
+ text = ''.join(selected.xpath('.//text()')).strip()
1399
+ match = re.search(field['pattern'], text)
1400
+ value = match.group(1) if match else None
1401
+
1402
+ if 'transform' in field:
1403
+ value = self._apply_transform(value, field['transform'])
1404
+
1405
+ return value if value is not None else field.get('default')
1406
+
1407
+ def _extract_item(self, element, fields):
1408
+ item = {}
1409
+ for field in fields:
1410
+ if field['type'] == 'computed':
1411
+ value = self._compute_field(item, field)
1412
+ else:
1413
+ value = self._extract_field(element, field)
1414
+ if value is not None:
1415
+ item[field['name']] = value
1416
+ return item
1417
+
1418
+ def _apply_transform(self, value, transform):
1419
+ if transform == 'lowercase':
1420
+ return value.lower()
1421
+ elif transform == 'uppercase':
1422
+ return value.upper()
1423
+ elif transform == 'strip':
1424
+ return value.strip()
1425
+ return value
1426
+
1427
+ def _compute_field(self, item, field):
1428
+ try:
1429
+ if 'expression' in field:
1430
+ return eval(field['expression'], {}, item)
1431
+ elif 'function' in field:
1432
+ return field['function'](item)
1433
+ except Exception as e:
1434
+ if self.verbose:
1435
+ print(f"Error computing field {field['name']}: {str(e)}")
1436
+ return field.get('default')
1437
+
1438
+ def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
1439
+ combined_html = self.DEL.join(sections)
1440
+ return self.extract(url, combined_html, **kwargs)
crawl4ai/extraction_strategy.py ADDED
@@ -0,0 +1,1052 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, List, Dict, Optional, Union
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
+ import json, time
5
+ # from optimum.intel import IPEXModel
6
+ from .prompts import *
7
+ from .config import *
8
+ from .utils import *
9
+ from .models import *
10
+ from functools import partial
11
+ from .model_loader import *
12
+ import math
13
+ import numpy as np
14
+ import re
15
+ from bs4 import BeautifulSoup
16
+ from lxml import html, etree
17
+ from dataclasses import dataclass
18
+
19
+ class ExtractionStrategy(ABC):
20
+ """
21
+ Abstract base class for all extraction strategies.
22
+ """
23
+
24
+ def __init__(self, input_format: str = "markdown", **kwargs):
25
+ """
26
+ Initialize the extraction strategy.
27
+
28
+ Args:
29
+ input_format: Content format to use for extraction.
30
+ Options: "markdown" (default), "html", "fit_markdown"
31
+ **kwargs: Additional keyword arguments
32
+ """
33
+ self.input_format = input_format
34
+ self.DEL = "<|DEL|>"
35
+ self.name = self.__class__.__name__
36
+ self.verbose = kwargs.get("verbose", False)
37
+
38
+ @abstractmethod
39
+ def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
40
+ """
41
+ Extract meaningful blocks or chunks from the given HTML.
42
+
43
+ :param url: The URL of the webpage.
44
+ :param html: The HTML content of the webpage.
45
+ :return: A list of extracted blocks or chunks.
46
+ """
47
+ pass
48
+
49
+ def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
50
+ """
51
+ Process sections of text in parallel by default.
52
+
53
+ :param url: The URL of the webpage.
54
+ :param sections: List of sections (strings) to process.
55
+ :return: A list of processed JSON blocks.
56
+ """
57
+ extracted_content = []
58
+ with ThreadPoolExecutor() as executor:
59
+ futures = [executor.submit(self.extract, url, section, **kwargs) for section in sections]
60
+ for future in as_completed(futures):
61
+ extracted_content.extend(future.result())
62
+ return extracted_content
63
+
64
+ class NoExtractionStrategy(ExtractionStrategy):
65
+ """
66
+ A strategy that does not extract any meaningful content from the HTML. It simply returns the entire HTML as a single block.
67
+ """
68
+ def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
69
+ """
70
+ Extract meaningful blocks or chunks from the given HTML.
71
+ """
72
+ return [{"index": 0, "content": html}]
73
+
74
+ def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
75
+ return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]
76
+
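# A minimal sketch of a custom strategy built on the ABC above; the class name
# and the paragraph heuristic are illustrative, not part of the codebase. It
# reuses the default parallel run() and only implements extract().
class ParagraphCountStrategy(ExtractionStrategy):
    def extract(self, url, html, *q, **kwargs):
        # Naively treat blank-line-separated blocks as paragraphs.
        paragraphs = [p for p in html.split("\n\n") if p.strip()]
        return [{"index": 0, "tags": [], "content": f"{len(paragraphs)} paragraphs on {url}"}]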
77
+ #######################################################
78
+ # Strategies using LLM-based extraction for text data #
79
+ #######################################################
80
+ class LLMExtractionStrategy(ExtractionStrategy):
81
+ """
82
+ A strategy that uses an LLM to extract meaningful content from the HTML.
83
+
84
+ Attributes:
85
+ provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
86
+ api_token: The API token for the provider.
87
+ instruction: The instruction to use for the LLM model.
88
+ schema: Pydantic model schema for structured data.
89
+ extraction_type: "block" or "schema".
90
+ chunk_token_threshold: Maximum tokens per chunk.
91
+ overlap_rate: Overlap between chunks.
92
+ word_token_rate: Word to token conversion rate.
93
+ apply_chunking: Whether to apply chunking.
94
+ base_url: The base URL for the API request.
95
+ api_base: The base URL for the API request.
96
+ extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
97
+ verbose: Whether to print verbose output.
98
+ usages: List of individual token usages.
99
+ total_usage: Accumulated token usage.
100
+ """
101
+
102
+ def __init__(self,
103
+ provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None,
104
+ instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs):
105
+ """
106
+ Initialize the strategy with clustering parameters.
107
+
108
+ Args:
109
+ provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
110
+ api_token: The API token for the provider.
111
+ instruction: The instruction to use for the LLM model.
112
+ schema: Pydantic model schema for structured data.
113
+ extraction_type: "block" or "schema".
114
+ chunk_token_threshold: Maximum tokens per chunk.
115
+ overlap_rate: Overlap between chunks.
116
+ word_token_rate: Word to token conversion rate.
117
+ apply_chunking: Whether to apply chunking.
118
+ base_url: The base URL for the API request.
119
+ api_base: The base URL for the API request.
120
+ extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
121
+ verbose: Whether to print verbose output.
122
+ usages: List of individual token usages.
123
+ total_usage: Accumulated token usage.
124
+
125
+ """
126
+ super().__init__(**kwargs)
127
+ self.provider = provider
128
+ self.api_token = api_token or PROVIDER_MODELS.get(provider, "no-token") or os.getenv("OPENAI_API_KEY")
129
+ self.instruction = instruction
130
+ self.extract_type = extraction_type
131
+ self.schema = schema
132
+ if schema:
133
+ self.extract_type = "schema"
134
+
135
+ self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD)
136
+ self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
137
+ self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
138
+ self.apply_chunking = kwargs.get("apply_chunking", True)
139
+ self.base_url = kwargs.get("base_url", None)
140
+ self.api_base = kwargs.get("api_base", kwargs.get("base_url", None))
141
+ self.extra_args = kwargs.get("extra_args", {})
142
+ if not self.apply_chunking:
143
+ self.chunk_token_threshold = 1e9
144
+
145
+ self.verbose = kwargs.get("verbose", False)
146
+ self.usages = [] # Store individual usages
147
+ self.total_usage = TokenUsage() # Accumulated usage
148
+
149
+ if not self.api_token:
150
+ raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.")
151
+
152
+
153
+ def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]:
154
+ """
155
+ Extract meaningful blocks or chunks from the given HTML using an LLM.
156
+
157
+ How it works:
158
+ 1. Construct a prompt with variables.
159
+ 2. Make a request to the LLM using the prompt.
160
+ 3. Parse the response and extract blocks or chunks.
161
+
162
+ Args:
163
+ url: The URL of the webpage.
164
+ ix: Index of the block.
165
+ html: The HTML content of the webpage.
166
+
167
+ Returns:
168
+ A list of extracted blocks or chunks.
169
+ """
170
+ if self.verbose:
171
+ # print("[LOG] Extracting blocks from URL:", url)
172
+ print(f"[LOG] Call LLM for {url} - block index: {ix}")
173
+
174
+ variable_values = {
175
+ "URL": url,
176
+ "HTML": escape_json_string(sanitize_html(html)),
177
+ }
178
+
179
+ prompt_with_variables = PROMPT_EXTRACT_BLOCKS
180
+ if self.instruction:
181
+ variable_values["REQUEST"] = self.instruction
182
+ prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
183
+
184
+ if self.extract_type == "schema" and self.schema:
185
+ variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
186
+ prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
187
+
188
+ for variable in variable_values:
189
+ prompt_with_variables = prompt_with_variables.replace(
190
+ "{" + variable + "}", variable_values[variable]
191
+ )
192
+
193
+ response = perform_completion_with_backoff(
194
+ self.provider,
195
+ prompt_with_variables,
196
+ self.api_token,
197
+ base_url=self.api_base or self.base_url,
198
+ extra_args = self.extra_args
199
+ ) # , json_response=self.extract_type == "schema")
200
+ # Track usage
201
+ usage = TokenUsage(
202
+ completion_tokens=response.usage.completion_tokens,
203
+ prompt_tokens=response.usage.prompt_tokens,
204
+ total_tokens=response.usage.total_tokens,
205
+ completion_tokens_details=response.usage.completion_tokens_details.__dict__ if response.usage.completion_tokens_details else {},
206
+ prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ if response.usage.prompt_tokens_details else {}
207
+ )
208
+ self.usages.append(usage)
209
+
210
+ # Update totals
211
+ self.total_usage.completion_tokens += usage.completion_tokens
212
+ self.total_usage.prompt_tokens += usage.prompt_tokens
213
+ self.total_usage.total_tokens += usage.total_tokens
214
+
215
+ try:
216
+ blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
217
+ blocks = json.loads(blocks)
218
+ for block in blocks:
219
+ block['error'] = False
220
+ except Exception as e:
221
+ parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
222
+ blocks = parsed
223
+ if unparsed:
224
+ blocks.append({
225
+ "index": 0,
226
+ "error": True,
227
+ "tags": ["error"],
228
+ "content": unparsed
229
+ })
230
+
231
+ if self.verbose:
232
+ print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix)
233
+ return blocks
234
+
235
+ def _merge(self, documents, chunk_token_threshold, overlap):
236
+ """
237
+ Merge documents into sections based on chunk_token_threshold and overlap.
238
+ """
239
+ chunks = []
240
+ sections = []
241
+ total_tokens = 0
242
+
243
+ # Calculate the total tokens across all documents
244
+ for document in documents:
245
+ total_tokens += len(document.split(' ')) * self.word_token_rate
246
+
247
+ # Calculate the number of sections needed
248
+ num_sections = math.floor(total_tokens / chunk_token_threshold)
249
+ if num_sections < 1:
250
+ num_sections = 1 # Ensure there is at least one section
251
+ adjusted_chunk_threshold = total_tokens / num_sections
252
+
253
+ total_token_so_far = 0
254
+ current_chunk = []
255
+
256
+ for document in documents:
257
+ tokens = document.split(' ')
258
+ token_count = len(tokens) * self.word_token_rate
259
+
260
+ if total_token_so_far + token_count <= adjusted_chunk_threshold:
261
+ current_chunk.extend(tokens)
262
+ total_token_so_far += token_count
263
+ else:
264
+ # Ensure to handle the last section properly
265
+ if len(sections) == num_sections - 1:
266
+ current_chunk.extend(tokens)
267
+ continue
268
+
269
+ # Add overlap if specified
270
+ if overlap > 0 and current_chunk:
271
+ overlap_tokens = current_chunk[-overlap:]
272
+ current_chunk.extend(overlap_tokens)
273
+
274
+ sections.append(' '.join(current_chunk))
275
+ current_chunk = tokens
276
+ total_token_so_far = token_count
277
+
278
+ # Add the last chunk
279
+ if current_chunk:
280
+ sections.append(' '.join(current_chunk))
281
+
282
+ return sections
283
+
284
+
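# Worked example of the sizing logic in _merge (numbers are illustrative):
# with word_token_rate = 0.75 and documents of 2000, 1500 and 500 words,
# total_tokens = 4000 * 0.75 = 3000. For chunk_token_threshold = 1200,
# num_sections = floor(3000 / 1200) = 2, so the adjusted per-section budget
# becomes 3000 / 2 = 1500 tokens; run() below passes
# overlap = int(chunk_token_threshold * overlap_rate) between sections.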
285
+ def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
286
+ """
287
+ Process merged sections: sequentially with a short delay for rate-limited providers (e.g. Groq), otherwise in parallel with a thread pool.
288
+
289
+ Args:
290
+ url: The URL of the webpage.
291
+ sections: List of sections (strings) to process.
292
+
293
+ Returns:
294
+ A list of extracted blocks or chunks.
295
+ """
296
+
297
+ merged_sections = self._merge(
298
+ sections, self.chunk_token_threshold,
299
+ overlap= int(self.chunk_token_threshold * self.overlap_rate)
300
+ )
301
+ extracted_content = []
302
+ if self.provider.startswith("groq/"):
303
+ # Sequential processing with a delay
304
+ for ix, section in enumerate(merged_sections):
305
+ extract_func = partial(self.extract, url)
306
+ extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
307
+ time.sleep(0.5) # 500 ms delay between each processing
308
+ else:
309
+ # Parallel processing using ThreadPoolExecutor
310
+ # extract_func = partial(self.extract, url)
311
+ # for ix, section in enumerate(merged_sections):
312
+ # extracted_content.append(extract_func(ix, section))
313
+
314
+ with ThreadPoolExecutor(max_workers=4) as executor:
315
+ extract_func = partial(self.extract, url)
316
+ futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)]
317
+
318
+ for future in as_completed(futures):
319
+ try:
320
+ extracted_content.extend(future.result())
321
+ except Exception as e:
322
+ if self.verbose:
323
+ print(f"Error in thread execution: {e}")
324
+ # Add error information to extracted_content
325
+ extracted_content.append({
326
+ "index": 0,
327
+ "error": True,
328
+ "tags": ["error"],
329
+ "content": str(e)
330
+ })
331
+
332
+
333
+ return extracted_content
334
+
335
+
336
+ def show_usage(self) -> None:
337
+ """Print a detailed token usage report showing total and per-request usage."""
338
+ print("\n=== Token Usage Summary ===")
339
+ print(f"{'Type':<15} {'Count':>12}")
340
+ print("-" * 30)
341
+ print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}")
342
+ print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}")
343
+ print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}")
344
+
345
+ print("\n=== Usage History ===")
346
+ print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}")
347
+ print("-" * 48)
348
+ for i, usage in enumerate(self.usages, 1):
349
+ print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} {usage.total_tokens:>12,}")
350
+
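# A hedged usage sketch for LLMExtractionStrategy; the provider string, token,
# instruction and URL are placeholders, not values from this repository.
llm_strategy = LLMExtractionStrategy(
    provider="openai/gpt-4o-mini",      # assumed "<provider>/<model>" format
    api_token="YOUR_API_KEY",
    instruction="List every product name and price mentioned on the page.",
    chunk_token_threshold=2000,
    verbose=True,
)
blocks = llm_strategy.run("https://example.com", [page_markdown])  # page_markdown: str
llm_strategy.show_usage()               # prints the token usage report above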
351
+ #######################################################
352
+ # Strategies using clustering for text data extraction #
353
+ #######################################################
354
+
355
+ class CosineStrategy(ExtractionStrategy):
356
+ """
357
+ Extract meaningful blocks or chunks from the given HTML using cosine similarity.
358
+
359
+ How it works:
360
+ 1. Pre-filter documents using embeddings and semantic_filter.
361
+ 2. Perform clustering using cosine similarity.
362
+ 3. Organize texts by their cluster labels, retaining order.
363
+ 4. Filter clusters by word count.
364
+ 5. Extract meaningful blocks or chunks from the filtered clusters.
365
+
366
+ Attributes:
367
+ semantic_filter (str): A keyword filter for document filtering.
368
+ word_count_threshold (int): Minimum number of words per cluster.
369
+ max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
370
+ linkage_method (str): The linkage method for hierarchical clustering.
371
+ top_k (int): Number of top categories to extract.
372
+ model_name (str): The name of the sentence-transformers model.
373
+ sim_threshold (float): The similarity threshold for clustering.
374
+ """
375
+ def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs):
376
+ """
377
+ Initialize the strategy with clustering parameters.
378
+
379
+ Args:
380
+ semantic_filter (str): A keyword filter for document filtering.
381
+ word_count_threshold (int): Minimum number of words per cluster.
382
+ max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
383
+ linkage_method (str): The linkage method for hierarchical clustering.
384
+ top_k (int): Number of top categories to extract.
385
+ """
386
+ super().__init__(**kwargs)
387
+
388
+ import numpy as np
389
+
390
+ self.semantic_filter = semantic_filter
391
+ self.word_count_threshold = word_count_threshold
392
+ self.max_dist = max_dist
393
+ self.linkage_method = linkage_method
394
+ self.top_k = top_k
395
+ self.sim_threshold = sim_threshold
396
+ self.timer = time.time()
397
+ self.verbose = kwargs.get("verbose", False)
398
+
399
+ self.buffer_embeddings = np.array([])
400
+ self.get_embedding_method = "direct"
401
+
402
+ self.device = get_device()
403
+ # import torch
404
+ # self.device = torch.device('cpu')
405
+
406
+ self.default_batch_size = calculate_batch_size(self.device)
407
+
408
+ if self.verbose:
409
+ print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
410
+
411
+ # if False and self.device.type == "cpu":
412
+ # self.model = load_onnx_all_MiniLM_l6_v2()
413
+ # self.tokenizer = self.model.tokenizer
414
+ # self.get_embedding_method = "direct"
415
+ # else:
416
+
417
+ self.tokenizer, self.model = load_HF_embedding_model(model_name)
418
+ self.model.to(self.device)
419
+ self.model.eval()
420
+
421
+ self.get_embedding_method = "batch"
422
+
423
+ self.buffer_embeddings = np.array([])
424
+
425
+ # if model_name == "bert-base-uncased":
426
+ # self.tokenizer, self.model = load_bert_base_uncased()
427
+ # self.model.eval() # Ensure the model is in evaluation mode
428
+ # self.get_embedding_method = "batch"
429
+ # elif model_name == "BAAI/bge-small-en-v1.5":
430
+ # self.tokenizer, self.model = load_bge_small_en_v1_5()
431
+ # self.model.eval() # Ensure the model is in evaluation mode
432
+ # self.get_embedding_method = "batch"
433
+ # elif model_name == "sentence-transformers/all-MiniLM-L6-v2":
434
+ # self.model = load_onnx_all_MiniLM_l6_v2()
435
+ # self.tokenizer = self.model.tokenizer
436
+ # self.get_embedding_method = "direct"
437
+
438
+
439
+ if self.verbose:
440
+ print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.")
441
+
442
+ self.nlp, _ = load_text_multilabel_classifier()
443
+ # self.default_batch_size = 16 if self.device.type == 'cpu' else 64
444
+
445
+ if self.verbose:
446
+ print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
447
+
448
+ def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, at_least_k: int = 20) -> List[str]:
449
+ """
450
+ Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
451
+
452
+ Args:
453
+ documents (List[str]): A list of document texts.
454
+ semantic_filter (str): A keyword filter for document filtering.
455
+ at_least_k (int): The minimum number of documents to return.
456
+
457
+ Returns:
458
+ List[str]: A list of filtered and sorted document texts.
459
+ """
460
+
461
+ if not semantic_filter:
462
+ return documents
463
+
464
+ if len(documents) < at_least_k:
465
+ at_least_k = len(documents) // 2
466
+
467
+ from sklearn.metrics.pairwise import cosine_similarity
468
+
469
+ # Compute embedding for the keyword filter
470
+ query_embedding = self.get_embeddings([semantic_filter])[0]
471
+
472
+ # Compute embeddings for the documents
473
+ document_embeddings = self.get_embeddings(documents)
474
+
475
+ # Calculate cosine similarity between the query embedding and document embeddings
476
+ similarities = cosine_similarity([query_embedding], document_embeddings).flatten()
477
+
478
+ # Filter documents based on the similarity threshold
479
+ filtered_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim >= self.sim_threshold]
480
+
481
+ # If the number of filtered documents is less than at_least_k, sort remaining documents by similarity
482
+ if len(filtered_docs) < at_least_k:
483
+ remaining_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim < self.sim_threshold]
484
+ remaining_docs.sort(key=lambda x: x[1], reverse=True)
485
+ filtered_docs.extend(remaining_docs[:at_least_k - len(filtered_docs)])
486
+
487
+ # Extract the document texts from the tuples
488
+ filtered_docs = [doc for doc, _ in filtered_docs]
489
+
490
+ return filtered_docs[:at_least_k]
491
+
492
+ def get_embeddings(self, sentences: List[str], batch_size=None, bypass_buffer=False):
493
+ """
494
+ Get BERT embeddings for a list of sentences.
495
+
496
+ Args:
497
+ sentences (List[str]): A list of text chunks (sentences).
498
+
499
+ Returns:
500
+ NumPy array of embeddings.
501
+ """
502
+ # if self.buffer_embeddings.any() and not bypass_buffer:
503
+ # return self.buffer_embeddings
504
+
505
+ if self.device.type in [ "cpu", "gpu", "cuda", "mps"]:
506
+ import torch
507
+ # Tokenize sentences and convert to tensor
508
+ if batch_size is None:
509
+ batch_size = self.default_batch_size
510
+
511
+ all_embeddings = []
512
+ for i in range(0, len(sentences), batch_size):
513
+ batch_sentences = sentences[i:i + batch_size]
514
+ encoded_input = self.tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')
515
+ encoded_input = {key: tensor.to(self.device) for key, tensor in encoded_input.items()}
516
+
517
+ # Ensure no gradients are calculated
518
+ with torch.no_grad():
519
+ model_output = self.model(**encoded_input)
520
+
521
+ # Get embeddings from the last hidden state (mean pooling)
522
+ embeddings = model_output.last_hidden_state.mean(dim=1).cpu().numpy()
523
+ all_embeddings.append(embeddings)
524
+
525
+ self.buffer_embeddings = np.vstack(all_embeddings)
526
+ elif self.device.type == "cpu":
527
+ # self.buffer_embeddings = self.model(sentences)
528
+ if batch_size is None:
529
+ batch_size = self.default_batch_size
530
+
531
+ all_embeddings = []
532
+ for i in range(0, len(sentences), batch_size):
533
+ batch_sentences = sentences[i:i + batch_size]
534
+ embeddings = self.model(batch_sentences)
535
+ all_embeddings.append(embeddings)
536
+
537
+ self.buffer_embeddings = np.vstack(all_embeddings)
538
+ return self.buffer_embeddings
539
+
540
+ def hierarchical_clustering(self, sentences: List[str], embeddings = None):
541
+ """
542
+ Perform hierarchical clustering on sentences and return cluster labels.
543
+
544
+ Args:
545
+ sentences (List[str]): A list of text chunks (sentences).
546
+
547
+ Returns:
548
+ NumPy array of cluster labels.
549
+ """
550
+ # Get embeddings
551
+ from scipy.cluster.hierarchy import linkage, fcluster
552
+ from scipy.spatial.distance import pdist
553
+ self.timer = time.time()
554
+ embeddings = self.get_embeddings(sentences, bypass_buffer=True)
555
+ # print(f"[LOG] 🚀 Embeddings computed in {time.time() - self.timer:.2f} seconds")
556
+ # Compute pairwise cosine distances
557
+ distance_matrix = pdist(embeddings, 'cosine')
558
+ # Perform agglomerative clustering respecting order
559
+ linked = linkage(distance_matrix, method=self.linkage_method)
560
+ # Form flat clusters
561
+ labels = fcluster(linked, self.max_dist, criterion='distance')
562
+ return labels
563
+
564
+ def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]) -> Dict[int, List[str]]:
565
+ """
566
+ Filter clusters to remove those with a word count below the threshold.
567
+
568
+ Args:
569
+ clusters (Dict[int, List[str]]): Dictionary of clusters.
570
+
571
+ Returns:
572
+ Dict[int, List[str]]: Filtered dictionary of clusters.
573
+ """
574
+ filtered_clusters = {}
575
+ for cluster_id, texts in clusters.items():
576
+ # Concatenate texts for analysis
577
+ full_text = " ".join(texts)
578
+ # Count words
579
+ word_count = len(full_text.split())
580
+
581
+ # Keep clusters with word count above the threshold
582
+ if word_count >= self.word_count_threshold:
583
+ filtered_clusters[cluster_id] = texts
584
+
585
+ return filtered_clusters
586
+
587
+ def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
588
+ """
589
+ Extract clusters from HTML content using hierarchical clustering.
590
+
591
+ Args:
592
+ url (str): The URL of the webpage.
593
+ html (str): The HTML content of the webpage.
594
+
595
+ Returns:
596
+ List[Dict[str, Any]]: A list of processed JSON blocks.
597
+ """
598
+ # Assume `html` is a list of text chunks for this strategy
599
+ t = time.time()
600
+ text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed
601
+
602
+ # Pre-filter documents using embeddings and semantic_filter
603
+ text_chunks = self.filter_documents_embeddings(text_chunks, self.semantic_filter)
604
+
605
+ if not text_chunks:
606
+ return []
607
+
608
+ # Perform clustering
609
+ labels = self.hierarchical_clustering(text_chunks)
610
+ # print(f"[LOG] 🚀 Clustering done in {time.time() - t:.2f} seconds")
611
+
612
+ # Organize texts by their cluster labels, retaining order
613
+ t = time.time()
614
+ clusters = {}
615
+ for index, label in enumerate(labels):
616
+ clusters.setdefault(label, []).append(text_chunks[index])
617
+
618
+ # Filter clusters by word count
619
+ filtered_clusters = self.filter_clusters_by_word_count(clusters)
620
+
621
+ # Convert filtered clusters to a sorted list of dictionaries
622
+ cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
623
+
624
+ if self.verbose:
625
+ print(f"[LOG] 🚀 Assign tags using {self.device}")
626
+
627
+ if self.device.type in ["gpu", "cuda", "mps", "cpu"]:
628
+ labels = self.nlp([cluster['content'] for cluster in cluster_list])
629
+
630
+ for cluster, label in zip(cluster_list, labels):
631
+ cluster['tags'] = label
632
+ # elif self.device.type == "cpu":
633
+ # # Process the text with the loaded model
634
+ # texts = [cluster['content'] for cluster in cluster_list]
635
+ # # Batch process texts
636
+ # docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"])
637
+
638
+ # for doc, cluster in zip(docs, cluster_list):
639
+ # tok_k = self.top_k
640
+ # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
641
+ # cluster['tags'] = [cat for cat, _ in top_categories]
642
+
643
+ # for cluster in cluster_list:
644
+ # doc = self.nlp(cluster['content'])
645
+ # tok_k = self.top_k
646
+ # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
647
+ # cluster['tags'] = [cat for cat, _ in top_categories]
648
+
649
+ if self.verbose:
650
+ print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
651
+
652
+ return cluster_list
653
+
654
+ def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
655
+ """
656
+ Process sections using hierarchical clustering.
657
+
658
+ Args:
659
+ url (str): The URL of the webpage.
660
+ sections (List[str]): List of sections (strings) to process.
661
+
662
+ Returns:
+ List[Dict[str, Any]]: A list of processed JSON blocks.
663
+ """
664
+ # This strategy processes all sections together
665
+
666
+ return self.extract(url, self.DEL.join(sections), **kwargs)
667
+
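# A hedged usage sketch for CosineStrategy; the filter keyword and thresholds
# are illustrative. Instantiation is the expensive step, since the constructor
# loads the embedding model and the multilabel classifier.
cosine = CosineStrategy(
    semantic_filter="machine learning",  # keep chunks semantically close to this phrase
    word_count_threshold=20,             # drop clusters with fewer words than this
    sim_threshold=0.3,
    verbose=True,
)
clusters = cosine.run("https://example.com", page_chunks)  # page_chunks: List[str]
# Each result looks like {"index": ..., "tags": [...], "content": "..."}.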
668
+ #######################################################
669
+ # New extraction strategies for JSON-based extraction #
670
+ #######################################################
671
+
672
+ class JsonElementExtractionStrategy(ExtractionStrategy):
673
+ """
674
+ Abstract base class for extracting structured JSON from HTML content.
675
+
676
+ How it works:
677
+ 1. Parses HTML content using the `_parse_html` method.
678
+ 2. Uses a schema to define base selectors, fields, and transformations.
679
+ 3. Extracts data hierarchically, supporting nested fields and lists.
680
+ 4. Handles computed fields with expressions or functions.
681
+
682
+ Attributes:
683
+ DEL (str): Delimiter used to combine HTML sections. Defaults to '\n'.
684
+ schema (Dict[str, Any]): The schema defining the extraction rules.
685
+ verbose (bool): Enables verbose logging for debugging purposes.
686
+
687
+ Methods:
688
+ extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content.
689
+ _extract_item(element, fields): Extracts fields from a single element.
690
+ _extract_single_field(element, field): Extracts a single field based on its type.
691
+ _apply_transform(value, transform): Applies a transformation to a value.
692
+ _compute_field(item, field): Computes a field value using an expression or function.
693
+ run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy.
694
+
695
+ Abstract Methods:
696
+ _parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml).
697
+ _get_base_elements(parsed_html, selector): Retrieves base elements using a selector.
698
+ _get_elements(element, selector): Retrieves child elements using a selector.
699
+ _get_element_text(element): Extracts text content from an element.
700
+ _get_element_html(element): Extracts raw HTML from an element.
701
+ _get_element_attribute(element, attribute): Extracts an attribute's value from an element.
702
+ """
703
+
704
+
705
+ DEL = '\n'
706
+
707
+ def __init__(self, schema: Dict[str, Any], **kwargs):
708
+ """
709
+ Initialize the JSON element extraction strategy with a schema.
710
+
711
+ Args:
712
+ schema (Dict[str, Any]): The schema defining the extraction rules.
713
+ """
714
+ super().__init__(**kwargs)
715
+ self.schema = schema
716
+ self.verbose = kwargs.get('verbose', False)
717
+
718
+ def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
719
+ """
720
+ Extract structured data from HTML content.
721
+
722
+ How it works:
723
+ 1. Parses the HTML content using the `_parse_html` method.
724
+ 2. Identifies base elements using the schema's base selector.
725
+ 3. Extracts fields from each base element using `_extract_item`.
726
+
727
+ Args:
728
+ url (str): The URL of the page being processed.
729
+ html_content (str): The raw HTML content to parse and extract.
730
+ *q: Additional positional arguments.
731
+ **kwargs: Additional keyword arguments for custom extraction.
732
+
733
+ Returns:
734
+ List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary.
735
+ """
736
+
737
+ parsed_html = self._parse_html(html_content)
738
+ base_elements = self._get_base_elements(parsed_html, self.schema['baseSelector'])
739
+
740
+ results = []
741
+ for element in base_elements:
742
+ # Extract base element attributes
743
+ item = {}
744
+ if 'baseFields' in self.schema:
745
+ for field in self.schema['baseFields']:
746
+ value = self._extract_single_field(element, field)
747
+ if value is not None:
748
+ item[field['name']] = value
749
+
750
+ # Extract child fields
751
+ field_data = self._extract_item(element, self.schema['fields'])
752
+ item.update(field_data)
753
+
754
+ if item:
755
+ results.append(item)
756
+
757
+ return results
758
+
759
+ @abstractmethod
760
+ def _parse_html(self, html_content: str):
761
+ """Parse HTML content into appropriate format"""
762
+ pass
763
+
764
+ @abstractmethod
765
+ def _get_base_elements(self, parsed_html, selector: str):
766
+ """Get all base elements using the selector"""
767
+ pass
768
+
769
+ @abstractmethod
770
+ def _get_elements(self, element, selector: str):
771
+ """Get child elements using the selector"""
772
+ pass
773
+
774
+ def _extract_field(self, element, field):
775
+ try:
776
+ if field['type'] == 'nested':
777
+ nested_elements = self._get_elements(element, field['selector'])
778
+ nested_element = nested_elements[0] if nested_elements else None
779
+ return self._extract_item(nested_element, field['fields']) if nested_element else {}
780
+
781
+ if field['type'] == 'list':
782
+ elements = self._get_elements(element, field['selector'])
783
+ return [self._extract_list_item(el, field['fields']) for el in elements]
784
+
785
+ if field['type'] == 'nested_list':
786
+ elements = self._get_elements(element, field['selector'])
787
+ return [self._extract_item(el, field['fields']) for el in elements]
788
+
789
+ return self._extract_single_field(element, field)
790
+ except Exception as e:
791
+ if self.verbose:
792
+ print(f"Error extracting field {field['name']}: {str(e)}")
793
+ return field.get('default')
794
+
795
+ def _extract_single_field(self, element, field):
796
+ """
797
+ Extract a single field based on its type.
798
+
799
+ How it works:
800
+ 1. Selects the target element using the field's selector.
801
+ 2. Extracts the field value based on its type (e.g., text, attribute, regex).
802
+ 3. Applies transformations if defined in the schema.
803
+
804
+ Args:
805
+ element: The base element to extract the field from.
806
+ field (Dict[str, Any]): The field definition in the schema.
807
+
808
+ Returns:
809
+ Any: The extracted field value.
810
+ """
811
+
812
+ if 'selector' in field:
813
+ selected = self._get_elements(element, field['selector'])
814
+ if not selected:
815
+ return field.get('default')
816
+ selected = selected[0]
817
+ else:
818
+ selected = element
819
+
820
+ value = None
821
+ if field['type'] == 'text':
822
+ value = self._get_element_text(selected)
823
+ elif field['type'] == 'attribute':
824
+ value = self._get_element_attribute(selected, field['attribute'])
825
+ elif field['type'] == 'html':
826
+ value = self._get_element_html(selected)
827
+ elif field['type'] == 'regex':
828
+ text = self._get_element_text(selected)
829
+ match = re.search(field['pattern'], text)
830
+ value = match.group(1) if match else None
831
+
832
+ if 'transform' in field:
833
+ value = self._apply_transform(value, field['transform'])
834
+
835
+ return value if value is not None else field.get('default')
836
+
837
+ def _extract_list_item(self, element, fields):
838
+ item = {}
839
+ for field in fields:
840
+ value = self._extract_single_field(element, field)
841
+ if value is not None:
842
+ item[field['name']] = value
843
+ return item
844
+
845
+ def _extract_item(self, element, fields):
846
+ """
847
+ Extracts fields from a given element.
848
+
849
+ How it works:
850
+ 1. Iterates through the fields defined in the schema.
851
+ 2. Handles computed, single, and nested field types.
852
+ 3. Updates the item dictionary with extracted field values.
853
+
854
+ Args:
855
+ element: The base element to extract fields from.
856
+ fields (List[Dict[str, Any]]): The list of fields to extract.
857
+
858
+ Returns:
859
+ Dict[str, Any]: A dictionary representing the extracted item.
860
+ """
861
+
862
+ item = {}
863
+ for field in fields:
864
+ if field['type'] == 'computed':
865
+ value = self._compute_field(item, field)
866
+ else:
867
+ value = self._extract_field(element, field)
868
+ if value is not None:
869
+ item[field['name']] = value
870
+ return item
871
+
872
+ def _apply_transform(self, value, transform):
873
+ """
874
+ Apply a transformation to a value.
875
+
876
+ How it works:
877
+ 1. Checks the transformation type (e.g., `lowercase`, `strip`).
878
+ 2. Applies the transformation to the value.
879
+ 3. Returns the transformed value.
880
+
881
+ Args:
882
+ value (str): The value to transform.
883
+ transform (str): The type of transformation to apply.
884
+
885
+ Returns:
886
+ str: The transformed value.
887
+ """
888
+
889
+ if transform == 'lowercase':
890
+ return value.lower()
891
+ elif transform == 'uppercase':
892
+ return value.upper()
893
+ elif transform == 'strip':
894
+ return value.strip()
895
+ return value
896
+
897
+ def _compute_field(self, item, field):
898
+ try:
899
+ if 'expression' in field:
900
+ return eval(field['expression'], {}, item)
901
+ elif 'function' in field:
902
+ return field['function'](item)
903
+ except Exception as e:
904
+ if self.verbose:
905
+ print(f"Error computing field {field['name']}: {str(e)}")
906
+ return field.get('default')
907
+
908
+ def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
909
+ """
910
+ Run the extraction strategy on a combined HTML content.
911
+
912
+ How it works:
913
+ 1. Combines multiple HTML sections using the `DEL` delimiter.
914
+ 2. Calls the `extract` method with the combined HTML.
915
+
916
+ Args:
917
+ url (str): The URL of the page being processed.
918
+ sections (List[str]): A list of HTML sections.
919
+ *q: Additional positional arguments.
920
+ **kwargs: Additional keyword arguments for custom extraction.
921
+
922
+ Returns:
923
+ List[Dict[str, Any]]: A list of extracted items.
924
+ """
925
+
926
+ combined_html = self.DEL.join(sections)
927
+ return self.extract(url, combined_html, **kwargs)
928
+
929
+ @abstractmethod
930
+ def _get_element_text(self, element) -> str:
931
+ """Get text content from element"""
932
+ pass
933
+
934
+ @abstractmethod
935
+ def _get_element_html(self, element) -> str:
936
+ """Get HTML content from element"""
937
+ pass
938
+
939
+ @abstractmethod
940
+ def _get_element_attribute(self, element, attribute: str):
941
+ """Get attribute value from element"""
942
+ pass
943
+
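# Skeleton of what a concrete subclass must supply (method names and signatures
# mirror the abstract methods above; the parser backend is deliberately left open):
class MyParserExtractionStrategy(JsonElementExtractionStrategy):
    def _parse_html(self, html_content): ...                       # build a DOM/tree
    def _get_base_elements(self, parsed_html, selector: str): ...  # top-level matches
    def _get_elements(self, element, selector: str): ...           # child matches
    def _get_element_text(self, element) -> str: ...
    def _get_element_html(self, element) -> str: ...
    def _get_element_attribute(self, element, attribute: str): ...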
944
+ class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
945
+ """
946
+ Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.
947
+
948
+ How it works:
949
+ 1. Parses HTML content with BeautifulSoup.
950
+ 2. Selects elements using CSS selectors defined in the schema.
951
+ 3. Extracts field data and applies transformations as defined.
952
+
953
+ Attributes:
954
+ schema (Dict[str, Any]): The schema defining the extraction rules.
955
+ verbose (bool): Enables verbose logging for debugging purposes.
956
+
957
+ Methods:
958
+ _parse_html(html_content): Parses HTML content into a BeautifulSoup object.
959
+ _get_base_elements(parsed_html, selector): Selects base elements using a CSS selector.
960
+ _get_elements(element, selector): Selects child elements using a CSS selector.
961
+ _get_element_text(element): Extracts text content from a BeautifulSoup element.
962
+ _get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element.
963
+ _get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element.
964
+ """
965
+
966
+ def __init__(self, schema: Dict[str, Any], **kwargs):
967
+ kwargs['input_format'] = 'html' # Force HTML input
968
+ super().__init__(schema, **kwargs)
969
+
970
+ def _parse_html(self, html_content: str):
971
+ return BeautifulSoup(html_content, 'html.parser')
972
+
973
+ def _get_base_elements(self, parsed_html, selector: str):
974
+ return parsed_html.select(selector)
975
+
976
+ def _get_elements(self, element, selector: str):
977
+ # Return all matching elements using select() instead of select_one()
978
+ # This ensures that we get all elements that match the selector, not just the first one
979
+ return element.select(selector)
980
+
981
+ def _get_element_text(self, element) -> str:
982
+ return element.get_text(strip=True)
983
+
984
+ def _get_element_html(self, element) -> str:
985
+ return str(element)
986
+
987
+ def _get_element_attribute(self, element, attribute: str):
988
+ return element.get(attribute)
989
+
990
+ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
991
+ """
992
+ Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors.
993
+
994
+ How it works:
995
+ 1. Parses HTML content into an lxml tree.
996
+ 2. Selects elements using XPath expressions.
997
+ 3. Converts CSS selectors to XPath when needed.
998
+
999
+ Attributes:
1000
+ schema (Dict[str, Any]): The schema defining the extraction rules.
1001
+ verbose (bool): Enables verbose logging for debugging purposes.
1002
+
1003
+ Methods:
1004
+ _parse_html(html_content): Parses HTML content into an lxml tree.
1005
+ _get_base_elements(parsed_html, selector): Selects base elements using an XPath selector.
1006
+ _css_to_xpath(css_selector): Converts a CSS selector to an XPath expression.
1007
+ _get_elements(element, selector): Selects child elements using an XPath selector.
1008
+ _get_element_text(element): Extracts text content from an lxml element.
1009
+ _get_element_html(element): Extracts the raw HTML content of an lxml element.
1010
+ _get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element.
1011
+ """
1012
+
1013
+ def __init__(self, schema: Dict[str, Any], **kwargs):
1014
+ kwargs['input_format'] = 'html' # Force HTML input
1015
+ super().__init__(schema, **kwargs)
1016
+
1017
+ def _parse_html(self, html_content: str):
1018
+ return html.fromstring(html_content)
1019
+
1020
+ def _get_base_elements(self, parsed_html, selector: str):
1021
+ return parsed_html.xpath(selector)
1022
+
1023
+ def _css_to_xpath(self, css_selector: str) -> str:
1024
+ """Convert CSS selector to XPath if needed"""
1025
+ if '/' in css_selector: # Already an XPath
1026
+ return css_selector
1027
+ return self._basic_css_to_xpath(css_selector)
1028
+
1029
+ def _basic_css_to_xpath(self, css_selector: str) -> str:
1030
+ """Basic CSS to XPath conversion for common cases"""
1031
+ if ' > ' in css_selector:
1032
+ parts = css_selector.split(' > ')
1033
+ return '//' + '/'.join(parts)
1034
+ if ' ' in css_selector:
1035
+ parts = css_selector.split(' ')
1036
+ return '//' + '//'.join(parts)
1037
+ return '//' + css_selector
1038
+
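# Examples of what this naive conversion yields:
#   "div > span"  ->  "//div/span"     (direct children)
#   "ul li"       ->  "//ul//li"       (any descendants)
#   "h2"          ->  "//h2"           (bare tag name)
#   "//a[@href]"  ->  "//a[@href]"     (contains '/', passed through as XPath)
# Note that class/id selectors such as "div.item" or "#main" are not handled
# and would produce invalid XPath; use real XPath expressions for those.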
1039
+ def _get_elements(self, element, selector: str):
1040
+ xpath = self._css_to_xpath(selector)
1041
+ if not xpath.startswith('.'):
1042
+ xpath = '.' + xpath
1043
+ return element.xpath(xpath)
1044
+
1045
+ def _get_element_text(self, element) -> str:
1046
+ return ''.join(element.xpath('.//text()')).strip()
1047
+
1048
+ def _get_element_html(self, element) -> str:
1049
+ return etree.tostring(element, encoding='unicode')
1050
+
1051
+ def _get_element_attribute(self, element, attribute: str):
1052
+ return element.get(attribute)
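The CSS-based strategy can be exercised directly against raw HTML. A hedged usage sketch follows; the markup, selectors and field names are illustrative, not taken from the repository.

    schema = {
        "baseSelector": "article.post",
        "baseFields": [
            {"name": "post_id", "type": "attribute", "attribute": "data-id"},
        ],
        "fields": [
            {"name": "title", "selector": "h1", "type": "text"},
            {"name": "author", "selector": ".byline", "type": "text", "default": "unknown"},
            {"name": "links", "selector": "a", "type": "list", "fields": [
                {"name": "href", "type": "attribute", "attribute": "href"},
            ]},
        ],
    }

    html_doc = """
    <article class="post" data-id="42">
      <h1> Hello </h1>
      <a href="/a">A</a><a href="/b">B</a>
    </article>
    """

    strategy = JsonCssExtractionStrategy(schema, verbose=True)
    print(strategy.extract("https://example.com", html_doc))
    # [{'post_id': '42', 'title': 'Hello', 'author': 'unknown',
    #   'links': [{'href': '/a'}, {'href': '/b'}]}]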
crawl4ai/html2text/__init__.py ADDED
@@ -0,0 +1,1141 @@
1
+ """html2text: Turn HTML into equivalent Markdown-structured text."""
2
+
3
+ import html.entities
4
+ import html.parser
5
+ import re
6
+ import string
7
+ import urllib.parse as urlparse
8
+ from textwrap import wrap
9
+ from typing import Dict, List, Optional, Tuple, Union
10
+
11
+ from . import config
12
+ from ._typing import OutCallback
13
+ from .elements import AnchorElement, ListElement
14
+ from .utils import (
15
+ dumb_css_parser,
16
+ element_style,
17
+ escape_md,
18
+ escape_md_section,
19
+ google_fixed_width_font,
20
+ google_has_height,
21
+ google_list_style,
22
+ google_text_emphasis,
23
+ hn,
24
+ list_numbering_start,
25
+ pad_tables_in_text,
26
+ skipwrap,
27
+ unifiable_n,
28
+ )
29
+
30
+ __version__ = (2024, 2, 26)
31
+
32
+
33
+ # TODO:
34
+ # Support decoded entities with UNIFIABLE.
35
+
36
+
37
+ class HTML2Text(html.parser.HTMLParser):
38
+ def __init__(
39
+ self,
40
+ out: Optional[OutCallback] = None,
41
+ baseurl: str = "",
42
+ bodywidth: int = config.BODY_WIDTH,
43
+ ) -> None:
44
+ """
45
+ Input parameters:
46
+ out: possible custom replacement for self.outtextf (which
47
+ appends lines of text).
48
+ baseurl: base URL of the document we process
49
+ """
50
+ super().__init__(convert_charrefs=False)
51
+
52
+ # Config options
53
+ self.split_next_td = False
54
+ self.td_count = 0
55
+ self.table_start = False
56
+ self.unicode_snob = config.UNICODE_SNOB # covered in cli
57
+
58
+ self.escape_snob = config.ESCAPE_SNOB # covered in cli
59
+ self.escape_backslash = config.ESCAPE_BACKSLASH # covered in cli
60
+ self.escape_dot = config.ESCAPE_DOT # covered in cli
61
+ self.escape_plus = config.ESCAPE_PLUS # covered in cli
62
+ self.escape_dash = config.ESCAPE_DASH # covered in cli
63
+
64
+ self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
65
+ self.body_width = bodywidth # covered in cli
66
+ self.skip_internal_links = config.SKIP_INTERNAL_LINKS # covered in cli
67
+ self.inline_links = config.INLINE_LINKS # covered in cli
68
+ self.protect_links = config.PROTECT_LINKS # covered in cli
69
+ self.google_list_indent = config.GOOGLE_LIST_INDENT # covered in cli
70
+ self.ignore_links = config.IGNORE_ANCHORS # covered in cli
71
+ self.ignore_mailto_links = config.IGNORE_MAILTO_LINKS # covered in cli
72
+ self.ignore_images = config.IGNORE_IMAGES # covered in cli
73
+ self.images_as_html = config.IMAGES_AS_HTML # covered in cli
74
+ self.images_to_alt = config.IMAGES_TO_ALT # covered in cli
75
+ self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli
76
+ self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli
77
+ self.bypass_tables = config.BYPASS_TABLES # covered in cli
78
+ self.ignore_tables = config.IGNORE_TABLES # covered in cli
79
+ self.google_doc = False # covered in cli
80
+ self.ul_item_mark = "*" # covered in cli
81
+ self.emphasis_mark = "_" # covered in cli
82
+ self.strong_mark = "**"
83
+ self.single_line_break = config.SINGLE_LINE_BREAK # covered in cli
84
+ self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli
85
+ self.hide_strikethrough = False # covered in cli
86
+ self.mark_code = config.MARK_CODE
87
+ self.wrap_list_items = config.WRAP_LIST_ITEMS # covered in cli
88
+ self.wrap_links = config.WRAP_LINKS # covered in cli
89
+ self.wrap_tables = config.WRAP_TABLES
90
+ self.pad_tables = config.PAD_TABLES # covered in cli
91
+ self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli
92
+ self.tag_callback = None
93
+ self.open_quote = config.OPEN_QUOTE # covered in cli
94
+ self.close_quote = config.CLOSE_QUOTE # covered in cli
95
+ self.include_sup_sub = config.INCLUDE_SUP_SUB # covered in cli
96
+
97
+ if out is None:
98
+ self.out = self.outtextf
99
+ else:
100
+ self.out = out
101
+
102
+ # empty list to store output characters before they are "joined"
103
+ self.outtextlist: List[str] = []
104
+
105
+ self.quiet = 0
106
+ self.p_p = 0 # number of newline character to print before next output
107
+ self.outcount = 0
108
+ self.start = True
109
+ self.space = False
110
+ self.a: List[AnchorElement] = []
111
+ self.astack: List[Optional[Dict[str, Optional[str]]]] = []
112
+ self.maybe_automatic_link: Optional[str] = None
113
+ self.empty_link = False
114
+ self.absolute_url_matcher = re.compile(r"^[a-zA-Z+]+://")
115
+ self.acount = 0
116
+ self.list: List[ListElement] = []
117
+ self.blockquote = 0
118
+ self.pre = False
119
+ self.startpre = False
120
+ self.code = False
121
+ self.quote = False
122
+ self.br_toggle = ""
123
+ self.lastWasNL = False
124
+ self.lastWasList = False
125
+ self.style = 0
126
+ self.style_def: Dict[str, Dict[str, str]] = {}
127
+ self.tag_stack: List[Tuple[str, Dict[str, Optional[str]], Dict[str, str]]] = []
128
+ self.emphasis = 0
129
+ self.drop_white_space = 0
130
+ self.inheader = False
131
+ # Current abbreviation definition
132
+ self.abbr_title: Optional[str] = None
133
+ # Last inner HTML (for abbr being defined)
134
+ self.abbr_data: Optional[str] = None
135
+ # Stack of abbreviations to write later
136
+ self.abbr_list: Dict[str, str] = {}
137
+ self.baseurl = baseurl
138
+ self.stressed = False
139
+ self.preceding_stressed = False
140
+ self.preceding_data = ""
141
+ self.current_tag = ""
142
+
143
+ config.UNIFIABLE["nbsp"] = "&nbsp_place_holder;"
144
+
145
+ def update_params(self, **kwargs):
146
+ for key, value in kwargs.items():
147
+ setattr(self, key, value)
148
+
149
+ def feed(self, data: str) -> None:
150
+ data = data.replace("</' + 'script>", "</ignore>")
151
+ super().feed(data)
152
+
153
+ def handle(self, data: str) -> str:
154
+ self.start = True
155
+ self.feed(data)
156
+ self.feed("")
157
+ markdown = self.optwrap(self.finish())
158
+ if self.pad_tables:
159
+ return pad_tables_in_text(markdown)
160
+ else:
161
+ return markdown
162
+
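# A hedged usage sketch for the vendored converter; the HTML snippet is
# illustrative. handle() feeds the parser and returns Markdown text.
converter = HTML2Text(baseurl="https://example.com", bodywidth=0)  # 0: no line wrapping (conventional html2text value)
converter.ignore_links = False
converter.ignore_images = True
markdown = converter.handle("<h1>Title</h1><p>Some <b>bold</b> text.</p>")
# markdown is now something like "# Title\n\nSome **bold** text.\n\n"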
163
+ def outtextf(self, s: str) -> None:
164
+ self.outtextlist.append(s)
165
+ if s:
166
+ self.lastWasNL = s[-1] == "\n"
167
+
168
+ def finish(self) -> str:
169
+ self.close()
170
+
171
+ self.pbr()
172
+ self.o("", force="end")
173
+
174
+ outtext = "".join(self.outtextlist)
175
+
176
+ if self.unicode_snob:
177
+ nbsp = html.entities.html5["nbsp;"]
178
+ else:
179
+ nbsp = " "
180
+ outtext = outtext.replace("&nbsp_place_holder;", nbsp)
181
+
182
+ # Clear self.outtextlist to avoid memory leak of its content to
183
+ # the next handling.
184
+ self.outtextlist = []
185
+
186
+ return outtext
187
+
188
+ def handle_charref(self, c: str) -> None:
189
+ self.handle_data(self.charref(c), True)
190
+
191
+ def handle_entityref(self, c: str) -> None:
192
+ ref = self.entityref(c)
193
+
194
+ # ref may be an empty string (e.g. for &lrm;/&rlm; markers that should
195
+ # not contribute to the final output).
196
+ # self.handle_data cannot handle a zero-length string right after a
197
+ # stressed tag or mid-text within a stressed tag (text get split and
198
+ # self.stressed/self.preceding_stressed gets switched after the first
199
+ # part of that text).
200
+ if ref:
201
+ self.handle_data(ref, True)
202
+
203
+ def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
204
+ self.handle_tag(tag, dict(attrs), start=True)
205
+
206
+ def handle_endtag(self, tag: str) -> None:
207
+ self.handle_tag(tag, {}, start=False)
208
+
209
+ def previousIndex(self, attrs: Dict[str, Optional[str]]) -> Optional[int]:
210
+ """
211
+ :type attrs: dict
212
+
213
+ :returns: The index of certain set of attributes (of a link) in the
214
+ self.a list. If the set of attributes is not found, returns None
215
+ :rtype: int
216
+ """
217
+ if "href" not in attrs:
218
+ return None
219
+
220
+ match = False
221
+ for i, a in enumerate(self.a):
222
+ if "href" in a.attrs and a.attrs["href"] == attrs["href"]:
223
+ if "title" in a.attrs or "title" in attrs:
224
+ if (
225
+ "title" in a.attrs
226
+ and "title" in attrs
227
+ and a.attrs["title"] == attrs["title"]
228
+ ):
229
+ match = True
230
+ else:
231
+ match = True
232
+
233
+ if match:
234
+ return i
235
+ return None
236
+
237
+ def handle_emphasis(
238
+ self, start: bool, tag_style: Dict[str, str], parent_style: Dict[str, str]
239
+ ) -> None:
240
+ """
241
+ Handles various text emphases
242
+ """
243
+ tag_emphasis = google_text_emphasis(tag_style)
244
+ parent_emphasis = google_text_emphasis(parent_style)
245
+
246
+ # handle Google's text emphasis
247
+ strikethrough = "line-through" in tag_emphasis and self.hide_strikethrough
248
+
249
+ # google and others may mark a font's weight as `bold` or `700`
250
+ bold = False
251
+ for bold_marker in config.BOLD_TEXT_STYLE_VALUES:
252
+ bold = bold_marker in tag_emphasis and bold_marker not in parent_emphasis
253
+ if bold:
254
+ break
255
+
256
+ italic = "italic" in tag_emphasis and "italic" not in parent_emphasis
257
+ fixed = (
258
+ google_fixed_width_font(tag_style)
259
+ and not google_fixed_width_font(parent_style)
260
+ and not self.pre
261
+ )
262
+
263
+ if start:
264
+ # crossed-out text must be handled before other attributes
265
+ # in order not to output qualifiers unnecessarily
266
+ if bold or italic or fixed:
267
+ self.emphasis += 1
268
+ if strikethrough:
269
+ self.quiet += 1
270
+ if italic:
271
+ self.o(self.emphasis_mark)
272
+ self.drop_white_space += 1
273
+ if bold:
274
+ self.o(self.strong_mark)
275
+ self.drop_white_space += 1
276
+ if fixed:
277
+ self.o("`")
278
+ self.drop_white_space += 1
279
+ self.code = True
280
+ else:
281
+ if bold or italic or fixed:
282
+ # there must not be whitespace before closing emphasis mark
283
+ self.emphasis -= 1
284
+ self.space = False
285
+ if fixed:
286
+ if self.drop_white_space:
287
+ # empty emphasis, drop it
288
+ self.drop_white_space -= 1
289
+ else:
290
+ self.o("`")
291
+ self.code = False
292
+ if bold:
293
+ if self.drop_white_space:
294
+ # empty emphasis, drop it
295
+ self.drop_white_space -= 1
296
+ else:
297
+ self.o(self.strong_mark)
298
+ if italic:
299
+ if self.drop_white_space:
300
+ # empty emphasis, drop it
301
+ self.drop_white_space -= 1
302
+ else:
303
+ self.o(self.emphasis_mark)
304
+ # space is only allowed after *all* emphasis marks
305
+ if (bold or italic) and not self.emphasis:
306
+ self.o(" ")
307
+ if strikethrough:
308
+ self.quiet -= 1
309
+
310
+ def handle_tag(
311
+ self, tag: str, attrs: Dict[str, Optional[str]], start: bool
312
+ ) -> None:
313
+ self.current_tag = tag
314
+
315
+ if self.tag_callback is not None:
316
+ if self.tag_callback(self, tag, attrs, start) is True:
317
+ return
318
+
319
+ # first thing inside the anchor tag is another tag
320
+ # that produces some output
321
+ if (
322
+ start
323
+ and self.maybe_automatic_link is not None
324
+ and tag not in ["p", "div", "style", "dl", "dt"]
325
+ and (tag != "img" or self.ignore_images)
326
+ ):
327
+ self.o("[")
328
+ self.maybe_automatic_link = None
329
+ self.empty_link = False
330
+
331
+ if self.google_doc:
332
+ # the attrs parameter is empty for a closing tag. in addition, we
333
+ # need the attributes of the parent nodes in order to get a
334
+ # complete style description for the current element. we assume
335
+ # that google docs export well formed html.
336
+ parent_style: Dict[str, str] = {}
337
+ if start:
338
+ if self.tag_stack:
339
+ parent_style = self.tag_stack[-1][2]
340
+ tag_style = element_style(attrs, self.style_def, parent_style)
341
+ self.tag_stack.append((tag, attrs, tag_style))
342
+ else:
343
+ dummy, attrs, tag_style = (
344
+ self.tag_stack.pop() if self.tag_stack else (None, {}, {})
345
+ )
346
+ if self.tag_stack:
347
+ parent_style = self.tag_stack[-1][2]
348
+
349
+ if hn(tag):
350
+ # check if nh is inside of an 'a' tag (incorrect but found in the wild)
351
+ if self.astack:
352
+ if start:
353
+ self.inheader = True
354
+ # are inside link name, so only add '#' if it can appear before '['
355
+ if self.outtextlist and self.outtextlist[-1] == "[":
356
+ self.outtextlist.pop()
357
+ self.space = False
358
+ self.o(hn(tag) * "#" + " ")
359
+ self.o("[")
360
+ else:
361
+ self.p_p = 0 # don't break up link name
362
+ self.inheader = False
363
+ return # prevent redundant emphasis marks on headers
364
+ else:
365
+ self.p()
366
+ if start:
367
+ self.inheader = True
368
+ self.o(hn(tag) * "#" + " ")
369
+ else:
370
+ self.inheader = False
371
+ return # prevent redundant emphasis marks on headers
372
+
373
+ if tag in ["p", "div"]:
374
+ if self.google_doc:
375
+ if start and google_has_height(tag_style):
376
+ self.p()
377
+ else:
378
+ self.soft_br()
379
+ elif self.astack:
380
+ pass
381
+ elif self.split_next_td:
382
+ pass
383
+ else:
384
+ self.p()
385
+
386
+ if tag == "br" and start:
387
+ if self.blockquote > 0:
388
+ self.o(" \n> ")
389
+ else:
390
+ self.o(" \n")
391
+
392
+ if tag == "hr" and start:
393
+ self.p()
394
+ self.o("* * *")
395
+ self.p()
396
+
397
+ if tag in ["head", "style", "script"]:
398
+ if start:
399
+ self.quiet += 1
400
+ else:
401
+ self.quiet -= 1
402
+
403
+ if tag == "style":
404
+ if start:
405
+ self.style += 1
406
+ else:
407
+ self.style -= 1
408
+
409
+ if tag in ["body"]:
410
+ self.quiet = 0 # sites like 9rules.com never close <head>
411
+
412
+ if tag == "blockquote":
413
+ if start:
414
+ self.p()
415
+ self.o("> ", force=True)
416
+ self.start = True
417
+ self.blockquote += 1
418
+ else:
419
+ self.blockquote -= 1
420
+ self.p()
421
+
422
+ if tag in ["em", "i", "u"] and not self.ignore_emphasis:
423
+ # Separate with a space if we immediately follow an alphanumeric
424
+ # character, since otherwise Markdown won't render the emphasis
425
+ # marks, and we'll be left with eg 'foo_bar_' visible.
426
+ # (Don't add a space otherwise, though, since there isn't one in the
427
+ # original HTML.)
428
+ if (
429
+ start
430
+ and self.preceding_data
431
+ and self.preceding_data[-1] not in string.whitespace
432
+ and self.preceding_data[-1] not in string.punctuation
433
+ ):
434
+ emphasis = " " + self.emphasis_mark
435
+ self.preceding_data += " "
436
+ else:
437
+ emphasis = self.emphasis_mark
438
+
439
+ self.o(emphasis)
440
+ if start:
441
+ self.stressed = True
442
+
443
+ if tag in ["strong", "b"] and not self.ignore_emphasis:
444
+ # Separate with space if we immediately follow an * character, since
445
+ # without it, Markdown won't render the resulting *** correctly.
446
+ # (Don't add a space otherwise, though, since there isn't one in the
447
+ # original HTML.)
448
+ if (
449
+ start
450
+ and self.preceding_data
451
+ # When `self.strong_mark` is set to empty, the next condition
452
+ # will cause IndexError since it's trying to match the data
453
+ # with the first character of the `self.strong_mark`.
454
+ and len(self.strong_mark) > 0
455
+ and self.preceding_data[-1] == self.strong_mark[0]
456
+ ):
457
+ strong = " " + self.strong_mark
458
+ self.preceding_data += " "
459
+ else:
460
+ strong = self.strong_mark
461
+
462
+ self.o(strong)
463
+ if start:
464
+ self.stressed = True
465
+
466
+ if tag in ["del", "strike", "s"]:
467
+ if start and self.preceding_data and self.preceding_data[-1] == "~":
468
+ strike = " ~~"
469
+ self.preceding_data += " "
470
+ else:
471
+ strike = "~~"
472
+
473
+ self.o(strike)
474
+ if start:
475
+ self.stressed = True
476
+
477
+ if self.google_doc:
478
+ if not self.inheader:
479
+ # handle some font attributes, but leave headers clean
480
+ self.handle_emphasis(start, tag_style, parent_style)
481
+
482
+ if tag in ["kbd", "code", "tt"] and not self.pre:
483
+ self.o("`") # TODO: `` `this` ``
484
+ self.code = not self.code
485
+
486
+ if tag == "abbr":
487
+ if start:
488
+ self.abbr_title = None
489
+ self.abbr_data = ""
490
+ if "title" in attrs:
491
+ self.abbr_title = attrs["title"]
492
+ else:
493
+ if self.abbr_title is not None:
494
+ assert self.abbr_data is not None
495
+ self.abbr_list[self.abbr_data] = self.abbr_title
496
+ self.abbr_title = None
497
+ self.abbr_data = None
498
+
499
+ if tag == "q":
500
+ if not self.quote:
501
+ self.o(self.open_quote)
502
+ else:
503
+ self.o(self.close_quote)
504
+ self.quote = not self.quote
505
+
506
+ def link_url(self: HTML2Text, link: str, title: str = "") -> None:
507
+ url = urlparse.urljoin(self.baseurl, link)
508
+ title = ' "{}"'.format(title) if title.strip() else ""
509
+ self.o("]({url}{title})".format(url=escape_md(url), title=title))
510
+
511
+ if tag == "a" and not self.ignore_links:
512
+ if start:
513
+ if (
514
+ "href" in attrs
515
+ and attrs["href"] is not None
516
+ and not (self.skip_internal_links and attrs["href"].startswith("#"))
517
+ and not (
518
+ self.ignore_mailto_links and attrs["href"].startswith("mailto:")
519
+ )
520
+ ):
521
+ self.astack.append(attrs)
522
+ self.maybe_automatic_link = attrs["href"]
523
+ self.empty_link = True
524
+ if self.protect_links:
525
+ attrs["href"] = "<" + attrs["href"] + ">"
526
+ else:
527
+ self.astack.append(None)
528
+ else:
529
+ if self.astack:
530
+ a = self.astack.pop()
531
+ if self.maybe_automatic_link and not self.empty_link:
532
+ self.maybe_automatic_link = None
533
+ elif a:
534
+ assert a["href"] is not None
535
+ if self.empty_link:
536
+ self.o("[")
537
+ self.empty_link = False
538
+ self.maybe_automatic_link = None
539
+ if self.inline_links:
540
+ self.p_p = 0
541
+ title = a.get("title") or ""
542
+ title = escape_md(title)
543
+ link_url(self, a["href"], title)
544
+ else:
545
+ i = self.previousIndex(a)
546
+ if i is not None:
547
+ a_props = self.a[i]
548
+ else:
549
+ self.acount += 1
550
+ a_props = AnchorElement(a, self.acount, self.outcount)
551
+ self.a.append(a_props)
552
+ self.o("][" + str(a_props.count) + "]")
553
+
554
+ if tag == "img" and start and not self.ignore_images:
555
+ if "src" in attrs and attrs["src"] is not None:
556
+ if not self.images_to_alt:
557
+ attrs["href"] = attrs["src"]
558
+ alt = attrs.get("alt") or self.default_image_alt
559
+
560
+ # If we have images_with_size, write raw html including width,
561
+ # height, and alt attributes
562
+ if self.images_as_html or (
563
+ self.images_with_size and ("width" in attrs or "height" in attrs)
564
+ ):
565
+ self.o("<img src='" + attrs["src"] + "' ")
566
+ if "width" in attrs and attrs["width"] is not None:
567
+ self.o("width='" + attrs["width"] + "' ")
568
+ if "height" in attrs and attrs["height"] is not None:
569
+ self.o("height='" + attrs["height"] + "' ")
570
+ if alt:
571
+ self.o("alt='" + alt + "' ")
572
+ self.o("/>")
573
+ return
574
+
575
+ # If we have a link to create, output the start
576
+ if self.maybe_automatic_link is not None:
577
+ href = self.maybe_automatic_link
578
+ if (
579
+ self.images_to_alt
580
+ and escape_md(alt) == href
581
+ and self.absolute_url_matcher.match(href)
582
+ ):
583
+ self.o("<" + escape_md(alt) + ">")
584
+ self.empty_link = False
585
+ return
586
+ else:
587
+ self.o("[")
588
+ self.maybe_automatic_link = None
589
+ self.empty_link = False
590
+
591
+ # If we have images_to_alt, we discard the image itself,
592
+ # considering only the alt text.
593
+ if self.images_to_alt:
594
+ self.o(escape_md(alt))
595
+ else:
596
+ self.o("![" + escape_md(alt) + "]")
597
+ if self.inline_links:
598
+ href = attrs.get("href") or ""
599
+ self.o(
600
+ "(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")"
601
+ )
602
+ else:
603
+ i = self.previousIndex(attrs)
604
+ if i is not None:
605
+ a_props = self.a[i]
606
+ else:
607
+ self.acount += 1
608
+ a_props = AnchorElement(attrs, self.acount, self.outcount)
609
+ self.a.append(a_props)
610
+ self.o("[" + str(a_props.count) + "]")
611
+
612
+ if tag == "dl" and start:
613
+ self.p()
614
+ if tag == "dt" and not start:
615
+ self.pbr()
616
+ if tag == "dd" and start:
617
+ self.o(" ")
618
+ if tag == "dd" and not start:
619
+ self.pbr()
620
+
621
+ if tag in ["ol", "ul"]:
622
+ # Google Docs create sub lists as top level lists
623
+ if not self.list and not self.lastWasList:
624
+ self.p()
625
+ if start:
626
+ if self.google_doc:
627
+ list_style = google_list_style(tag_style)
628
+ else:
629
+ list_style = tag
630
+ numbering_start = list_numbering_start(attrs)
631
+ self.list.append(ListElement(list_style, numbering_start))
632
+ else:
633
+ if self.list:
634
+ self.list.pop()
635
+ if not self.google_doc and not self.list:
636
+ self.o("\n")
637
+ self.lastWasList = True
638
+ else:
639
+ self.lastWasList = False
640
+
641
+ if tag == "li":
642
+ self.pbr()
643
+ if start:
644
+ if self.list:
645
+ li = self.list[-1]
646
+ else:
647
+ li = ListElement("ul", 0)
648
+ if self.google_doc:
649
+ self.o(" " * self.google_nest_count(tag_style))
650
+ else:
651
+ # Indent two spaces per list, except use three spaces for an
652
+ # unordered list inside an ordered list.
653
+ # https://spec.commonmark.org/0.28/#motivation
654
+ # TODO: line up <ol><li>s > 9 correctly.
655
+ parent_list = None
656
+ for list in self.list:
657
+ self.o(
658
+ " " if parent_list == "ol" and list.name == "ul" else " "
659
+ )
660
+ parent_list = list.name
661
+
662
+ if li.name == "ul":
663
+ self.o(self.ul_item_mark + " ")
664
+ elif li.name == "ol":
665
+ li.num += 1
666
+ self.o(str(li.num) + ". ")
667
+ self.start = True
668
+
669
+ if tag in ["table", "tr", "td", "th"]:
670
+ if self.ignore_tables:
671
+ if tag == "tr":
672
+ if start:
673
+ pass
674
+ else:
675
+ self.soft_br()
676
+ else:
677
+ pass
678
+
679
+ elif self.bypass_tables:
680
+ if start:
681
+ self.soft_br()
682
+ if tag in ["td", "th"]:
683
+ if start:
684
+ self.o("<{}>\n\n".format(tag))
685
+ else:
686
+ self.o("\n</{}>".format(tag))
687
+ else:
688
+ if start:
689
+ self.o("<{}>".format(tag))
690
+ else:
691
+ self.o("</{}>".format(tag))
692
+
693
+ else:
694
+ if tag == "table":
695
+ if start:
696
+ self.table_start = True
697
+ if self.pad_tables:
698
+ self.o("<" + config.TABLE_MARKER_FOR_PAD + ">")
699
+ self.o(" \n")
700
+ else:
701
+ if self.pad_tables:
702
+ # add break in case the table is empty or its 1 row table
703
+ self.soft_br()
704
+ self.o("</" + config.TABLE_MARKER_FOR_PAD + ">")
705
+ self.o(" \n")
706
+ if tag in ["td", "th"] and start:
707
+ if self.split_next_td:
708
+ self.o("| ")
709
+ self.split_next_td = True
710
+
711
+ if tag == "tr" and start:
712
+ self.td_count = 0
713
+ if tag == "tr" and not start:
714
+ self.split_next_td = False
715
+ self.soft_br()
716
+ if tag == "tr" and not start and self.table_start:
717
+ # Underline table header
718
+ self.o("|".join(["---"] * self.td_count))
719
+ self.soft_br()
720
+ self.table_start = False
721
+ if tag in ["td", "th"] and start:
722
+ self.td_count += 1
723
+
724
+ if tag == "pre":
725
+ if start:
726
+ self.startpre = True
727
+ self.pre = True
728
+ else:
729
+ self.pre = False
730
+ if self.mark_code:
731
+ self.out("\n[/code]")
732
+ self.p()
733
+
734
+ if tag in ["sup", "sub"] and self.include_sup_sub:
735
+ if start:
736
+ self.o("<{}>".format(tag))
737
+ else:
738
+ self.o("</{}>".format(tag))
739
+
740
+ # TODO: Add docstring for these one letter functions
741
+ def pbr(self) -> None:
742
+ "Pretty print has a line break"
743
+ if self.p_p == 0:
744
+ self.p_p = 1
745
+
746
+ def p(self) -> None:
747
+ "Set pretty print to 1 or 2 lines"
748
+ self.p_p = 1 if self.single_line_break else 2
749
+
750
+ def soft_br(self) -> None:
751
+ "Soft breaks"
752
+ self.pbr()
753
+ self.br_toggle = " "
754
+
755
+ def o(
756
+ self, data: str, puredata: bool = False, force: Union[bool, str] = False
757
+ ) -> None:
758
+ """
759
+ Deal with indentation and whitespace
760
+ """
761
+ if self.abbr_data is not None:
762
+ self.abbr_data += data
763
+
764
+ if not self.quiet:
765
+ if self.google_doc:
766
+ # prevent white space immediately after 'begin emphasis'
767
+ # marks ('**' and '_')
768
+ lstripped_data = data.lstrip()
769
+ if self.drop_white_space and not (self.pre or self.code):
770
+ data = lstripped_data
771
+ if lstripped_data != "":
772
+ self.drop_white_space = 0
773
+
774
+ if puredata and not self.pre:
775
+ # This is a very dangerous call ... it could mess up
776
+ # all handling of &nbsp; when not handled properly
777
+ # (see entityref)
778
+ data = re.sub(r"\s+", r" ", data)
779
+ if data and data[0] == " ":
780
+ self.space = True
781
+ data = data[1:]
782
+ if not data and not force:
783
+ return
784
+
785
+ if self.startpre:
786
+ # self.out(" :") #TODO: not output when already one there
787
+ if not data.startswith("\n") and not data.startswith("\r\n"):
788
+ # <pre>stuff...
789
+ data = "\n" + data
790
+ if self.mark_code:
791
+ self.out("\n[code]")
792
+ self.p_p = 0
793
+
794
+ bq = ">" * self.blockquote
795
+ if not (force and data and data[0] == ">") and self.blockquote:
796
+ bq += " "
797
+
798
+ if self.pre:
799
+ if not self.list:
800
+ bq += " "
801
+ # else: list content is already partially indented
802
+ bq += " " * len(self.list)
803
+ data = data.replace("\n", "\n" + bq)
804
+
805
+ if self.startpre:
806
+ self.startpre = False
807
+ if self.list:
808
+ # use existing initial indentation
809
+ data = data.lstrip("\n")
810
+
811
+ if self.start:
812
+ self.space = False
813
+ self.p_p = 0
814
+ self.start = False
815
+
816
+ if force == "end":
817
+ # It's the end.
818
+ self.p_p = 0
819
+ self.out("\n")
820
+ self.space = False
821
+
822
+ if self.p_p:
823
+ self.out((self.br_toggle + "\n" + bq) * self.p_p)
824
+ self.space = False
825
+ self.br_toggle = ""
826
+
827
+ if self.space:
828
+ if not self.lastWasNL:
829
+ self.out(" ")
830
+ self.space = False
831
+
832
+ if self.a and (
833
+ (self.p_p == 2 and self.links_each_paragraph) or force == "end"
834
+ ):
835
+ if force == "end":
836
+ self.out("\n")
837
+
838
+ newa = []
839
+ for link in self.a:
840
+ if self.outcount > link.outcount:
841
+ self.out(
842
+ " ["
843
+ + str(link.count)
844
+ + "]: "
845
+ + urlparse.urljoin(self.baseurl, link.attrs["href"])
846
+ )
847
+ if "title" in link.attrs and link.attrs["title"] is not None:
848
+ self.out(" (" + link.attrs["title"] + ")")
849
+ self.out("\n")
850
+ else:
851
+ newa.append(link)
852
+
853
+ # Don't need an extra line when nothing was done.
854
+ if self.a != newa:
855
+ self.out("\n")
856
+
857
+ self.a = newa
858
+
859
+ if self.abbr_list and force == "end":
860
+ for abbr, definition in self.abbr_list.items():
861
+ self.out(" *[" + abbr + "]: " + definition + "\n")
862
+
863
+ self.p_p = 0
864
+ self.out(data)
865
+ self.outcount += 1
866
+
867
+ def handle_data(self, data: str, entity_char: bool = False) -> None:
868
+ if not data:
869
+ # Data may be empty for some HTML entities. For example,
870
+ # LEFT-TO-RIGHT MARK.
871
+ return
872
+
873
+ if self.stressed:
874
+ data = data.strip()
875
+ self.stressed = False
876
+ self.preceding_stressed = True
877
+ elif self.preceding_stressed:
878
+ if (
879
+ re.match(r"[^][(){}\s.!?]", data[0])
880
+ and not hn(self.current_tag)
881
+ and self.current_tag not in ["a", "code", "pre"]
882
+ ):
883
+ # should match a letter or common punctuation
884
+ data = " " + data
885
+ self.preceding_stressed = False
886
+
887
+ if self.style:
888
+ self.style_def.update(dumb_css_parser(data))
889
+
890
+ if self.maybe_automatic_link is not None:
891
+ href = self.maybe_automatic_link
892
+ if (
893
+ href == data
894
+ and self.absolute_url_matcher.match(href)
895
+ and self.use_automatic_links
896
+ ):
897
+ self.o("<" + data + ">")
898
+ self.empty_link = False
899
+ return
900
+ else:
901
+ self.o("[")
902
+ self.maybe_automatic_link = None
903
+ self.empty_link = False
904
+
905
+ if not self.code and not self.pre and not entity_char:
906
+ data = escape_md_section(data, snob=self.escape_snob, escape_dot=self.escape_dot, escape_plus=self.escape_plus, escape_dash=self.escape_dash)
907
+ self.preceding_data = data
908
+ self.o(data, puredata=True)
909
+
910
+ def charref(self, name: str) -> str:
911
+ if name[0] in ["x", "X"]:
912
+ c = int(name[1:], 16)
913
+ else:
914
+ c = int(name)
915
+
916
+ if not self.unicode_snob and c in unifiable_n:
917
+ return unifiable_n[c]
918
+ else:
919
+ try:
920
+ return chr(c)
921
+ except ValueError: # invalid unicode
922
+ return ""
923
+
924
+ def entityref(self, c: str) -> str:
925
+ if not self.unicode_snob and c in config.UNIFIABLE:
926
+ return config.UNIFIABLE[c]
927
+ try:
928
+ ch = html.entities.html5[c + ";"]
929
+ except KeyError:
930
+ return "&" + c + ";"
931
+ return config.UNIFIABLE[c] if c == "nbsp" else ch
932
+
933
+ def google_nest_count(self, style: Dict[str, str]) -> int:
934
+ """
935
+ Calculate the nesting count of google doc lists
936
+
937
+ :type style: dict
938
+
939
+ :rtype: int
940
+ """
941
+ nest_count = 0
942
+ if "margin-left" in style:
943
+ nest_count = int(style["margin-left"][:-2]) // self.google_list_indent
944
+
945
+ return nest_count
946
+
947
+ def optwrap(self, text: str) -> str:
948
+ """
949
+ Wrap all paragraphs in the provided text.
950
+
951
+ :type text: str
952
+
953
+ :rtype: str
954
+ """
955
+ if not self.body_width:
956
+ return text
957
+
958
+ result = ""
959
+ newlines = 0
960
+ # I cannot think of a better solution for now.
961
+ # To avoid the non-wrap behaviour for entire paras
962
+ # because of the presence of a link in it
963
+ if not self.wrap_links:
964
+ self.inline_links = False
965
+ for para in text.split("\n"):
966
+ if len(para) > 0:
967
+ if not skipwrap(
968
+ para, self.wrap_links, self.wrap_list_items, self.wrap_tables
969
+ ):
970
+ indent = ""
971
+ if para.startswith("  " + self.ul_item_mark):
972
+ # list item continuation: add a double indent to the
973
+ # new lines
974
+ indent = " "
975
+ elif para.startswith("> "):
976
+ # blockquote continuation: add the greater than symbol
977
+ # to the new lines
978
+ indent = "> "
979
+ wrapped = wrap(
980
+ para,
981
+ self.body_width,
982
+ break_long_words=False,
983
+ subsequent_indent=indent,
984
+ )
985
+ result += "\n".join(wrapped)
986
+ if para.endswith("  "):
987
+ result += " \n"
988
+ newlines = 1
989
+ elif indent:
990
+ result += "\n"
991
+ newlines = 1
992
+ else:
993
+ result += "\n\n"
994
+ newlines = 2
995
+ else:
996
+ # Warning for the tempted!!!
997
+ # Be aware that obvious replacement of this with
998
+ # line.isspace()
999
+ # DOES NOT work! Explanations are welcome.
1000
+ if not config.RE_SPACE.match(para):
1001
+ result += para + "\n"
1002
+ newlines = 1
1003
+ else:
1004
+ if newlines < 2:
1005
+ result += "\n"
1006
+ newlines += 1
1007
+ return result
1008
+
1009
+ def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str:
1010
+ if bodywidth is None:
1011
+ bodywidth = config.BODY_WIDTH
1012
+ h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth)
1013
+
1014
+ return h.handle(html)
1015
+
1016
+ class CustomHTML2Text(HTML2Text):
1017
+ def __init__(self, *args, handle_code_in_pre=False, **kwargs):
1018
+ super().__init__(*args, **kwargs)
1019
+ self.inside_pre = False
1020
+ self.inside_code = False
1021
+ self.preserve_tags = set() # Set of tags to preserve
1022
+ self.current_preserved_tag = None
1023
+ self.preserved_content = []
1024
+ self.preserve_depth = 0
1025
+ self.handle_code_in_pre = handle_code_in_pre
1026
+
1027
+ # Configuration options
1028
+ self.skip_internal_links = False
1029
+ self.single_line_break = False
1030
+ self.mark_code = False
1031
+ self.include_sup_sub = False
1032
+ self.body_width = 0
1033
+ self.ignore_mailto_links = True
1034
+ self.ignore_links = False
1035
+ self.escape_backslash = False
1036
+ self.escape_dot = False
1037
+ self.escape_plus = False
1038
+ self.escape_dash = False
1039
+ self.escape_snob = False
1040
+
1041
+ def update_params(self, **kwargs):
1042
+ """Update parameters and set preserved tags."""
1043
+ for key, value in kwargs.items():
1044
+ if key == 'preserve_tags':
1045
+ self.preserve_tags = set(value)
1046
+ elif key == 'handle_code_in_pre':
1047
+ self.handle_code_in_pre = value
1048
+ else:
1049
+ setattr(self, key, value)
1050
+
1051
+ def handle_tag(self, tag, attrs, start):
1052
+ # Handle preserved tags
1053
+ if tag in self.preserve_tags:
1054
+ if start:
1055
+ if self.preserve_depth == 0:
1056
+ self.current_preserved_tag = tag
1057
+ self.preserved_content = []
1058
+ # Format opening tag with attributes
1059
+ attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
1060
+ self.preserved_content.append(f'<{tag}{attr_str}>')
1061
+ self.preserve_depth += 1
1062
+ return
1063
+ else:
1064
+ self.preserve_depth -= 1
1065
+ if self.preserve_depth == 0:
1066
+ self.preserved_content.append(f'</{tag}>')
1067
+ # Output the preserved HTML block with proper spacing
1068
+ preserved_html = ''.join(self.preserved_content)
1069
+ self.o('\n' + preserved_html + '\n')
1070
+ self.current_preserved_tag = None
1071
+ return
1072
+
1073
+ # If we're inside a preserved tag, collect all content
1074
+ if self.preserve_depth > 0:
1075
+ if start:
1076
+ # Format nested tags with attributes
1077
+ attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
1078
+ self.preserved_content.append(f'<{tag}{attr_str}>')
1079
+ else:
1080
+ self.preserved_content.append(f'</{tag}>')
1081
+ return
1082
+
1083
+ # Handle pre tags
1084
+ if tag == 'pre':
1085
+ if start:
1086
+ self.o('```\n') # Markdown code block start
1087
+ self.inside_pre = True
1088
+ else:
1089
+ self.o('\n```\n') # Markdown code block end
1090
+ self.inside_pre = False
1091
+ elif tag == 'code':
1092
+ if self.inside_pre and not self.handle_code_in_pre:
1093
+ # Ignore code tags inside pre blocks if handle_code_in_pre is False
1094
+ return
1095
+ if start:
1096
+ self.o('`') # Markdown inline code start
1097
+ self.inside_code = True
1098
+ else:
1099
+ self.o('`') # Markdown inline code end
1100
+ self.inside_code = False
1101
+ else:
1102
+ super().handle_tag(tag, attrs, start)
1103
+
1104
+ def handle_data(self, data, entity_char=False):
1105
+ """Override handle_data to capture content within preserved tags."""
1106
+ if self.preserve_depth > 0:
1107
+ self.preserved_content.append(data)
1108
+ return
1109
+
1110
+ if self.inside_pre:
1111
+ # Output the raw content for pre blocks, including content inside code tags
1112
+ self.o(data) # Directly output the data as-is (preserve newlines)
1113
+ return
1114
+ if self.inside_code:
1115
+ # Inline code: no newlines allowed
1116
+ self.o(data.replace('\n', ' '))
1117
+ return
1118
+
1119
+ # Default behavior for other tags
1120
+ super().handle_data(data, entity_char)
1121
+
1122
+
1123
+ # # Handle pre tags
1124
+ # if tag == 'pre':
1125
+ # if start:
1126
+ # self.o('```\n')
1127
+ # self.inside_pre = True
1128
+ # else:
1129
+ # self.o('\n```')
1130
+ # self.inside_pre = False
1131
+ # # elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
1132
+ # # pass
1133
+ # else:
1134
+ # super().handle_tag(tag, attrs, start)
1135
+
1136
+ # def handle_data(self, data, entity_char=False):
1137
+ # """Override handle_data to capture content within preserved tags."""
1138
+ # if self.preserve_depth > 0:
1139
+ # self.preserved_content.append(data)
1140
+ # return
1141
+ # super().handle_data(data, entity_char)
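To illustrate how the CustomHTML2Text subclass above is meant to be driven, here is a minimal sketch (not part of this diff; the sample HTML string is made up) that preserves <table> blocks as raw HTML while converting everything else to Markdown:

    from crawl4ai.html2text import CustomHTML2Text

    h = CustomHTML2Text(handle_code_in_pre=True)
    h.update_params(preserve_tags={"table"}, ignore_links=False)

    html = "<h1>Title</h1><pre><code>print('hi')</code></pre><table><tr><td>kept</td></tr></table>"
    print(h.handle(html))  # handle() is inherited from HTML2Text and returns the Markdown string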
crawl4ai/html2text/__main__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .cli import main
2
+
3
+ main()
crawl4ai/html2text/_typing.py ADDED
@@ -0,0 +1,2 @@
1
+ class OutCallback:
2
+ def __call__(self, s: str) -> None: ...
crawl4ai/html2text/cli.py ADDED
@@ -0,0 +1,330 @@
1
+ import argparse
2
+ import sys
3
+
4
+ from . import HTML2Text, __version__, config
5
+
6
+
7
+ def main() -> None:
8
+ baseurl = ""
9
+
10
+ class bcolors:
11
+ HEADER = "\033[95m"
12
+ OKBLUE = "\033[94m"
13
+ OKGREEN = "\033[92m"
14
+ WARNING = "\033[93m"
15
+ FAIL = "\033[91m"
16
+ ENDC = "\033[0m"
17
+ BOLD = "\033[1m"
18
+ UNDERLINE = "\033[4m"
19
+
20
+ p = argparse.ArgumentParser()
21
+ p.add_argument(
22
+ "--default-image-alt",
23
+ dest="default_image_alt",
24
+ default=config.DEFAULT_IMAGE_ALT,
25
+ help="The default alt string for images with missing ones",
26
+ )
27
+ p.add_argument(
28
+ "--pad-tables",
29
+ dest="pad_tables",
30
+ action="store_true",
31
+ default=config.PAD_TABLES,
32
+ help="pad the cells to equal column width in tables",
33
+ )
34
+ p.add_argument(
35
+ "--no-wrap-links",
36
+ dest="wrap_links",
37
+ action="store_false",
38
+ default=config.WRAP_LINKS,
39
+ help="don't wrap links during conversion",
40
+ )
41
+ p.add_argument(
42
+ "--wrap-list-items",
43
+ dest="wrap_list_items",
44
+ action="store_true",
45
+ default=config.WRAP_LIST_ITEMS,
46
+ help="wrap list items during conversion",
47
+ )
48
+ p.add_argument(
49
+ "--wrap-tables",
50
+ dest="wrap_tables",
51
+ action="store_true",
52
+ default=config.WRAP_TABLES,
53
+ help="wrap tables",
54
+ )
55
+ p.add_argument(
56
+ "--ignore-emphasis",
57
+ dest="ignore_emphasis",
58
+ action="store_true",
59
+ default=config.IGNORE_EMPHASIS,
60
+ help="don't include any formatting for emphasis",
61
+ )
62
+ p.add_argument(
63
+ "--reference-links",
64
+ dest="inline_links",
65
+ action="store_false",
66
+ default=config.INLINE_LINKS,
67
+ help="use reference style links instead of inline links",
68
+ )
69
+ p.add_argument(
70
+ "--ignore-links",
71
+ dest="ignore_links",
72
+ action="store_true",
73
+ default=config.IGNORE_ANCHORS,
74
+ help="don't include any formatting for links",
75
+ )
76
+ p.add_argument(
77
+ "--ignore-mailto-links",
78
+ action="store_true",
79
+ dest="ignore_mailto_links",
80
+ default=config.IGNORE_MAILTO_LINKS,
81
+ help="don't include mailto: links",
82
+ )
83
+ p.add_argument(
84
+ "--protect-links",
85
+ dest="protect_links",
86
+ action="store_true",
87
+ default=config.PROTECT_LINKS,
88
+ help="protect links from line breaks surrounding them with angle brackets",
89
+ )
90
+ p.add_argument(
91
+ "--ignore-images",
92
+ dest="ignore_images",
93
+ action="store_true",
94
+ default=config.IGNORE_IMAGES,
95
+ help="don't include any formatting for images",
96
+ )
97
+ p.add_argument(
98
+ "--images-as-html",
99
+ dest="images_as_html",
100
+ action="store_true",
101
+ default=config.IMAGES_AS_HTML,
102
+ help=(
103
+ "Always write image tags as raw html; preserves `height`, `width` and "
104
+ "`alt` if possible."
105
+ ),
106
+ )
107
+ p.add_argument(
108
+ "--images-to-alt",
109
+ dest="images_to_alt",
110
+ action="store_true",
111
+ default=config.IMAGES_TO_ALT,
112
+ help="Discard image data, only keep alt text",
113
+ )
114
+ p.add_argument(
115
+ "--images-with-size",
116
+ dest="images_with_size",
117
+ action="store_true",
118
+ default=config.IMAGES_WITH_SIZE,
119
+ help=(
120
+ "Write image tags with height and width attrs as raw html to retain "
121
+ "dimensions"
122
+ ),
123
+ )
124
+ p.add_argument(
125
+ "-g",
126
+ "--google-doc",
127
+ action="store_true",
128
+ dest="google_doc",
129
+ default=False,
130
+ help="convert an html-exported Google Document",
131
+ )
132
+ p.add_argument(
133
+ "-d",
134
+ "--dash-unordered-list",
135
+ action="store_true",
136
+ dest="ul_style_dash",
137
+ default=False,
138
+ help="use a dash rather than a star for unordered list items",
139
+ )
140
+ p.add_argument(
141
+ "-e",
142
+ "--asterisk-emphasis",
143
+ action="store_true",
144
+ dest="em_style_asterisk",
145
+ default=False,
146
+ help="use an asterisk rather than an underscore for emphasized text",
147
+ )
148
+ p.add_argument(
149
+ "-b",
150
+ "--body-width",
151
+ dest="body_width",
152
+ type=int,
153
+ default=config.BODY_WIDTH,
154
+ help="number of characters per output line, 0 for no wrap",
155
+ )
156
+ p.add_argument(
157
+ "-i",
158
+ "--google-list-indent",
159
+ dest="list_indent",
160
+ type=int,
161
+ default=config.GOOGLE_LIST_INDENT,
162
+ help="number of pixels Google indents nested lists",
163
+ )
164
+ p.add_argument(
165
+ "-s",
166
+ "--hide-strikethrough",
167
+ action="store_true",
168
+ dest="hide_strikethrough",
169
+ default=False,
170
+ help="hide strike-through text. only relevant when -g is " "specified as well",
171
+ )
172
+ p.add_argument(
173
+ "--escape-all",
174
+ action="store_true",
175
+ dest="escape_snob",
176
+ default=False,
177
+ help=(
178
+ "Escape all special characters. Output is less readable, but avoids "
179
+ "corner case formatting issues."
180
+ ),
181
+ )
182
+ p.add_argument(
183
+ "--bypass-tables",
184
+ action="store_true",
185
+ dest="bypass_tables",
186
+ default=config.BYPASS_TABLES,
187
+ help="Format tables in HTML rather than Markdown syntax.",
188
+ )
189
+ p.add_argument(
190
+ "--ignore-tables",
191
+ action="store_true",
192
+ dest="ignore_tables",
193
+ default=config.IGNORE_TABLES,
194
+ help="Ignore table-related tags (table, th, td, tr) " "while keeping rows.",
195
+ )
196
+ p.add_argument(
197
+ "--single-line-break",
198
+ action="store_true",
199
+ dest="single_line_break",
200
+ default=config.SINGLE_LINE_BREAK,
201
+ help=(
202
+ "Use a single line break after a block element rather than two line "
203
+ "breaks. NOTE: Requires --body-width=0"
204
+ ),
205
+ )
206
+ p.add_argument(
207
+ "--unicode-snob",
208
+ action="store_true",
209
+ dest="unicode_snob",
210
+ default=config.UNICODE_SNOB,
211
+ help="Use unicode throughout document",
212
+ )
213
+ p.add_argument(
214
+ "--no-automatic-links",
215
+ action="store_false",
216
+ dest="use_automatic_links",
217
+ default=config.USE_AUTOMATIC_LINKS,
218
+ help="Do not use automatic links wherever applicable",
219
+ )
220
+ p.add_argument(
221
+ "--no-skip-internal-links",
222
+ action="store_false",
223
+ dest="skip_internal_links",
224
+ default=config.SKIP_INTERNAL_LINKS,
225
+ help="Do not skip internal links",
226
+ )
227
+ p.add_argument(
228
+ "--links-after-para",
229
+ action="store_true",
230
+ dest="links_each_paragraph",
231
+ default=config.LINKS_EACH_PARAGRAPH,
232
+ help="Put links after each paragraph instead of document",
233
+ )
234
+ p.add_argument(
235
+ "--mark-code",
236
+ action="store_true",
237
+ dest="mark_code",
238
+ default=config.MARK_CODE,
239
+ help="Mark program code blocks with [code]...[/code]",
240
+ )
241
+ p.add_argument(
242
+ "--decode-errors",
243
+ dest="decode_errors",
244
+ default=config.DECODE_ERRORS,
245
+ help=(
246
+ "What to do in case of decode errors.'ignore', 'strict' and 'replace' are "
247
+ "acceptable values"
248
+ ),
249
+ )
250
+ p.add_argument(
251
+ "--open-quote",
252
+ dest="open_quote",
253
+ default=config.OPEN_QUOTE,
254
+ help="The character used to open quotes",
255
+ )
256
+ p.add_argument(
257
+ "--close-quote",
258
+ dest="close_quote",
259
+ default=config.CLOSE_QUOTE,
260
+ help="The character used to close quotes",
261
+ )
262
+ p.add_argument(
263
+ "--version", action="version", version=".".join(map(str, __version__))
264
+ )
265
+ p.add_argument("filename", nargs="?")
266
+ p.add_argument("encoding", nargs="?", default="utf-8")
267
+ p.add_argument(
268
+ "--include-sup-sub",
269
+ dest="include_sup_sub",
270
+ action="store_true",
271
+ default=config.INCLUDE_SUP_SUB,
272
+ help="Include the sup and sub tags",
273
+ )
274
+ args = p.parse_args()
275
+
276
+ if args.filename and args.filename != "-":
277
+ with open(args.filename, "rb") as fp:
278
+ data = fp.read()
279
+ else:
280
+ data = sys.stdin.buffer.read()
281
+
282
+ try:
283
+ html = data.decode(args.encoding, args.decode_errors)
284
+ except UnicodeDecodeError as err:
285
+ warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
286
+ warning += " Use the " + bcolors.OKGREEN
287
+ warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
288
+ print(warning)
289
+ raise err
290
+
291
+ h = HTML2Text(baseurl=baseurl)
292
+ # handle options
293
+ if args.ul_style_dash:
294
+ h.ul_item_mark = "-"
295
+ if args.em_style_asterisk:
296
+ h.emphasis_mark = "*"
297
+ h.strong_mark = "__"
298
+
299
+ h.body_width = args.body_width
300
+ h.google_list_indent = args.list_indent
301
+ h.ignore_emphasis = args.ignore_emphasis
302
+ h.ignore_links = args.ignore_links
303
+ h.ignore_mailto_links = args.ignore_mailto_links
304
+ h.protect_links = args.protect_links
305
+ h.ignore_images = args.ignore_images
306
+ h.images_as_html = args.images_as_html
307
+ h.images_to_alt = args.images_to_alt
308
+ h.images_with_size = args.images_with_size
309
+ h.google_doc = args.google_doc
310
+ h.hide_strikethrough = args.hide_strikethrough
311
+ h.escape_snob = args.escape_snob
312
+ h.bypass_tables = args.bypass_tables
313
+ h.ignore_tables = args.ignore_tables
314
+ h.single_line_break = args.single_line_break
315
+ h.inline_links = args.inline_links
316
+ h.unicode_snob = args.unicode_snob
317
+ h.use_automatic_links = args.use_automatic_links
318
+ h.skip_internal_links = args.skip_internal_links
319
+ h.links_each_paragraph = args.links_each_paragraph
320
+ h.mark_code = args.mark_code
321
+ h.wrap_links = args.wrap_links
322
+ h.wrap_list_items = args.wrap_list_items
323
+ h.wrap_tables = args.wrap_tables
324
+ h.pad_tables = args.pad_tables
325
+ h.default_image_alt = args.default_image_alt
326
+ h.open_quote = args.open_quote
327
+ h.close_quote = args.close_quote
328
+ h.include_sup_sub = args.include_sup_sub
329
+
330
+ sys.stdout.write(h.handle(html))
crawl4ai/html2text/config.py ADDED
@@ -0,0 +1,172 @@
1
+ import re
2
+
3
+ # Use Unicode characters instead of their ascii pseudo-replacements
4
+ UNICODE_SNOB = False
5
+
6
+ # Marker to use for marking tables for padding post processing
7
+ TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"
8
+ # Escape all special characters. Output is less readable, but avoids
9
+ # corner case formatting issues.
10
+ ESCAPE_SNOB = False
11
+ ESCAPE_BACKSLASH = False
12
+ ESCAPE_DOT = False
13
+ ESCAPE_PLUS = False
14
+ ESCAPE_DASH = False
15
+
16
+ # Put the links after each paragraph instead of at the end.
17
+ LINKS_EACH_PARAGRAPH = False
18
+
19
+ # Wrap long lines at position. 0 for no wrapping.
20
+ BODY_WIDTH = 78
21
+
22
+ # Don't show internal links (href="#local-anchor") -- corresponding link
23
+ # targets won't be visible in the plain text file anyway.
24
+ SKIP_INTERNAL_LINKS = True
25
+
26
+ # Use inline, rather than reference, formatting for images and links
27
+ INLINE_LINKS = True
28
+
29
+ # Protect links from line breaks surrounding them with angle brackets (in
30
+ # addition to their square brackets)
31
+ PROTECT_LINKS = False
32
+ # WRAP_LINKS = True
33
+ WRAP_LINKS = True
34
+
35
+ # Wrap list items.
36
+ WRAP_LIST_ITEMS = False
37
+
38
+ # Wrap tables
39
+ WRAP_TABLES = False
40
+
41
+ # Number of pixels Google indents nested lists
42
+ GOOGLE_LIST_INDENT = 36
43
+
44
+ # Values Google and others may use to indicate bold text
45
+ BOLD_TEXT_STYLE_VALUES = ("bold", "700", "800", "900")
46
+
47
+ IGNORE_ANCHORS = False
48
+ IGNORE_MAILTO_LINKS = False
49
+ IGNORE_IMAGES = False
50
+ IMAGES_AS_HTML = False
51
+ IMAGES_TO_ALT = False
52
+ IMAGES_WITH_SIZE = False
53
+ IGNORE_EMPHASIS = False
54
+ MARK_CODE = False
55
+ DECODE_ERRORS = "strict"
56
+ DEFAULT_IMAGE_ALT = ""
57
+ PAD_TABLES = False
58
+
59
+ # Convert links with same href and text to <href> format
60
+ # if they are absolute links
61
+ USE_AUTOMATIC_LINKS = True
62
+
63
+ # For checking space-only lines on line 771
64
+ RE_SPACE = re.compile(r"\s\+")
65
+
66
+ RE_ORDERED_LIST_MATCHER = re.compile(r"\d+\.\s")
67
+ RE_UNORDERED_LIST_MATCHER = re.compile(r"[-\*\+]\s")
68
+ RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])")
69
+ RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")
70
+
71
+ # to find links in the text
72
+ RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")
73
+
74
+ # to find table separators
75
+ RE_TABLE = re.compile(r" \| ")
76
+
77
+ RE_MD_DOT_MATCHER = re.compile(
78
+ r"""
79
+ ^ # start of line
80
+ (\s*\d+) # optional whitespace and a number
81
+ (\.) # dot
82
+ (?=\s) # lookahead assert whitespace
83
+ """,
84
+ re.MULTILINE | re.VERBOSE,
85
+ )
86
+ RE_MD_PLUS_MATCHER = re.compile(
87
+ r"""
88
+ ^
89
+ (\s*)
90
+ (\+)
91
+ (?=\s)
92
+ """,
93
+ flags=re.MULTILINE | re.VERBOSE,
94
+ )
95
+ RE_MD_DASH_MATCHER = re.compile(
96
+ r"""
97
+ ^
98
+ (\s*)
99
+ (-)
100
+ (?=\s|\-) # followed by whitespace (bullet list, or spaced out hr)
101
+ # or another dash (header or hr)
102
+ """,
103
+ flags=re.MULTILINE | re.VERBOSE,
104
+ )
105
+ RE_SLASH_CHARS = r"\`*_{}[]()#+-.!"
106
+ RE_MD_BACKSLASH_MATCHER = re.compile(
107
+ r"""
108
+ (\\) # match one slash
109
+ (?=[%s]) # followed by a char that requires escaping
110
+ """
111
+ % re.escape(RE_SLASH_CHARS),
112
+ flags=re.VERBOSE,
113
+ )
114
+
115
+ UNIFIABLE = {
116
+ "rsquo": "'",
117
+ "lsquo": "'",
118
+ "rdquo": '"',
119
+ "ldquo": '"',
120
+ "copy": "(C)",
121
+ "mdash": "--",
122
+ "nbsp": " ",
123
+ "rarr": "->",
124
+ "larr": "<-",
125
+ "middot": "*",
126
+ "ndash": "-",
127
+ "oelig": "oe",
128
+ "aelig": "ae",
129
+ "agrave": "a",
130
+ "aacute": "a",
131
+ "acirc": "a",
132
+ "atilde": "a",
133
+ "auml": "a",
134
+ "aring": "a",
135
+ "egrave": "e",
136
+ "eacute": "e",
137
+ "ecirc": "e",
138
+ "euml": "e",
139
+ "igrave": "i",
140
+ "iacute": "i",
141
+ "icirc": "i",
142
+ "iuml": "i",
143
+ "ograve": "o",
144
+ "oacute": "o",
145
+ "ocirc": "o",
146
+ "otilde": "o",
147
+ "ouml": "o",
148
+ "ugrave": "u",
149
+ "uacute": "u",
150
+ "ucirc": "u",
151
+ "uuml": "u",
152
+ "lrm": "",
153
+ "rlm": "",
154
+ }
155
+
156
+ # Format tables in HTML rather than Markdown syntax
157
+ BYPASS_TABLES = False
158
+ # Ignore table-related tags (table, th, td, tr) while keeping rows
159
+ IGNORE_TABLES = False
160
+
161
+
162
+ # Use a single line break after a block element rather than two line breaks.
163
+ # NOTE: Requires body width setting to be 0.
164
+ SINGLE_LINE_BREAK = False
165
+
166
+
167
+ # Use double quotation marks when converting the <q> tag.
168
+ OPEN_QUOTE = '"'
169
+ CLOSE_QUOTE = '"'
170
+
171
+ # Include the <sup> and <sub> tags
172
+ INCLUDE_SUP_SUB = False
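These module-level constants are the defaults picked up by HTML2Text and by the html2text() helper defined in __init__.py. A small sketch of overriding one of them per call (the sample HTML is made up):

    from crawl4ai.html2text import html2text

    # bodywidth=0 disables line wrapping; when omitted, config.BODY_WIDTH (78) is used
    print(html2text("<p>a fairly long paragraph that would otherwise wrap at 78 columns</p>", bodywidth=0))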
crawl4ai/html2text/elements.py ADDED
@@ -0,0 +1,18 @@
1
+ from typing import Dict, Optional
2
+
3
+
4
+ class AnchorElement:
5
+ __slots__ = ["attrs", "count", "outcount"]
6
+
7
+ def __init__(self, attrs: Dict[str, Optional[str]], count: int, outcount: int):
8
+ self.attrs = attrs
9
+ self.count = count
10
+ self.outcount = outcount
11
+
12
+
13
+ class ListElement:
14
+ __slots__ = ["name", "num"]
15
+
16
+ def __init__(self, name: str, num: int):
17
+ self.name = name
18
+ self.num = num
crawl4ai/html2text/utils.py ADDED
@@ -0,0 +1,303 @@
1
+ import html.entities
2
+ from typing import Dict, List, Optional
3
+
4
+ from . import config
5
+
6
+ unifiable_n = {
7
+ html.entities.name2codepoint[k]: v
8
+ for k, v in config.UNIFIABLE.items()
9
+ if k != "nbsp"
10
+ }
11
+
12
+
13
+ def hn(tag: str) -> int:
14
+ if tag[0] == "h" and len(tag) == 2:
15
+ n = tag[1]
16
+ if "0" < n <= "9":
17
+ return int(n)
18
+ return 0
19
+
20
+
21
+ def dumb_property_dict(style: str) -> Dict[str, str]:
22
+ """
23
+ :returns: A hash of css attributes
24
+ """
25
+ return {
26
+ x.strip().lower(): y.strip().lower()
27
+ for x, y in [z.split(":", 1) for z in style.split(";") if ":" in z]
28
+ }
29
+
30
+
31
+ def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
32
+ """
33
+ :type data: str
34
+
35
+ :returns: A hash of css selectors, each of which contains a hash of
36
+ css attributes.
37
+ :rtype: dict
38
+ """
39
+ # remove @import sentences
40
+ data += ";"
41
+ importIndex = data.find("@import")
42
+ while importIndex != -1:
43
+ data = data[0:importIndex] + data[data.find(";", importIndex) + 1 :]
44
+ importIndex = data.find("@import")
45
+
46
+ # parse the css. reverted from dictionary comprehension in order to
47
+ # support older pythons
48
+ pairs = [x.split("{") for x in data.split("}") if "{" in x.strip()]
49
+ try:
50
+ elements = {a.strip(): dumb_property_dict(b) for a, b in pairs}
51
+ except ValueError:
52
+ elements = {} # not that important
53
+
54
+ return elements
55
+
56
+
57
+ def element_style(
58
+ attrs: Dict[str, Optional[str]],
59
+ style_def: Dict[str, Dict[str, str]],
60
+ parent_style: Dict[str, str],
61
+ ) -> Dict[str, str]:
62
+ """
63
+ :type attrs: dict
64
+ :type style_def: dict
65
+ :type parent_style: dict
66
+
67
+ :returns: A hash of the 'final' style attributes of the element
68
+ :rtype: dict
69
+ """
70
+ style = parent_style.copy()
71
+ if "class" in attrs:
72
+ assert attrs["class"] is not None
73
+ for css_class in attrs["class"].split():
74
+ css_style = style_def.get("." + css_class, {})
75
+ style.update(css_style)
76
+ if "style" in attrs:
77
+ assert attrs["style"] is not None
78
+ immediate_style = dumb_property_dict(attrs["style"])
79
+ style.update(immediate_style)
80
+
81
+ return style
82
+
83
+
84
+ def google_list_style(style: Dict[str, str]) -> str:
85
+ """
86
+ Finds out whether this is an ordered or unordered list
87
+
88
+ :type style: dict
89
+
90
+ :rtype: str
91
+ """
92
+ if "list-style-type" in style:
93
+ list_style = style["list-style-type"]
94
+ if list_style in ["disc", "circle", "square", "none"]:
95
+ return "ul"
96
+
97
+ return "ol"
98
+
99
+
100
+ def google_has_height(style: Dict[str, str]) -> bool:
101
+ """
102
+ Check if the style of the element has the 'height' attribute
103
+ explicitly defined
104
+
105
+ :type style: dict
106
+
107
+ :rtype: bool
108
+ """
109
+ return "height" in style
110
+
111
+
112
+ def google_text_emphasis(style: Dict[str, str]) -> List[str]:
113
+ """
114
+ :type style: dict
115
+
116
+ :returns: A list of all emphasis modifiers of the element
117
+ :rtype: list
118
+ """
119
+ emphasis = []
120
+ if "text-decoration" in style:
121
+ emphasis.append(style["text-decoration"])
122
+ if "font-style" in style:
123
+ emphasis.append(style["font-style"])
124
+ if "font-weight" in style:
125
+ emphasis.append(style["font-weight"])
126
+
127
+ return emphasis
128
+
129
+
130
+ def google_fixed_width_font(style: Dict[str, str]) -> bool:
131
+ """
132
+ Check if the css of the current element defines a fixed width font
133
+
134
+ :type style: dict
135
+
136
+ :rtype: bool
137
+ """
138
+ font_family = ""
139
+ if "font-family" in style:
140
+ font_family = style["font-family"]
141
+ return "courier new" == font_family or "consolas" == font_family
142
+
143
+
144
+ def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
145
+ """
146
+ Extract numbering from list element attributes
147
+
148
+ :type attrs: dict
149
+
150
+ :rtype: int or None
151
+ """
152
+ if "start" in attrs:
153
+ assert attrs["start"] is not None
154
+ try:
155
+ return int(attrs["start"]) - 1
156
+ except ValueError:
157
+ pass
158
+
159
+ return 0
160
+
161
+
162
+ def skipwrap(
163
+ para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
164
+ ) -> bool:
165
+ # If it appears to contain a link
166
+ # don't wrap
167
+ if not wrap_links and config.RE_LINK.search(para):
168
+ return True
169
+ # If the text begins with four spaces or one tab, it's a code block;
170
+ # don't wrap
171
+ if para[0:4] == "    " or para[0] == "\t":
172
+ return True
173
+
174
+ # If the text begins with only two "--", possibly preceded by
175
+ # whitespace, that's an emdash; so wrap.
176
+ stripped = para.lstrip()
177
+ if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
178
+ return False
179
+
180
+ # I'm not sure what this is for; I thought it was to detect lists,
181
+ # but there's a <br>-inside-<span> case in one of the tests that
182
+ # also depends upon it.
183
+ if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**":
184
+ return not wrap_list_items
185
+
186
+ # If text contains a pipe character it is likely a table
187
+ if not wrap_tables and config.RE_TABLE.search(para):
188
+ return True
189
+
190
+ # If the text begins with a single -, *, or +, followed by a space,
191
+ # or an integer, followed by a ., followed by a space (in either
192
+ # case optionally proceeded by whitespace), it's a list; don't wrap.
193
+ return bool(
194
+ config.RE_ORDERED_LIST_MATCHER.match(stripped)
195
+ or config.RE_UNORDERED_LIST_MATCHER.match(stripped)
196
+ )
197
+
198
+
199
+ def escape_md(text: str) -> str:
200
+ """
201
+ Escapes markdown-sensitive characters within other markdown
202
+ constructs.
203
+ """
204
+ return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)
205
+
206
+
207
+ def escape_md_section(
208
+ text: str,
209
+ escape_backslash: bool = True,
210
+ snob: bool = False,
211
+ escape_dot: bool = True,
212
+ escape_plus: bool = True,
213
+ escape_dash: bool = True
214
+ ) -> str:
215
+ """
216
+ Escapes markdown-sensitive characters across whole document sections.
217
+ Each escaping operation can be controlled individually.
218
+ """
219
+ if escape_backslash:
220
+ text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)
221
+
222
+ if snob:
223
+ text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text)
224
+
225
+ if escape_dot:
226
+ text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text)
227
+
228
+ if escape_plus:
229
+ text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text)
230
+
231
+ if escape_dash:
232
+ text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)
233
+
234
+ return text
235
+
236
+ def reformat_table(lines: List[str], right_margin: int) -> List[str]:
237
+ """
238
+ Given the lines of a table
239
+ pads the cells and returns the new lines
240
+ """
241
+ # find the maximum width of the columns
242
+ max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")]
243
+ max_cols = len(max_width)
244
+ for line in lines:
245
+ cols = [x.rstrip() for x in line.split("|")]
246
+ num_cols = len(cols)
247
+
248
+ # don't drop any data if colspan attributes result in unequal lengths
249
+ if num_cols < max_cols:
250
+ cols += [""] * (max_cols - num_cols)
251
+ elif max_cols < num_cols:
252
+ max_width += [len(x) + right_margin for x in cols[-(num_cols - max_cols) :]]
253
+ max_cols = num_cols
254
+
255
+ max_width = [
256
+ max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)
257
+ ]
258
+
259
+ # reformat
260
+ new_lines = []
261
+ for line in lines:
262
+ cols = [x.rstrip() for x in line.split("|")]
263
+ if set(line.strip()) == set("-|"):
264
+ filler = "-"
265
+ new_cols = [
266
+ x.rstrip() + (filler * (M - len(x.rstrip())))
267
+ for x, M in zip(cols, max_width)
268
+ ]
269
+ new_lines.append("|-" + "|".join(new_cols) + "|")
270
+ else:
271
+ filler = " "
272
+ new_cols = [
273
+ x.rstrip() + (filler * (M - len(x.rstrip())))
274
+ for x, M in zip(cols, max_width)
275
+ ]
276
+ new_lines.append("| " + "|".join(new_cols) + "|")
277
+ return new_lines
278
+
279
+
280
+ def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
281
+ """
282
+ Provide padding for tables in the text
283
+ """
284
+ lines = text.split("\n")
285
+ table_buffer = [] # type: List[str]
286
+ table_started = False
287
+ new_lines = []
288
+ for line in lines:
289
+ # Toggle table started
290
+ if config.TABLE_MARKER_FOR_PAD in line:
291
+ table_started = not table_started
292
+ if not table_started:
293
+ table = reformat_table(table_buffer, right_margin)
294
+ new_lines.extend(table)
295
+ table_buffer = []
296
+ new_lines.append("")
297
+ continue
298
+ # Process lines
299
+ if table_started:
300
+ table_buffer.append(line)
301
+ else:
302
+ new_lines.append(line)
303
+ return "\n".join(new_lines)
crawl4ai/install.py ADDED
@@ -0,0 +1,83 @@
1
+ import subprocess
2
+ import sys
3
+ import asyncio
4
+ from .async_logger import AsyncLogger, LogLevel
5
+
6
+ # Initialize logger
7
+ logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
8
+
9
+ def post_install():
10
+ """Run all post-installation tasks"""
11
+ logger.info("Running post-installation setup...", tag="INIT")
12
+ install_playwright()
13
+ run_migration()
14
+ logger.success("Post-installation setup completed!", tag="COMPLETE")
15
+
16
+ def install_playwright():
17
+ logger.info("Installing Playwright browsers...", tag="INIT")
18
+ try:
19
+ # subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chrome"])
20
+ subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chromium"])
21
+ logger.success("Playwright installation completed successfully.", tag="COMPLETE")
22
+ except subprocess.CalledProcessError as e:
23
+ # logger.error(f"Error during Playwright installation: {e}", tag="ERROR")
24
+ logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.")
25
+ except Exception as e:
26
+ # logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR")
27
+ logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.")
28
+
29
+ def run_migration():
30
+ """Initialize database during installation"""
31
+ try:
32
+ logger.info("Starting database initialization...", tag="INIT")
33
+ from crawl4ai.async_database import async_db_manager
34
+
35
+ asyncio.run(async_db_manager.initialize())
36
+ logger.success("Database initialization completed successfully.", tag="COMPLETE")
37
+ except ImportError:
38
+ logger.warning("Database module not found. Will initialize on first use.")
39
+ except Exception as e:
40
+ logger.warning(f"Database initialization failed: {e}")
41
+ logger.warning("Database will be initialized on first use")
42
+
43
+ async def run_doctor():
44
+ """Test if Crawl4AI is working properly"""
45
+ logger.info("Running Crawl4AI health check...", tag="INIT")
46
+ try:
47
+ from .async_webcrawler import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
48
+
49
+ browser_config = BrowserConfig(
50
+ headless=True,
51
+ browser_type="chromium",
52
+ ignore_https_errors=True,
53
+ light_mode=True,
54
+ viewport_width=1280,
55
+ viewport_height=720
56
+ )
57
+
58
+ run_config = CrawlerRunConfig(
59
+ cache_mode=CacheMode.BYPASS,
60
+ screenshot=True,
61
+ )
62
+
63
+ async with AsyncWebCrawler(config=browser_config) as crawler:
64
+ logger.info("Testing crawling capabilities...", tag="TEST")
65
+ result = await crawler.arun(
66
+ url="https://crawl4ai.com",
67
+ config=run_config
68
+ )
69
+
70
+ if result and result.markdown:
71
+ logger.success("✅ Crawling test passed!", tag="COMPLETE")
72
+ return True
73
+ else:
74
+ raise Exception("Failed to get content")
75
+
76
+ except Exception as e:
77
+ logger.error(f"❌ Test failed: {e}", tag="ERROR")
78
+ return False
79
+
80
+ def doctor():
81
+ """Entry point for the doctor command"""
82
+ import asyncio
83
+ return asyncio.run(run_doctor())
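The two entry points above can also be invoked directly; a sketch, assuming the package is installed and Playwright is able to download browsers:

    from crawl4ai.install import post_install, doctor

    post_install()   # installs the Chromium browser via Playwright and initializes the local database
    ok = doctor()    # runs a test crawl of https://crawl4ai.com and returns True on success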
crawl4ai/js_snippet/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ import os, sys
2
+
3
+ # Create a function get name of a js script, then load from the CURRENT folder of this script and return its content as string, make sure its error free
4
+ def load_js_script(script_name):
5
+ # Get the path of the current script
6
+ current_script_path = os.path.dirname(os.path.realpath(__file__))
7
+ # Get the path of the script to load
8
+ script_path = os.path.join(current_script_path, script_name + '.js')
9
+ # Check if the script exists
10
+ if not os.path.exists(script_path):
11
+ raise ValueError(f"Script {script_name} not found in the folder {current_script_path}")
12
+ # Load the content of the script
13
+ with open(script_path, 'r') as f:
14
+ script_content = f.read()
15
+ return script_content
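For example, the loader resolves names against the .js files added alongside it in this folder:

    from crawl4ai.js_snippet import load_js_script

    js = load_js_script("remove_overlay_elements")  # contents of remove_overlay_elements.js as a string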
crawl4ai/js_snippet/navigator_overrider.js ADDED
@@ -0,0 +1,25 @@
1
+ // Pass the Permissions Test.
2
+ const originalQuery = window.navigator.permissions.query;
3
+ window.navigator.permissions.query = (parameters) =>
4
+ parameters.name === "notifications"
5
+ ? Promise.resolve({ state: Notification.permission })
6
+ : originalQuery(parameters);
7
+ Object.defineProperty(navigator, "webdriver", {
8
+ get: () => undefined,
9
+ });
10
+ window.navigator.chrome = {
11
+ runtime: {},
12
+ // Add other properties if necessary
13
+ };
14
+ Object.defineProperty(navigator, "plugins", {
15
+ get: () => [1, 2, 3, 4, 5],
16
+ });
17
+ Object.defineProperty(navigator, "languages", {
18
+ get: () => ["en-US", "en"],
19
+ });
20
+ Object.defineProperty(document, "hidden", {
21
+ get: () => false,
22
+ });
23
+ Object.defineProperty(document, "visibilityState", {
24
+ get: () => "visible",
25
+ });
crawl4ai/js_snippet/remove_overlay_elements.js ADDED
@@ -0,0 +1,119 @@
1
+ async () => {
2
+ // Function to check if element is visible
3
+ const isVisible = (elem) => {
4
+ const style = window.getComputedStyle(elem);
5
+ return style.display !== "none" && style.visibility !== "hidden" && style.opacity !== "0";
6
+ };
7
+
8
+ // Common selectors for popups and overlays
9
+ const commonSelectors = [
10
+ // Close buttons first
11
+ 'button[class*="close" i]',
12
+ 'button[class*="dismiss" i]',
13
+ 'button[aria-label*="close" i]',
14
+ 'button[title*="close" i]',
15
+ 'a[class*="close" i]',
16
+ 'span[class*="close" i]',
17
+
18
+ // Cookie notices
19
+ '[class*="cookie-banner" i]',
20
+ '[id*="cookie-banner" i]',
21
+ '[class*="cookie-consent" i]',
22
+ '[id*="cookie-consent" i]',
23
+
24
+ // Newsletter/subscription dialogs
25
+ '[class*="newsletter" i]',
26
+ '[class*="subscribe" i]',
27
+
28
+ // Generic popups/modals
29
+ '[class*="popup" i]',
30
+ '[class*="modal" i]',
31
+ '[class*="overlay" i]',
32
+ '[class*="dialog" i]',
33
+ '[role="dialog"]',
34
+ '[role="alertdialog"]',
35
+ ];
36
+
37
+ // Try to click close buttons first
38
+ for (const selector of commonSelectors.slice(0, 6)) {
39
+ const closeButtons = document.querySelectorAll(selector);
40
+ for (const button of closeButtons) {
41
+ if (isVisible(button)) {
42
+ try {
43
+ button.click();
44
+ await new Promise((resolve) => setTimeout(resolve, 100));
45
+ } catch (e) {
46
+ console.log("Error clicking button:", e);
47
+ }
48
+ }
49
+ }
50
+ }
51
+
52
+ // Remove remaining overlay elements
53
+ const removeOverlays = () => {
54
+ // Find elements with high z-index
55
+ const allElements = document.querySelectorAll("*");
56
+ for (const elem of allElements) {
57
+ const style = window.getComputedStyle(elem);
58
+ const zIndex = parseInt(style.zIndex);
59
+ const position = style.position;
60
+
61
+ if (
62
+ isVisible(elem) &&
63
+ (zIndex > 999 || position === "fixed" || position === "absolute") &&
64
+ (elem.offsetWidth > window.innerWidth * 0.5 ||
65
+ elem.offsetHeight > window.innerHeight * 0.5 ||
66
+ style.backgroundColor.includes("rgba") ||
67
+ parseFloat(style.opacity) < 1)
68
+ ) {
69
+ elem.remove();
70
+ }
71
+ }
72
+
73
+ // Remove elements matching common selectors
74
+ for (const selector of commonSelectors) {
75
+ const elements = document.querySelectorAll(selector);
76
+ elements.forEach((elem) => {
77
+ if (isVisible(elem)) {
78
+ elem.remove();
79
+ }
80
+ });
81
+ }
82
+ };
83
+
84
+ // Remove overlay elements
85
+ removeOverlays();
86
+
87
+ // Remove any fixed/sticky position elements at the top/bottom
88
+ const removeFixedElements = () => {
89
+ const elements = document.querySelectorAll("*");
90
+ elements.forEach((elem) => {
91
+ const style = window.getComputedStyle(elem);
92
+ if ((style.position === "fixed" || style.position === "sticky") && isVisible(elem)) {
93
+ elem.remove();
94
+ }
95
+ });
96
+ };
97
+
98
+ removeFixedElements();
99
+
100
+ // Remove empty block elements such as div, p, span, etc.
101
+ const removeEmptyBlockElements = () => {
102
+ const blockElements = document.querySelectorAll(
103
+ "div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6"
104
+ );
105
+ blockElements.forEach((elem) => {
106
+ if (elem.innerText.trim() === "") {
107
+ elem.remove();
108
+ }
109
+ });
110
+ };
111
+
112
+ // Remove margin-right and padding-right from body (often added by modal scripts)
113
+ document.body.style.marginRight = "0px";
114
+ document.body.style.paddingRight = "0px";
115
+ document.body.style.overflow = "auto";
116
+
117
+ // Wait a bit for any animations to complete
118
+ await new Promise((resolve) => setTimeout(resolve, 100));
119
+ };
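remove_overlay_elements.js is a single async arrow function: it first clicks visible close/dismiss buttons, then strips high z-index and fixed/sticky elements, and finally restores body scrolling. A minimal sketch of evaluating it on an already-open page follows; this is an editor's illustration under the assumption that the snippet text is passed verbatim to Playwright's evaluate(), which detects a function expression and awaits it.

    # Illustrative sketch only; assumes an open Playwright `page` as in the previous example.
    from pathlib import Path

    async def remove_overlays(page) -> None:
        script = Path("crawl4ai/js_snippet/remove_overlay_elements.js").read_text()  # path assumed
        # The file contains an async arrow function, so evaluate() invokes and awaits it.
        await page.evaluate(script)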
crawl4ai/js_snippet/update_image_dimensions.js ADDED
@@ -0,0 +1,54 @@
1
+ () => {
2
+ return new Promise((resolve) => {
3
+ const filterImage = (img) => {
4
+ // Filter out images that are too small
5
+ if (img.width < 100 && img.height < 100) return false;
6
+
7
+ // Filter out images that are not visible
8
+ const rect = img.getBoundingClientRect();
9
+ if (rect.width === 0 || rect.height === 0) return false;
10
+
11
+ // Filter out images with certain class names (e.g., icons, thumbnails)
12
+ if (img.classList.contains("icon") || img.classList.contains("thumbnail")) return false;
13
+
14
+ // Filter out images with certain patterns in their src (e.g., placeholder images)
15
+ if (img.src.includes("placeholder") || img.src.includes("icon")) return false;
16
+
17
+ return true;
18
+ };
19
+
20
+ const images = Array.from(document.querySelectorAll("img")).filter(filterImage);
21
+ let imagesLeft = images.length;
22
+
23
+ if (imagesLeft === 0) {
24
+ resolve();
25
+ return;
26
+ }
27
+
28
+ const checkImage = (img) => {
29
+ if (img.complete && img.naturalWidth !== 0) {
30
+ img.setAttribute("width", img.naturalWidth);
31
+ img.setAttribute("height", img.naturalHeight);
32
+ imagesLeft--;
33
+ if (imagesLeft === 0) resolve();
34
+ }
35
+ };
36
+
37
+ images.forEach((img) => {
38
+ checkImage(img);
39
+ if (!img.complete) {
40
+ img.onload = () => {
41
+ checkImage(img);
42
+ };
43
+ img.onerror = () => {
44
+ imagesLeft--;
45
+ if (imagesLeft === 0) resolve();
46
+ };
47
+ }
48
+ });
49
+
50
+ // Fallback timeout of 5 seconds
51
+ // setTimeout(() => resolve(), 5000);
52
+ resolve();
53
+ });
54
+ };
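update_image_dimensions.js filters out tiny, hidden, icon, and placeholder images, then writes explicit width/height attributes onto the remaining img tags so later scraping steps can rank images by size without re-measuring them. A hedged usage sketch, again an editor's illustration under the same assumptions:

    # Illustrative sketch only; assumes an open Playwright `page`.
    from pathlib import Path

    async def annotate_image_sizes(page) -> None:
        script = Path("crawl4ai/js_snippet/update_image_dimensions.js").read_text()  # path assumed
        await page.evaluate(script)  # adds width/height attributes to qualifying <img> elements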
crawl4ai/llmtxt.py ADDED
@@ -0,0 +1,498 @@
1
+ import os
2
+ from pathlib import Path
3
+ import re
4
+ from typing import Dict, List, Tuple, Optional, Any
5
+ import json
6
+ from tqdm import tqdm
7
+ import time
8
+ import psutil
9
+ import numpy as np
10
+ from rank_bm25 import BM25Okapi
11
+ from nltk.tokenize import word_tokenize
12
+ from nltk.corpus import stopwords
13
+ from nltk.stem import WordNetLemmatizer
14
+ from litellm import completion, batch_completion
15
+ from .async_logger import AsyncLogger
16
+ import litellm
17
+ import pickle
18
+ import hashlib # <--- ADDED for file-hash
19
+ from fnmatch import fnmatch
20
+ import glob
21
+
22
+ litellm.set_verbose = False
23
+
24
+ def _compute_file_hash(file_path: Path) -> str:
25
+ """Compute MD5 hash for the file's entire content."""
26
+ hash_md5 = hashlib.md5()
27
+ with file_path.open("rb") as f:
28
+ for chunk in iter(lambda: f.read(4096), b""):
29
+ hash_md5.update(chunk)
30
+ return hash_md5.hexdigest()
31
+
32
+ class AsyncLLMTextManager:
33
+ def __init__(
34
+ self,
35
+ docs_dir: Path,
36
+ logger: Optional[AsyncLogger] = None,
37
+ max_concurrent_calls: int = 5,
38
+ batch_size: int = 3
39
+ ) -> None:
40
+ self.docs_dir = docs_dir
41
+ self.logger = logger
42
+ self.max_concurrent_calls = max_concurrent_calls
43
+ self.batch_size = batch_size
44
+ self.bm25_index = None
45
+ self.document_map: Dict[str, Any] = {}
46
+ self.tokenized_facts: List[str] = []
47
+ self.bm25_index_file = self.docs_dir / "bm25_index.pkl"
48
+
49
+ async def _process_document_batch(self, doc_batch: List[Path]) -> None:
50
+ """Process a batch of documents in parallel"""
51
+ contents = []
52
+ for file_path in doc_batch:
53
+ try:
54
+ with open(file_path, 'r', encoding='utf-8') as f:
55
+ contents.append(f.read())
56
+ except Exception as e:
57
+ self.logger.error(f"Error reading {file_path}: {str(e)}")
58
+ contents.append("") # Add empty content to maintain batch alignment
59
+
60
+ prompt = """Given a documentation file, generate a list of atomic facts where each fact:
61
+ 1. Represents a single piece of knowledge
62
+ 2. Contains variations in terminology for the same concept
63
+ 3. References relevant code patterns if they exist
64
+ 4. Is written in a way that would match natural language queries
65
+
66
+ Each fact should follow this format:
67
+ <main_concept>: <fact_statement> | <related_terms> | <code_reference>
68
+
69
+ Example Facts:
70
+ browser_config: Configure headless mode and browser type for AsyncWebCrawler | headless, browser_type, chromium, firefox | BrowserConfig(browser_type="chromium", headless=True)
71
+ redis_connection: Redis client connection requires host and port configuration | redis setup, redis client, connection params | Redis(host='localhost', port=6379, db=0)
72
+ pandas_filtering: Filter DataFrame rows using boolean conditions | dataframe filter, query, boolean indexing | df[df['column'] > 5]
73
+
74
+ Wrap your response in <index>...</index> tags.
75
+ """
76
+
77
+ # Prepare messages for batch processing
78
+ messages_list = [
79
+ [
80
+ {"role": "user", "content": f"{prompt}\n\nGenerate index for this documentation:\n\n{content}"}
81
+ ]
82
+ for content in contents if content
83
+ ]
84
+
85
+ try:
86
+ responses = batch_completion(
87
+ model="anthropic/claude-3-5-sonnet-latest",
88
+ messages=messages_list,
89
+ logger_fn=None
90
+ )
91
+
92
+ # Process responses and save index files
93
+ for response, file_path in zip(responses, doc_batch):
94
+ try:
95
+ index_content_match = re.search(
96
+ r'<index>(.*?)</index>',
97
+ response.choices[0].message.content,
98
+ re.DOTALL
99
+ )
100
+ if not index_content_match:
101
+ self.logger.warning(f"No <index>...</index> content found for {file_path}")
102
+ continue
103
+
104
+ index_content = re.sub(
105
+ r"\n\s*\n", "\n", index_content_match.group(1)
106
+ ).strip()
107
+ if index_content:
108
+ index_file = file_path.with_suffix('.q.md')
109
+ with open(index_file, 'w', encoding='utf-8') as f:
110
+ f.write(index_content)
111
+ self.logger.info(f"Created index file: {index_file}")
112
+ else:
113
+ self.logger.warning(f"No index content found in response for {file_path}")
114
+
115
+ except Exception as e:
116
+ self.logger.error(f"Error processing response for {file_path}: {str(e)}")
117
+
118
+ except Exception as e:
119
+ self.logger.error(f"Error in batch completion: {str(e)}")
120
+
121
+ def _validate_fact_line(self, line: str) -> Tuple[bool, Optional[str]]:
122
+ if "|" not in line:
123
+ return False, "Missing separator '|'"
124
+
125
+ parts = [p.strip() for p in line.split("|")]
126
+ if len(parts) != 3:
127
+ return False, f"Expected 3 parts, got {len(parts)}"
128
+
129
+ concept_part = parts[0]
130
+ if ":" not in concept_part:
131
+ return False, "Missing ':' in concept definition"
132
+
133
+ return True, None
134
+
135
+ def _load_or_create_token_cache(self, fact_file: Path) -> Dict:
136
+ """
137
+ Load token cache from .q.tokens if present and matching file hash.
138
+ Otherwise return a new structure with updated file-hash.
139
+ """
140
+ cache_file = fact_file.with_suffix(".q.tokens")
141
+ current_hash = _compute_file_hash(fact_file)
142
+
143
+ if cache_file.exists():
144
+ try:
145
+ with open(cache_file, "r") as f:
146
+ cache = json.load(f)
147
+ # If the hash matches, return it directly
148
+ if cache.get("content_hash") == current_hash:
149
+ return cache
150
+ # Otherwise, we signal that it's changed
151
+ self.logger.info(f"Hash changed for {fact_file}, reindex needed.")
152
+ except json.JSONDecodeError:
153
+ self.logger.warning(f"Corrupt token cache for {fact_file}, rebuilding.")
154
+ except Exception as e:
155
+ self.logger.warning(f"Error reading cache for {fact_file}: {str(e)}")
156
+
157
+ # Return a fresh cache
158
+ return {"facts": {}, "content_hash": current_hash}
159
+
160
+ def _save_token_cache(self, fact_file: Path, cache: Dict) -> None:
161
+ cache_file = fact_file.with_suffix(".q.tokens")
162
+ # Always ensure we're saving the correct file-hash
163
+ cache["content_hash"] = _compute_file_hash(fact_file)
164
+ with open(cache_file, "w") as f:
165
+ json.dump(cache, f)
166
+
167
+ def preprocess_text(self, text: str) -> List[str]:
168
+ parts = [x.strip() for x in text.split("|")] if "|" in text else [text]
169
+ # Remove : after the first word of parts[0]
170
+ parts[0] = re.sub(r"^(.*?):", r"\1", parts[0])
171
+
172
+ lemmatizer = WordNetLemmatizer()
173
+ stop_words = set(stopwords.words("english")) - {
174
+ "how", "what", "when", "where", "why", "which",
175
+ }
176
+
177
+ tokens = []
178
+ for part in parts:
179
+ if "(" in part and ")" in part:
180
+ code_tokens = re.findall(
181
+ r'[\w_]+(?=\()|[\w_]+(?==[\'"]{1}[\w_]+[\'"]{1})', part
182
+ )
183
+ tokens.extend(code_tokens)
184
+
185
+ words = word_tokenize(part.lower())
186
+ tokens.extend(
187
+ [
188
+ lemmatizer.lemmatize(token)
189
+ for token in words
190
+ if token not in stop_words
191
+ ]
192
+ )
193
+
194
+ return tokens
195
+
196
+ def maybe_load_bm25_index(self, clear_cache=False) -> bool:
197
+ """
198
+ Load existing BM25 index from disk, if present and clear_cache=False.
199
+ """
200
+ if not clear_cache and os.path.exists(self.bm25_index_file):
201
+ self.logger.info("Loading existing BM25 index from disk.")
202
+ with open(self.bm25_index_file, "rb") as f:
203
+ data = pickle.load(f)
204
+ self.tokenized_facts = data["tokenized_facts"]
205
+ self.bm25_index = data["bm25_index"]
206
+ return True
207
+ return False
208
+
209
+ def build_search_index(self, clear_cache=False) -> None:
210
+ """
211
+ Checks for new or modified .q.md files by comparing file-hash.
212
+ If none need reindexing and clear_cache is False, loads existing index if available.
213
+ Otherwise, reindexes only changed/new files and merges or creates a new index.
214
+ """
215
+ # If clear_cache is True, we skip partial logic: rebuild everything from scratch
216
+ if clear_cache:
217
+ self.logger.info("Clearing cache and rebuilding full search index.")
218
+ if self.bm25_index_file.exists():
219
+ self.bm25_index_file.unlink()
220
+
221
+ process = psutil.Process()
222
+ self.logger.info("Checking which .q.md files need (re)indexing...")
223
+
224
+ # Gather all .q.md files
225
+ q_files = [self.docs_dir / f for f in os.listdir(self.docs_dir) if f.endswith(".q.md")]
226
+
227
+ # We'll store known (unchanged) facts in these lists
228
+ existing_facts: List[str] = []
229
+ existing_tokens: List[List[str]] = []
230
+
231
+ # Keep track of invalid lines for logging
232
+ invalid_lines = []
233
+ needSet = [] # files that must be (re)indexed
234
+
235
+ for qf in q_files:
236
+ token_cache_file = qf.with_suffix(".q.tokens")
237
+
238
+ # If no .q.tokens or clear_cache is True → definitely reindex
239
+ if clear_cache or not token_cache_file.exists():
240
+ needSet.append(qf)
241
+ continue
242
+
243
+ # Otherwise, load the existing cache and compare hash
244
+ cache = self._load_or_create_token_cache(qf)
245
+ # If the .q.tokens was out of date (i.e. changed hash), we reindex
246
+ if len(cache["facts"]) == 0 or cache.get("content_hash") != _compute_file_hash(qf):
247
+ needSet.append(qf)
248
+ else:
249
+ # File is unchanged → retrieve cached token data
250
+ for line, cache_data in cache["facts"].items():
251
+ existing_facts.append(line)
252
+ existing_tokens.append(cache_data["tokens"])
253
+ self.document_map[line] = qf # track the doc for that fact
254
+
255
+ if not needSet and not clear_cache:
256
+ # If no file needs reindexing, try loading existing index
257
+ if self.maybe_load_bm25_index(clear_cache=False):
258
+ self.logger.info("No new/changed .q.md files found. Using existing BM25 index.")
259
+ return
260
+ else:
261
+ # If there's no existing index, we must build a fresh index from the old caches
262
+ self.logger.info("No existing BM25 index found. Building from cached facts.")
263
+ if existing_facts:
264
+ self.logger.info(f"Building BM25 index with {len(existing_facts)} cached facts.")
265
+ self.bm25_index = BM25Okapi(existing_tokens)
266
+ self.tokenized_facts = existing_facts
267
+ with open(self.bm25_index_file, "wb") as f:
268
+ pickle.dump({
269
+ "bm25_index": self.bm25_index,
270
+ "tokenized_facts": self.tokenized_facts
271
+ }, f)
272
+ else:
273
+ self.logger.warning("No facts found at all. Index remains empty.")
274
+ return
275
+
276
+ # -----------------------------------------------------
277
+ # If we reach here, we have new or changed .q.md files
278
+ # We'll parse them, reindex them, and then combine with existing_facts
279
+ # -----------------------------------------------------
280
+
281
+ self.logger.info(f"{len(needSet)} file(s) need reindexing. Parsing now...")
282
+
283
+ # 1) Parse the new or changed .q.md files
284
+ new_facts = []
285
+ new_tokens = []
286
+ with tqdm(total=len(needSet), desc="Indexing changed files") as file_pbar:
287
+ for file in needSet:
288
+ # We'll build up a fresh cache
289
+ fresh_cache = {"facts": {}, "content_hash": _compute_file_hash(file)}
290
+ try:
291
+ with open(file, "r", encoding="utf-8") as f_obj:
292
+ content = f_obj.read().strip()
293
+ lines = [l.strip() for l in content.split("\n") if l.strip()]
294
+
295
+ for line in lines:
296
+ is_valid, error = self._validate_fact_line(line)
297
+ if not is_valid:
298
+ invalid_lines.append((file, line, error))
299
+ continue
300
+
301
+ tokens = self.preprocess_text(line)
302
+ fresh_cache["facts"][line] = {
303
+ "tokens": tokens,
304
+ "added": time.time(),
305
+ }
306
+ new_facts.append(line)
307
+ new_tokens.append(tokens)
308
+ self.document_map[line] = file
309
+
310
+ # Save the new .q.tokens with updated hash
311
+ self._save_token_cache(file, fresh_cache)
312
+
313
+ mem_usage = process.memory_info().rss / 1024 / 1024
314
+ self.logger.debug(f"Memory usage after {file.name}: {mem_usage:.2f}MB")
315
+
316
+ except Exception as e:
317
+ self.logger.error(f"Error processing {file}: {str(e)}")
318
+
319
+ file_pbar.update(1)
320
+
321
+ if invalid_lines:
322
+ self.logger.warning(f"Found {len(invalid_lines)} invalid fact lines:")
323
+ for file, line, error in invalid_lines:
324
+ self.logger.warning(f"{file}: {error} in line: {line[:50]}...")
325
+
326
+ # 2) Merge newly tokenized facts with the existing ones
327
+ all_facts = existing_facts + new_facts
328
+ all_tokens = existing_tokens + new_tokens
329
+
330
+ # 3) Build BM25 index from combined facts
331
+ self.logger.info(f"Building BM25 index with {len(all_facts)} total facts (old + new).")
332
+ self.bm25_index = BM25Okapi(all_tokens)
333
+ self.tokenized_facts = all_facts
334
+
335
+ # 4) Save the updated BM25 index to disk
336
+ with open(self.bm25_index_file, "wb") as f:
337
+ pickle.dump({
338
+ "bm25_index": self.bm25_index,
339
+ "tokenized_facts": self.tokenized_facts
340
+ }, f)
341
+
342
+ final_mem = process.memory_info().rss / 1024 / 1024
343
+ self.logger.info(f"Search index updated. Final memory usage: {final_mem:.2f}MB")
344
+
345
+ async def generate_index_files(self, force_generate_facts: bool = False, clear_bm25_cache: bool = False) -> None:
346
+ """
347
+ Generate index files for all documents in parallel batches
348
+
349
+ Args:
350
+ force_generate_facts (bool): If True, regenerate indexes even if they exist
351
+ clear_bm25_cache (bool): If True, clear existing BM25 index cache
352
+ """
353
+ self.logger.info("Starting index generation for documentation files.")
354
+
355
+ md_files = [
356
+ self.docs_dir / f for f in os.listdir(self.docs_dir)
357
+ if f.endswith('.md') and not any(f.endswith(x) for x in ['.q.md', '.xs.md'])
358
+ ]
359
+
360
+ # Filter out files that already have .q files unless force=True
361
+ if not force_generate_facts:
362
+ md_files = [
363
+ f for f in md_files
364
+ if not (self.docs_dir / f.name.replace('.md', '.q.md')).exists()
365
+ ]
366
+
367
+ if not md_files:
368
+ self.logger.info("All index files exist. Use force=True to regenerate.")
369
+ else:
370
+ # Process documents in batches
371
+ for i in range(0, len(md_files), self.batch_size):
372
+ batch = md_files[i:i + self.batch_size]
373
+ self.logger.info(f"Processing batch {i//self.batch_size + 1}/{(len(md_files)//self.batch_size) + 1}")
374
+ await self._process_document_batch(batch)
375
+
376
+ self.logger.info("Index generation complete, building/updating search index.")
377
+ self.build_search_index(clear_cache=clear_bm25_cache)
378
+
379
+ def generate(self, sections: List[str], mode: str = "extended") -> str:
380
+ # Get all markdown files
381
+ all_files = glob.glob(str(self.docs_dir / "[0-9]*.md")) + \
382
+ glob.glob(str(self.docs_dir / "[0-9]*.xs.md"))
383
+
384
+ # Extract base names without extensions
385
+ base_docs = {Path(f).name.split('.')[0] for f in all_files
386
+ if not Path(f).name.endswith('.q.md')}
387
+
388
+ # Filter by sections if provided
389
+ if sections:
390
+ base_docs = {doc for doc in base_docs
391
+ if any(section.lower() in doc.lower() for section in sections)}
392
+
393
+ # Get file paths based on mode
394
+ files = []
395
+ for doc in sorted(base_docs, key=lambda x: int(x.split('_')[0]) if x.split('_')[0].isdigit() else 999999):
396
+ if mode == "condensed":
397
+ xs_file = self.docs_dir / f"{doc}.xs.md"
398
+ regular_file = self.docs_dir / f"{doc}.md"
399
+ files.append(str(xs_file if xs_file.exists() else regular_file))
400
+ else:
401
+ files.append(str(self.docs_dir / f"{doc}.md"))
402
+
403
+ # Read and format content
404
+ content = []
405
+ for file in files:
406
+ try:
407
+ with open(file, 'r', encoding='utf-8') as f:
408
+ fname = Path(file).name
409
+ content.append(f"{'#'*20}\n# {fname}\n{'#'*20}\n\n{f.read()}")
410
+ except Exception as e:
411
+ self.logger.error(f"Error reading {file}: {str(e)}")
412
+
413
+ return "\n\n---\n\n".join(content) if content else ""
414
+
415
+ def search(self, query: str, top_k: int = 5) -> str:
416
+ if not self.bm25_index:
417
+ return "No search index available. Call build_search_index() first."
418
+
419
+ query_tokens = self.preprocess_text(query)
420
+ doc_scores = self.bm25_index.get_scores(query_tokens)
421
+
422
+ mean_score = np.mean(doc_scores)
423
+ std_score = np.std(doc_scores)
424
+ score_threshold = mean_score + (0.25 * std_score)
425
+
426
+ file_data = self._aggregate_search_scores(
427
+ doc_scores=doc_scores,
428
+ score_threshold=score_threshold,
429
+ query_tokens=query_tokens,
430
+ )
431
+
432
+ ranked_files = sorted(
433
+ file_data.items(),
434
+ key=lambda x: (
435
+ x[1]["code_match_score"] * 2.0
436
+ + x[1]["match_count"] * 1.5
437
+ + x[1]["total_score"]
438
+ ),
439
+ reverse=True,
440
+ )[:top_k]
441
+
442
+ results = []
443
+ for file, _ in ranked_files:
444
+ main_doc = str(file).replace(".q.md", ".md")
445
+ if os.path.exists(self.docs_dir / main_doc):
446
+ with open(self.docs_dir / main_doc, "r", encoding='utf-8') as f:
447
+ only_file_name = main_doc.split("/")[-1]
448
+ content = [
449
+ "#" * 20,
450
+ f"# {only_file_name}",
451
+ "#" * 20,
452
+ "",
453
+ f.read()
454
+ ]
455
+ results.append("\n".join(content))
456
+
457
+ return "\n\n---\n\n".join(results)
458
+
459
+ def _aggregate_search_scores(
460
+ self, doc_scores: List[float], score_threshold: float, query_tokens: List[str]
461
+ ) -> Dict:
462
+ file_data = {}
463
+
464
+ for idx, score in enumerate(doc_scores):
465
+ if score <= score_threshold:
466
+ continue
467
+
468
+ fact = self.tokenized_facts[idx]
469
+ file_path = self.document_map[fact]
470
+
471
+ if file_path not in file_data:
472
+ file_data[file_path] = {
473
+ "total_score": 0,
474
+ "match_count": 0,
475
+ "code_match_score": 0,
476
+ "matched_facts": [],
477
+ }
478
+
479
+ components = fact.split("|") if "|" in fact else [fact]
480
+
481
+ code_match_score = 0
482
+ if len(components) == 3:
483
+ code_ref = components[2].strip()
484
+ code_tokens = self.preprocess_text(code_ref)
485
+ code_match_score = len(set(query_tokens) & set(code_tokens)) / len(query_tokens)
486
+
487
+ file_data[file_path]["total_score"] += score
488
+ file_data[file_path]["match_count"] += 1
489
+ file_data[file_path]["code_match_score"] = max(
490
+ file_data[file_path]["code_match_score"], code_match_score
491
+ )
492
+ file_data[file_path]["matched_facts"].append(fact)
493
+
494
+ return file_data
495
+
496
+ def refresh_index(self) -> None:
497
+ """Convenience method for a full rebuild."""
498
+ self.build_search_index(clear_cache=True)
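llmtxt.py builds a BM25 index over LLM-generated ".q.md" fact files and answers queries by returning the best-matching source documents. The sketch below is an editor's illustration of the intended call sequence, not part of this commit; the docs directory location is assumed, and generate_index_files() needs a configured LLM API key since the code above calls an Anthropic model through litellm.

    # Illustrative sketch only; paths and logger settings are assumptions.
    import asyncio
    from pathlib import Path
    from crawl4ai.async_logger import AsyncLogger, LogLevel
    from crawl4ai.llmtxt import AsyncLLMTextManager

    async def main() -> None:
        docs_dir = Path.home() / ".crawl4ai" / "docs"  # assumed docs location
        manager = AsyncLLMTextManager(docs_dir, logger=AsyncLogger(log_level=LogLevel.DEBUG, verbose=True))
        await manager.generate_index_files()  # creates .q.md fact files, then builds the BM25 index
        print(manager.search("how do I configure a headless browser?", top_k=3))

    # asyncio.run(main())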
crawl4ai/markdown_generation_strategy.py ADDED
@@ -0,0 +1,225 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Optional, Dict, Any, Tuple
3
+ from .models import MarkdownGenerationResult
4
+ from .html2text import CustomHTML2Text
5
+ from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter
6
+ import re
7
+ from urllib.parse import urljoin
8
+
9
+ # Pre-compile the regex pattern
10
+ LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')
11
+
12
+ def fast_urljoin(base: str, url: str) -> str:
13
+ """Fast URL joining for common cases."""
14
+ if url.startswith(('http://', 'https://', 'mailto:', '//')):
15
+ return url
16
+ if url.startswith('/'):
17
+ # Handle absolute paths
18
+ if base.endswith('/'):
19
+ return base[:-1] + url
20
+ return base + url
21
+ return urljoin(base, url)
22
+
23
+ class MarkdownGenerationStrategy(ABC):
24
+ """Abstract base class for markdown generation strategies."""
25
+ def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
26
+ self.content_filter = content_filter
27
+ self.options = options or {}
28
+
29
+ @abstractmethod
30
+ def generate_markdown(self,
31
+ cleaned_html: str,
32
+ base_url: str = "",
33
+ html2text_options: Optional[Dict[str, Any]] = None,
34
+ content_filter: Optional[RelevantContentFilter] = None,
35
+ citations: bool = True,
36
+ **kwargs) -> MarkdownGenerationResult:
37
+ """Generate markdown from cleaned HTML."""
38
+ pass
39
+
40
+ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
41
+ """
42
+ Default implementation of markdown generation strategy.
43
+
44
+ How it works:
45
+ 1. Generate raw markdown from cleaned HTML.
46
+ 2. Convert links to citations.
47
+ 3. Generate fit markdown if content filter is provided.
48
+ 4. Return MarkdownGenerationResult.
49
+
50
+ Args:
51
+ content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
52
+ options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.
53
+
54
+ Returns:
55
+ MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
56
+ """
57
+ def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
58
+ super().__init__(content_filter, options)
59
+
60
+ def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
61
+ """
62
+ Convert links in markdown to citations.
63
+
64
+ How it works:
65
+ 1. Find all links in the markdown.
66
+ 2. Convert links to citations.
67
+ 3. Return converted markdown and references markdown.
68
+
69
+ Note:
70
+ This function uses a regex pattern to find links in markdown.
71
+
72
+ Args:
73
+ markdown (str): Markdown text.
74
+ base_url (str): Base URL for URL joins.
75
+
76
+ Returns:
77
+ Tuple[str, str]: Converted markdown and references markdown.
78
+ """
79
+ link_map = {}
80
+ url_cache = {} # Cache for URL joins
81
+ parts = []
82
+ last_end = 0
83
+ counter = 1
84
+
85
+ for match in LINK_PATTERN.finditer(markdown):
86
+ parts.append(markdown[last_end:match.start()])
87
+ text, url, title = match.groups()
88
+
89
+ # Use cached URL if available, otherwise compute and cache
90
+ if base_url and not url.startswith(('http://', 'https://', 'mailto:')):
91
+ if url not in url_cache:
92
+ url_cache[url] = fast_urljoin(base_url, url)
93
+ url = url_cache[url]
94
+
95
+ if url not in link_map:
96
+ desc = []
97
+ if title: desc.append(title)
98
+ if text and text != title: desc.append(text)
99
+ link_map[url] = (counter, ": " + " - ".join(desc) if desc else "")
100
+ counter += 1
101
+
102
+ num = link_map[url][0]
103
+ parts.append(f"{text}⟨{num}⟩" if not match.group(0).startswith('!') else f"![{text}⟨{num}⟩]")
104
+ last_end = match.end()
105
+
106
+ parts.append(markdown[last_end:])
107
+ converted_text = ''.join(parts)
108
+
109
+ # Pre-build reference strings
110
+ references = ["\n\n## References\n\n"]
111
+ references.extend(
112
+ f"⟨{num}⟩ {url}{desc}\n"
113
+ for url, (num, desc) in sorted(link_map.items(), key=lambda x: x[1][0])
114
+ )
115
+
116
+ return converted_text, ''.join(references)
117
+
118
+ def generate_markdown(self,
119
+ cleaned_html: str,
120
+ base_url: str = "",
121
+ html2text_options: Optional[Dict[str, Any]] = None,
122
+ options: Optional[Dict[str, Any]] = None,
123
+ content_filter: Optional[RelevantContentFilter] = None,
124
+ citations: bool = True,
125
+ **kwargs) -> MarkdownGenerationResult:
126
+ """
127
+ Generate markdown with citations from cleaned HTML.
128
+
129
+ How it works:
130
+ 1. Generate raw markdown from cleaned HTML.
131
+ 2. Convert links to citations.
132
+ 3. Generate fit markdown if content filter is provided.
133
+ 4. Return MarkdownGenerationResult.
134
+
135
+ Args:
136
+ cleaned_html (str): Cleaned HTML content.
137
+ base_url (str): Base URL for URL joins.
138
+ html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
139
+ options (Optional[Dict[str, Any]]): Additional options for markdown generation.
140
+ content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
141
+ citations (bool): Whether to generate citations.
142
+
143
+ Returns:
144
+ MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
145
+ """
146
+ try:
147
+ # Initialize HTML2Text with default options for better conversion
148
+ h = CustomHTML2Text(baseurl=base_url)
149
+ default_options = {
150
+ 'body_width': 0, # Disable text wrapping
151
+ 'ignore_emphasis': False,
152
+ 'ignore_links': False,
153
+ 'ignore_images': False,
154
+ 'protect_links': True,
155
+ 'single_line_break': True,
156
+ 'mark_code': True,
157
+ 'escape_snob': False
158
+ }
159
+
160
+ # Update with custom options if provided
161
+ if html2text_options:
162
+ default_options.update(html2text_options)
163
+ elif options:
164
+ default_options.update(options)
165
+ elif self.options:
166
+ default_options.update(self.options)
167
+
168
+ h.update_params(**default_options)
169
+
170
+ # Ensure we have valid input
171
+ if not cleaned_html:
172
+ cleaned_html = ""
173
+ elif not isinstance(cleaned_html, str):
174
+ cleaned_html = str(cleaned_html)
175
+
176
+ # Generate raw markdown
177
+ try:
178
+ raw_markdown = h.handle(cleaned_html)
179
+ except Exception as e:
180
+ raw_markdown = f"Error converting HTML to markdown: {str(e)}"
181
+
182
+ raw_markdown = raw_markdown.replace(' ```', '```')
183
+
184
+ # Convert links to citations
185
+ markdown_with_citations: str = raw_markdown
186
+ references_markdown: str = ""
187
+ if citations:
188
+ try:
189
+ markdown_with_citations, references_markdown = self.convert_links_to_citations(
190
+ raw_markdown, base_url
191
+ )
192
+ except Exception as e:
193
+ markdown_with_citations = raw_markdown
194
+ references_markdown = f"Error generating citations: {str(e)}"
195
+
196
+ # Generate fit markdown if content filter is provided
197
+ fit_markdown: Optional[str] = ""
198
+ filtered_html: Optional[str] = ""
199
+ if content_filter or self.content_filter:
200
+ try:
201
+ content_filter = content_filter or self.content_filter
202
+ filtered_html = content_filter.filter_content(cleaned_html)
203
+ filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
204
+ fit_markdown = h.handle(filtered_html)
205
+ except Exception as e:
206
+ fit_markdown = f"Error generating fit markdown: {str(e)}"
207
+ filtered_html = ""
208
+
209
+ return MarkdownGenerationResult(
210
+ raw_markdown=raw_markdown or "",
211
+ markdown_with_citations=markdown_with_citations or "",
212
+ references_markdown=references_markdown or "",
213
+ fit_markdown=fit_markdown or "",
214
+ fit_html=filtered_html or "",
215
+ )
216
+ except Exception as e:
217
+ # If anything fails, return empty strings with error message
218
+ error_msg = f"Error in markdown generation: {str(e)}"
219
+ return MarkdownGenerationResult(
220
+ raw_markdown=error_msg,
221
+ markdown_with_citations=error_msg,
222
+ references_markdown="",
223
+ fit_markdown="",
224
+ fit_html="",
225
+ )
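DefaultMarkdownGenerator converts cleaned HTML to markdown, optionally rewriting links as numbered citations and producing a filtered "fit" markdown when a content filter is supplied. A small usage sketch follows as an editor's illustration; the sample HTML is made up.

    # Illustrative sketch only.
    from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

    generator = DefaultMarkdownGenerator()
    result = generator.generate_markdown(
        cleaned_html='<p>See the <a href="/docs" title="Docs">documentation</a>.</p>',
        base_url="https://example.com",
        citations=True,
    )
    print(result.markdown_with_citations)  # links rewritten as ⟨n⟩ citation markers
    print(result.references_markdown)      # "## References" section mapping ⟨n⟩ to URLs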
crawl4ai/migrations.py ADDED
@@ -0,0 +1,168 @@
1
+ import os
2
+ import asyncio
3
+ import logging
4
+ from pathlib import Path
5
+ import aiosqlite
6
+ from typing import Optional
7
+ import xxhash
8
+ import aiofiles
9
+ import shutil
10
+ import time
11
+ from datetime import datetime
12
+ from .async_logger import AsyncLogger, LogLevel
13
+
14
+ # Initialize logger
15
+ logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
16
+
17
+ # logging.basicConfig(level=logging.INFO)
18
+ # logger = logging.getLogger(__name__)
19
+
20
+ class DatabaseMigration:
21
+ def __init__(self, db_path: str):
22
+ self.db_path = db_path
23
+ self.content_paths = self._ensure_content_dirs(os.path.dirname(db_path))
24
+
25
+ def _ensure_content_dirs(self, base_path: str) -> dict:
26
+ dirs = {
27
+ 'html': 'html_content',
28
+ 'cleaned': 'cleaned_html',
29
+ 'markdown': 'markdown_content',
30
+ 'extracted': 'extracted_content',
31
+ 'screenshots': 'screenshots'
32
+ }
33
+ content_paths = {}
34
+ for key, dirname in dirs.items():
35
+ path = os.path.join(base_path, dirname)
36
+ os.makedirs(path, exist_ok=True)
37
+ content_paths[key] = path
38
+ return content_paths
39
+
40
+ def _generate_content_hash(self, content: str) -> str:
41
+ x = xxhash.xxh64()
42
+ x.update(content.encode())
43
+ content_hash = x.hexdigest()
44
+ return content_hash
45
+ # return hashlib.sha256(content.encode()).hexdigest()
46
+
47
+ async def _store_content(self, content: str, content_type: str) -> str:
48
+ if not content:
49
+ return ""
50
+
51
+ content_hash = self._generate_content_hash(content)
52
+ file_path = os.path.join(self.content_paths[content_type], content_hash)
53
+
54
+ if not os.path.exists(file_path):
55
+ async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
56
+ await f.write(content)
57
+
58
+ return content_hash
59
+
60
+ async def migrate_database(self):
61
+ """Migrate existing database to file-based storage"""
62
+ # logger.info("Starting database migration...")
63
+ logger.info("Starting database migration...", tag="INIT")
64
+
65
+ try:
66
+ async with aiosqlite.connect(self.db_path) as db:
67
+ # Get all rows
68
+ async with db.execute(
69
+ '''SELECT url, html, cleaned_html, markdown,
70
+ extracted_content, screenshot FROM crawled_data'''
71
+ ) as cursor:
72
+ rows = await cursor.fetchall()
73
+
74
+ migrated_count = 0
75
+ for row in rows:
76
+ url, html, cleaned_html, markdown, extracted_content, screenshot = row
77
+
78
+ # Store content in files and get hashes
79
+ html_hash = await self._store_content(html, 'html')
80
+ cleaned_hash = await self._store_content(cleaned_html, 'cleaned')
81
+ markdown_hash = await self._store_content(markdown, 'markdown')
82
+ extracted_hash = await self._store_content(extracted_content, 'extracted')
83
+ screenshot_hash = await self._store_content(screenshot, 'screenshots')
84
+
85
+ # Update database with hashes
86
+ await db.execute('''
87
+ UPDATE crawled_data
88
+ SET html = ?,
89
+ cleaned_html = ?,
90
+ markdown = ?,
91
+ extracted_content = ?,
92
+ screenshot = ?
93
+ WHERE url = ?
94
+ ''', (html_hash, cleaned_hash, markdown_hash,
95
+ extracted_hash, screenshot_hash, url))
96
+
97
+ migrated_count += 1
98
+ if migrated_count % 100 == 0:
99
+ logger.info(f"Migrated {migrated_count} records...", tag="INIT")
100
+
101
+
102
+ await db.commit()
103
+ logger.success(f"Migration completed. {migrated_count} records processed.", tag="COMPLETE")
104
+
105
+ except Exception as e:
106
+ # logger.error(f"Migration failed: {e}")
107
+ logger.error(
108
+ message="Migration failed: {error}",
109
+ tag="ERROR",
110
+ params={"error": str(e)}
111
+ )
112
+ raise e
113
+
114
+ async def backup_database(db_path: str) -> str:
115
+ """Create backup of existing database"""
116
+ if not os.path.exists(db_path):
117
+ logger.info("No existing database found. Skipping backup.", tag="INIT")
118
+ return None
119
+
120
+ # Create backup with timestamp
121
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
122
+ backup_path = f"{db_path}.backup_{timestamp}"
123
+
124
+ try:
125
+ # Wait for any potential write operations to finish
126
+ await asyncio.sleep(1)
127
+
128
+ # Create backup
129
+ shutil.copy2(db_path, backup_path)
130
+ logger.info(f"Database backup created at: {backup_path}", tag="COMPLETE")
131
+ return backup_path
132
+ except Exception as e:
133
+ # logger.error(f"Backup failed: {e}")
134
+ logger.error(
135
+ message="Backup failed: {error}",
136
+ tag="ERROR",
137
+ params={"error": str(e)}
138
+ )
139
+ raise e
140
+
141
+ async def run_migration(db_path: Optional[str] = None):
142
+ """Run database migration"""
143
+ if db_path is None:
144
+ db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db")
145
+
146
+ if not os.path.exists(db_path):
147
+ logger.info("No existing database found. Skipping migration.", tag="INIT")
148
+ return
149
+
150
+ # Create backup first
151
+ backup_path = await backup_database(db_path)
152
+ if not backup_path:
153
+ return
154
+
155
+ migration = DatabaseMigration(db_path)
156
+ await migration.migrate_database()
157
+
158
+ def main():
159
+ """CLI entry point for migration"""
160
+ import argparse
161
+ parser = argparse.ArgumentParser(description='Migrate Crawl4AI database to file-based storage')
162
+ parser.add_argument('--db-path', help='Custom database path')
163
+ args = parser.parse_args()
164
+
165
+ asyncio.run(run_migration(args.db_path))
166
+
167
+ if __name__ == "__main__":
168
+ main()
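The migration moves bulky page content out of SQLite into hash-named files and rewrites each column to hold the content hash. Besides the CLI entry point above, it can be driven programmatically; the sketch below is an editor's illustration using the default database path assumed by run_migration().

    # Illustrative sketch only; backs up ~/.crawl4ai/crawl4ai.db before rewriting rows.
    import asyncio
    from crawl4ai.migrations import run_migration

    asyncio.run(run_migration())  # or pass a custom path: run_migration("/path/to/crawl4ai.db")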
crawl4ai/model_loader.py ADDED
@@ -0,0 +1,256 @@
1
+ from functools import lru_cache
2
+ from pathlib import Path
3
+ import subprocess, os
4
+ import shutil
5
+ import tarfile
6
+ from .model_loader import *
7
+ import argparse
8
+ import urllib.request
9
+ from crawl4ai.config import MODEL_REPO_BRANCH
10
+ __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
11
+
12
+ @lru_cache()
13
+ def get_available_memory(device):
14
+ import torch
15
+ if device.type == 'cuda':
16
+ return torch.cuda.get_device_properties(device).total_memory
17
+ elif device.type == 'mps':
18
+ return 48 * 1024 ** 3 # Assuming 48GB of unified memory for MPS
19
+ else:
20
+ return 0
21
+
22
+ @lru_cache()
23
+ def calculate_batch_size(device):
24
+ available_memory = get_available_memory(device)
25
+
26
+ if device.type == 'cpu':
27
+ return 16
28
+ elif device.type in ['cuda', 'mps']:
29
+ # Adjust these thresholds based on your model size and available memory
30
+ if available_memory >= 31 * 1024 ** 3: # roughly 32GB or more
31
+ return 256
32
+ elif available_memory >= 15 * 1024 ** 3: # roughly 16GB to 32GB
33
+ return 128
34
+ elif available_memory >= 8 * 1024 ** 3: # 8GB to 16GB
35
+ return 64
36
+ else:
37
+ return 32
38
+ else:
39
+ return 16 # Default batch size
40
+
41
+ @lru_cache()
42
+ def get_device():
43
+ import torch
44
+ if torch.cuda.is_available():
45
+ device = torch.device('cuda')
46
+ elif torch.backends.mps.is_available():
47
+ device = torch.device('mps')
48
+ else:
49
+ device = torch.device('cpu')
50
+ return device
51
+
52
+ def set_model_device(model):
53
+ device = get_device()
54
+ model.to(device)
55
+ return model, device
56
+
57
+ @lru_cache()
58
+ def get_home_folder():
59
+ home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
60
+ os.makedirs(home_folder, exist_ok=True)
61
+ os.makedirs(f"{home_folder}/cache", exist_ok=True)
62
+ os.makedirs(f"{home_folder}/models", exist_ok=True)
63
+ return home_folder
64
+
65
+ @lru_cache()
66
+ def load_bert_base_uncased():
67
+ from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
68
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=None)
69
+ model = BertModel.from_pretrained('bert-base-uncased', resume_download=None)
70
+ model.eval()
71
+ model, device = set_model_device(model)
72
+ return tokenizer, model
73
+
74
+ @lru_cache()
75
+ def load_HF_embedding_model(model_name="BAAI/bge-small-en-v1.5") -> tuple:
76
+ """Load the Hugging Face model for embedding.
77
+
78
+ Args:
79
+ model_name (str, optional): The model name to load. Defaults to "BAAI/bge-small-en-v1.5".
80
+
81
+ Returns:
82
+ tuple: The tokenizer and model.
83
+ """
84
+ from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
85
+ tokenizer = AutoTokenizer.from_pretrained(model_name, resume_download=None)
86
+ model = AutoModel.from_pretrained(model_name, resume_download=None)
87
+ model.eval()
88
+ model, device = set_model_device(model)
89
+ return tokenizer, model
90
+
91
+ @lru_cache()
92
+ def load_text_classifier():
93
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
94
+ from transformers import pipeline
95
+ import torch
96
+
97
+ tokenizer = AutoTokenizer.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
98
+ model = AutoModelForSequenceClassification.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
99
+ model.eval()
100
+ model, device = set_model_device(model)
101
+ pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
102
+ return pipe
103
+
104
+ @lru_cache()
105
+ def load_text_multilabel_classifier():
106
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
107
+ import numpy as np
108
+ from scipy.special import expit
109
+ import torch
110
+
111
+ # # Check for available device: CUDA, MPS (for Apple Silicon), or CPU
112
+ # if torch.cuda.is_available():
113
+ # device = torch.device("cuda")
114
+ # elif torch.backends.mps.is_available():
115
+ # device = torch.device("mps")
116
+ # else:
117
+ # device = torch.device("cpu")
118
+ # # return load_spacy_model(), torch.device("cpu")
119
+
120
+
121
+ MODEL = "cardiffnlp/tweet-topic-21-multi"
122
+ tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
123
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
124
+ model.eval()
125
+ model, device = set_model_device(model)
126
+ class_mapping = model.config.id2label
127
+
128
+ def _classifier(texts, threshold=0.5, max_length=64):
129
+ tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
130
+ tokens = {key: val.to(device) for key, val in tokens.items()} # Move tokens to the selected device
131
+
132
+ with torch.no_grad():
133
+ output = model(**tokens)
134
+
135
+ scores = output.logits.detach().cpu().numpy()
136
+ scores = expit(scores)
137
+ predictions = (scores >= threshold) * 1
138
+
139
+ batch_labels = []
140
+ for prediction in predictions:
141
+ labels = [class_mapping[i] for i, value in enumerate(prediction) if value == 1]
142
+ batch_labels.append(labels)
143
+
144
+ return batch_labels
145
+
146
+ return _classifier, device
147
+
148
+ @lru_cache()
149
+ def load_nltk_punkt():
150
+ import nltk
151
+ try:
152
+ nltk.data.find('tokenizers/punkt')
153
+ except LookupError:
154
+ nltk.download('punkt')
155
+ return nltk.data.find('tokenizers/punkt')
156
+
157
+ @lru_cache()
158
+ def load_spacy_model():
159
+ import spacy
160
+ name = "models/reuters"
161
+ home_folder = get_home_folder()
162
+ model_folder = Path(home_folder) / name
163
+
164
+ # Check if the model directory already exists
165
+ if not (model_folder.exists() and any(model_folder.iterdir())):
166
+ repo_url = "https://github.com/unclecode/crawl4ai.git"
167
+ branch = MODEL_REPO_BRANCH
168
+ repo_folder = Path(home_folder) / "crawl4ai"
169
+
170
+ print("[LOG] ⏬ Downloading Spacy model for the first time...")
171
+
172
+ # Remove existing repo folder if it exists
173
+ if repo_folder.exists():
174
+ try:
175
+ shutil.rmtree(repo_folder)
176
+ if model_folder.exists():
177
+ shutil.rmtree(model_folder)
178
+ except PermissionError:
179
+ print("[WARNING] Unable to remove existing folders. Please manually delete the following folders and try again:")
180
+ print(f"- {repo_folder}")
181
+ print(f"- {model_folder}")
182
+ return None
183
+
184
+ try:
185
+ # Clone the repository
186
+ subprocess.run(
187
+ ["git", "clone", "-b", branch, repo_url, str(repo_folder)],
188
+ stdout=subprocess.DEVNULL,
189
+ stderr=subprocess.DEVNULL,
190
+ check=True
191
+ )
192
+
193
+ # Create the models directory if it doesn't exist
194
+ models_folder = Path(home_folder) / "models"
195
+ models_folder.mkdir(parents=True, exist_ok=True)
196
+
197
+ # Copy the reuters model folder to the models directory
198
+ source_folder = repo_folder / "models" / "reuters"
199
+ shutil.copytree(source_folder, model_folder)
200
+
201
+ # Remove the cloned repository
202
+ shutil.rmtree(repo_folder)
203
+
204
+ print("[LOG] ✅ Spacy Model downloaded successfully")
205
+ except subprocess.CalledProcessError as e:
206
+ print(f"An error occurred while cloning the repository: {e}")
207
+ return None
208
+ except Exception as e:
209
+ print(f"An error occurred: {e}")
210
+ return None
211
+
212
+ try:
213
+ return spacy.load(str(model_folder))
214
+ except Exception as e:
215
+ print(f"Error loading spacy model: {e}")
216
+ return None
217
+
218
+ def download_all_models(remove_existing=False):
219
+ """Download all models required for Crawl4AI."""
220
+ if remove_existing:
221
+ print("[LOG] Removing existing models...")
222
+ home_folder = get_home_folder()
223
+ model_folders = [
224
+ os.path.join(home_folder, "models/reuters"),
225
+ os.path.join(home_folder, "models"),
226
+ ]
227
+ for folder in model_folders:
228
+ if Path(folder).exists():
229
+ shutil.rmtree(folder)
230
+ print("[LOG] Existing models removed.")
231
+
232
+ # Load each model to trigger download
233
+ # print("[LOG] Downloading BERT Base Uncased...")
234
+ # load_bert_base_uncased()
235
+ # print("[LOG] Downloading BGE Small EN v1.5...")
236
+ # load_bge_small_en_v1_5()
237
+ # print("[LOG] Downloading ONNX model...")
238
+ # load_onnx_all_MiniLM_l6_v2()
239
+ print("[LOG] Downloading text classifier...")
240
+ _, device = load_text_multilabel_classifier()
241
+ print(f"[LOG] Text classifier loaded on {device}")
242
+ print("[LOG] Downloading custom NLTK Punkt model...")
243
+ load_nltk_punkt()
244
+ print("[LOG] ✅ All models downloaded successfully.")
245
+
246
+ def main():
247
+ print("[LOG] Welcome to the Crawl4AI Model Downloader!")
248
+ print("[LOG] This script will download all the models required for Crawl4AI.")
249
+ parser = argparse.ArgumentParser(description="Crawl4AI Model Downloader")
250
+ parser.add_argument('--remove-existing', action='store_true', help="Remove existing models before downloading")
251
+ args = parser.parse_args()
252
+
253
+ download_all_models(remove_existing=args.remove_existing)
254
+
255
+ if __name__ == "__main__":
256
+ main()
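model_loader.py lazily loads the optional ML models (embeddings, classifiers, spaCy, NLTK) and picks a device and batch size from available memory. A short usage sketch, offered as an editor's illustration; the first call downloads the model from Hugging Face, so it may be slow.

    # Illustrative sketch only.
    from crawl4ai.model_loader import load_HF_embedding_model, get_device, calculate_batch_size

    tokenizer, model = load_HF_embedding_model("BAAI/bge-small-en-v1.5")
    device = get_device()
    print(f"Embedding model ready on {device}, suggested batch size {calculate_batch_size(device)}")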
crawl4ai/models.py ADDED
@@ -0,0 +1,61 @@
1
+ from pydantic import BaseModel, HttpUrl
2
+ from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
3
+ from dataclasses import dataclass
4
+ from .ssl_certificate import SSLCertificate
5
+
6
+ @dataclass
7
+ class TokenUsage:
8
+ completion_tokens: int = 0
9
+ prompt_tokens: int = 0
10
+ total_tokens: int = 0
11
+ completion_tokens_details: Optional[dict] = None
12
+ prompt_tokens_details: Optional[dict] = None
13
+
14
+
15
+ class UrlModel(BaseModel):
16
+ url: HttpUrl
17
+ forced: bool = False
18
+
19
+ class MarkdownGenerationResult(BaseModel):
20
+ raw_markdown: str
21
+ markdown_with_citations: str
22
+ references_markdown: str
23
+ fit_markdown: Optional[str] = None
24
+ fit_html: Optional[str] = None
25
+
26
+ class CrawlResult(BaseModel):
27
+ url: str
28
+ html: str
29
+ success: bool
30
+ cleaned_html: Optional[str] = None
31
+ media: Dict[str, List[Dict]] = {}
32
+ links: Dict[str, List[Dict]] = {}
33
+ downloaded_files: Optional[List[str]] = None
34
+ screenshot: Optional[str] = None
35
+ pdf: Optional[bytes] = None
36
+ markdown: Optional[Union[str, MarkdownGenerationResult]] = None
37
+ markdown_v2: Optional[MarkdownGenerationResult] = None
38
+ fit_markdown: Optional[str] = None
39
+ fit_html: Optional[str] = None
40
+ extracted_content: Optional[str] = None
41
+ metadata: Optional[dict] = None
42
+ error_message: Optional[str] = None
43
+ session_id: Optional[str] = None
44
+ response_headers: Optional[dict] = None
45
+ status_code: Optional[int] = None
46
+ ssl_certificate: Optional[SSLCertificate] = None
47
+ class Config:
48
+ arbitrary_types_allowed = True
49
+
50
+ class AsyncCrawlResponse(BaseModel):
51
+ html: str
52
+ response_headers: Dict[str, str]
53
+ status_code: int
54
+ screenshot: Optional[str] = None
55
+ pdf_data: Optional[bytes] = None
56
+ get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
57
+ downloaded_files: Optional[List[str]] = None
58
+ ssl_certificate: Optional[SSLCertificate] = None
59
+
60
+ class Config:
61
+ arbitrary_types_allowed = True
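models.py defines the pydantic result types returned by the crawler. The sketch below is an editor's illustration with invented field values; it shows that a CrawlResult can carry either a plain markdown string or a full MarkdownGenerationResult.

    # Illustrative sketch only; values are made up.
    from crawl4ai.models import CrawlResult, MarkdownGenerationResult

    md = MarkdownGenerationResult(
        raw_markdown="# Title",
        markdown_with_citations="# Title",
        references_markdown="",
    )
    result = CrawlResult(
        url="https://example.com",
        html="<html>...</html>",
        success=True,
        markdown=md,        # Union[str, MarkdownGenerationResult]
        status_code=200,
    )
    print(result.url, result.success, result.status_code)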
crawl4ai/prompts.py ADDED
@@ -0,0 +1,204 @@
1
+ PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
2
+ <url>{URL}</url>
3
+
4
+ And here is the cleaned HTML content of that webpage:
5
+ <html>
6
+ {HTML}
7
+ </html>
8
+
9
+ Your task is to break down this HTML content into semantically relevant blocks, and for each block, generate a JSON object with the following keys:
10
+
11
+ - index: an integer representing the index of the block in the content
12
+ - tags: a list of semantic tags that are relevant to the content of the block
13
+ - content: a list of strings containing the text content of the block
14
+ - questions: a list of 3 questions that a user may ask about the content in this block
15
+
16
+ To generate the JSON objects:
17
+
18
+ 1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks.
19
+
20
+ 2. For each block:
21
+ a. Assign it an index based on its order in the content.
22
+ b. Analyze the content and generate a list of relevant semantic tags that describe what the block is about.
23
+ c. Extract the text content, clean it up if needed, and store it as a list of strings in the "content" field.
24
+ d. Come up with 3 questions that a user might ask about this specific block of content, based on the tags and content. The questions should be relevant and answerable by the content in the block.
25
+
26
+ 3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.
27
+
28
+ 4. Double-check that each JSON object includes all required keys (index, tags, content, questions) and that the values are in the expected format (integer, list of strings, etc.).
29
+
30
+ 5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
31
+
32
+ 6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
33
+
34
+ Please provide your output within <blocks> tags, like this:
35
+
36
+ <blocks>
37
+ [{
38
+ "index": 0,
39
+ "tags": ["introduction", "overview"],
40
+ "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."],
41
+ "questions": [
42
+ "What is the main topic of this article?",
43
+ "What can I expect to learn from reading this article?",
44
+ "Is this article suitable for beginners or experts in the field?"
45
+ ]
46
+ },
47
+ {
48
+ "index": 1,
49
+ "tags": ["history", "background"],
50
+ "content": ["This is the second paragraph, which delves into the history and background of the topic.",
51
+ "It provides context and sets the stage for the rest of the article."],
52
+ "questions": [
53
+ "What historical events led to the development of this topic?",
54
+ "How has the understanding of this topic evolved over time?",
55
+ "What are some key milestones in the history of this topic?"
56
+ ]
57
+ }]
58
+ </blocks>
59
+
60
+ Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""
61
+
62
+ PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
63
+ <url>{URL}</url>
64
+
65
+ And here is the cleaned HTML content of that webpage:
66
+ <html>
67
+ {HTML}
68
+ </html>
69
+
70
+ Your task is to break down this HTML content into semantically relevant blocks, and for each block, generate a JSON object with the following keys:
71
+
72
+ - index: an integer representing the index of the block in the content
73
+ - content: a list of strings containing the text content of the block
74
+
75
+ To generate the JSON objects:
76
+
77
+ 1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks.
78
+
79
+ 2. For each block:
80
+ a. Assign it an index based on its order in the content.
81
+ b. Analyze the content and generate ONE semantic tag that describes what the block is about.
82
+ c. Extract the text content, EXACTLY THE SAME AS THE GIVEN DATA, clean it up if needed, and store it as a list of strings in the "content" field.
83
+
84
+ 3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.
85
+
86
+ 4. Double-check that each JSON object includes all required keys (index, tag, content) and that the values are in the expected format (integer, list of strings, etc.).
87
+
88
+ 5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
89
+
90
+ 6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
91
+
92
+ 7. Never alter the extracted content, just copy and paste it as it is.
93
+
94
+ Please provide your output within <blocks> tags, like this:
95
+
96
+ <blocks>
97
+ [{
98
+ "index": 0,
99
+ "tags": ["introduction"],
100
+ "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."]
101
+ },
102
+ {
103
+ "index": 1,
104
+ "tags": ["background"],
105
+ "content": ["This is the second paragraph, which delves into the history and background of the topic.",
106
+ "It provides context and sets the stage for the rest of the article."]
107
+ }]
108
+ </blocks>
109
+
110
+ Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""
111
+
112
+ PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION = """Here is the URL of the webpage:
113
+ <url>{URL}</url>
114
+
115
+ And here is the cleaned HTML content of that webpage:
116
+ <html>
117
+ {HTML}
118
+ </html>
119
+
120
+ Your task is to break down this HTML content into semantically relevant blocks, following the provided user's REQUEST, and for each block, generate a JSON object with the following keys:
121
+
122
+ - index: an integer representing the index of the block in the content
123
+ - content: a list of strings containing the text content of the block
124
+
125
+ This is the user's REQUEST, pay attention to it:
126
+ <request>
127
+ {REQUEST}
128
+ </request>
129
+
130
+ To generate the JSON objects:
131
+
132
+ 1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks.
133
+
134
+ 2. For each block:
135
+ a. Assign it an index based on its order in the content.
136
+ b. Analyze the content and generate ONE semantic tag that describes what the block is about.
137
+ c. Extract the text content, EXACTLY THE SAME AS THE GIVEN DATA, clean it up if needed, and store it as a list of strings in the "content" field.
138
+
139
+ 3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.
140
+
141
+ 4. Double-check that each JSON object includes all required keys (index, tag, content) and that the values are in the expected format (integer, list of strings, etc.).
142
+
143
+ 5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
144
+
145
+ 6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
146
+
147
+ 7. Never alter the extracted content, just copy and paste it as it is.
148
+
149
+ Please provide your output within <blocks> tags, like this:
150
+
151
+ <blocks>
152
+ [{
153
+ "index": 0,
154
+ "tags": ["introduction"],
155
+ "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."]
156
+ },
157
+ {
158
+ "index": 1,
159
+ "tags": ["background"],
160
+ "content": ["This is the second paragraph, which delves into the history and background of the topic.",
161
+ "It provides context and sets the stage for the rest of the article."]
162
+ }]
163
+ </blocks>
164
+
165
+ **Make sure to follow the user's instruction and extract blocks that align with it.**
166
+
167
+ Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""
168
+
169
+ PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION = """Here is the content from the URL:
170
+ <url>{URL}</url>
171
+
172
+ <url_content>
173
+ {HTML}
174
+ </url_content>
175
+
176
+ The user has made the following request for what information to extract from the above content:
177
+
178
+ <user_request>
179
+ {REQUEST}
180
+ </user_request>
181
+
182
+ <schema_block>
183
+ {SCHEMA}
184
+ </schema_block>
185
+
186
+ Please carefully read the URL content and the user's request. If the user provided a desired JSON schema in the <schema_block> above, extract the requested information from the URL content according to that schema. If no schema was provided, infer an appropriate JSON schema based on the user's request that will best capture the key information they are looking for.
187
+
188
+ Extraction instructions:
189
+ Return the extracted information as a list of JSON objects, with each object in the list corresponding to a block of content from the URL, in the same order as it appears on the page. Wrap the entire JSON list in <blocks>...</blocks> XML tags.
190
+
191
+ Quality Reflection:
192
+ Before outputting your final answer, double check that the JSON you are returning is complete, containing all the information requested by the user, and is valid JSON that could be parsed by json.loads() with no errors or omissions. The outputted JSON objects should fully match the schema, either provided or inferred.
193
+
194
+ Quality Score:
195
+ After reflecting, score the quality and completeness of the JSON data you are about to return on a scale of 1 to 5. Write the score inside <score> tags.
196
+
197
+ Avoid Common Mistakes:
198
+ - Do NOT add any comments using "//" or "#" in the JSON output. It causes parsing errors.
199
+ - Make sure the JSON is properly formatted with curly braces, square brackets, and commas in the right places.
200
+ - Do not omit the closing </blocks> tag at the end of the JSON output.
201
+ - Do not generate Python code showing how to do the task; your task is to extract the information and return it in JSON format.
202
+
203
+ Result:
204
+ Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""
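Compared with the block prompts, this schema prompt adds {REQUEST} and {SCHEMA} placeholders and asks the model to emit a <score> tag alongside the <blocks> payload. A minimal sketch of reading both values back out of a raw completion string is shown below; the function name is illustrative and not part of prompts.py.

import json
import re

def parse_schema_extraction(llm_response: str):
    # Returns (blocks, score) following the contract described in the prompt.
    blocks_match = re.search(r"<blocks>(.*?)</blocks>", llm_response, re.DOTALL)
    score_match = re.search(r"<score>\s*(\d+)\s*</score>", llm_response)
    blocks = json.loads(blocks_match.group(1)) if blocks_match else []
    score = int(score_match.group(1)) if score_match else None
    return blocks, score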
crawl4ai/ssl_certificate.py ADDED
@@ -0,0 +1,181 @@
1
+ """SSL Certificate class for handling certificate operations."""
2
+
3
+ import ssl
4
+ import socket
5
+ import base64
6
+ import json
7
+ from typing import Dict, Any, Optional
8
+ from urllib.parse import urlparse
9
+ import OpenSSL.crypto
10
+ from pathlib import Path
11
+
12
+
13
+ class SSLCertificate:
14
+ """
15
+ A class representing an SSL certificate with methods to export in various formats.
16
+
17
+ Attributes:
18
+ cert_info (Dict[str, Any]): The certificate information.
19
+
20
+ Methods:
21
+ from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: Create SSLCertificate instance from a URL.
22
+ to_json(filepath: Optional[str] = None) -> Optional[str]: Export the certificate as JSON, optionally writing it to a file.
23
+ to_pem(filepath: Optional[str] = None) -> Optional[str]: Export the certificate as PEM, optionally writing it to a file.
24
+ to_der(filepath: Optional[str] = None) -> Optional[bytes]: Export the certificate as DER, optionally writing it to a file.
25
+
26
+ Properties:
27
+ issuer, subject, valid_from, valid_until, fingerprint: Convenience accessors for common certificate fields.
28
+ """
29
+ def __init__(self, cert_info: Dict[str, Any]):
30
+ self._cert_info = self._decode_cert_data(cert_info)
31
+
32
+ @staticmethod
33
+ def from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']:
34
+ """
35
+ Create SSLCertificate instance from a URL.
36
+
37
+ Args:
38
+ url (str): URL of the website.
39
+ timeout (int): Timeout for the connection (default: 10).
40
+
41
+ Returns:
42
+ Optional[SSLCertificate]: SSLCertificate instance if successful, None otherwise.
43
+ """
44
+ try:
45
+ hostname = urlparse(url).netloc
46
+ if ':' in hostname:
47
+ hostname = hostname.split(':')[0]
48
+
49
+ context = ssl.create_default_context()
50
+ with socket.create_connection((hostname, 443), timeout=timeout) as sock:
51
+ with context.wrap_socket(sock, server_hostname=hostname) as ssock:
52
+ cert_binary = ssock.getpeercert(binary_form=True)
53
+ x509 = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_ASN1, cert_binary)
54
+
55
+ cert_info = {
56
+ "subject": dict(x509.get_subject().get_components()),
57
+ "issuer": dict(x509.get_issuer().get_components()),
58
+ "version": x509.get_version(),
59
+ "serial_number": hex(x509.get_serial_number()),
60
+ "not_before": x509.get_notBefore(),
61
+ "not_after": x509.get_notAfter(),
62
+ "fingerprint": x509.digest("sha256").hex(),
63
+ "signature_algorithm": x509.get_signature_algorithm(),
64
+ "raw_cert": base64.b64encode(cert_binary)
65
+ }
66
+
67
+ # Add extensions
68
+ extensions = []
69
+ for i in range(x509.get_extension_count()):
70
+ ext = x509.get_extension(i)
71
+ extensions.append({
72
+ "name": ext.get_short_name(),
73
+ "value": str(ext)
74
+ })
75
+ cert_info["extensions"] = extensions
76
+
77
+ return SSLCertificate(cert_info)
78
+
79
+ except Exception:
80
+ return None
81
+
82
+ @staticmethod
83
+ def _decode_cert_data(data: Any) -> Any:
84
+ """Helper method to decode bytes in certificate data."""
85
+ if isinstance(data, bytes):
86
+ return data.decode('utf-8')
87
+ elif isinstance(data, dict):
88
+ return {
89
+ (k.decode('utf-8') if isinstance(k, bytes) else k): SSLCertificate._decode_cert_data(v)
90
+ for k, v in data.items()
91
+ }
92
+ elif isinstance(data, list):
93
+ return [SSLCertificate._decode_cert_data(item) for item in data]
94
+ return data
95
+
96
+ def to_json(self, filepath: Optional[str] = None) -> Optional[str]:
97
+ """
98
+ Export certificate as JSON.
99
+
100
+ Args:
101
+ filepath (Optional[str]): Path to save the JSON file (default: None).
102
+
103
+ Returns:
104
+ Optional[str]: JSON string if successful, None otherwise.
105
+ """
106
+ json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False)
107
+ if filepath:
108
+ Path(filepath).write_text(json_str, encoding='utf-8')
109
+ return None
110
+ return json_str
111
+
112
+ def to_pem(self, filepath: Optional[str] = None) -> Optional[str]:
113
+ """
114
+ Export certificate as PEM.
115
+
116
+ Args:
117
+ filepath (Optional[str]): Path to save the PEM file (default: None).
118
+
119
+ Returns:
120
+ Optional[str]: PEM string if successful, None otherwise.
121
+ """
122
+ try:
123
+ x509 = OpenSSL.crypto.load_certificate(
124
+ OpenSSL.crypto.FILETYPE_ASN1,
125
+ base64.b64decode(self._cert_info['raw_cert'])
126
+ )
127
+ pem_data = OpenSSL.crypto.dump_certificate(
128
+ OpenSSL.crypto.FILETYPE_PEM,
129
+ x509
130
+ ).decode('utf-8')
131
+
132
+ if filepath:
133
+ Path(filepath).write_text(pem_data, encoding='utf-8')
134
+ return None
135
+ return pem_data
136
+ except Exception:
137
+ return None
138
+
139
+ def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]:
140
+ """
141
+ Export certificate as DER.
142
+
143
+ Args:
144
+ filepath (Optional[str]): Path to save the DER file (default: None).
145
+
146
+ Returns:
147
+ Optional[bytes]: DER bytes if successful, None otherwise.
148
+ """
149
+ try:
150
+ der_data = base64.b64decode(self._cert_info['raw_cert'])
151
+ if filepath:
152
+ Path(filepath).write_bytes(der_data)
153
+ return None
154
+ return der_data
155
+ except Exception:
156
+ return None
157
+
158
+ @property
159
+ def issuer(self) -> Dict[str, str]:
160
+ """Get certificate issuer information."""
161
+ return self._cert_info.get('issuer', {})
162
+
163
+ @property
164
+ def subject(self) -> Dict[str, str]:
165
+ """Get certificate subject information."""
166
+ return self._cert_info.get('subject', {})
167
+
168
+ @property
169
+ def valid_from(self) -> str:
170
+ """Get certificate validity start date."""
171
+ return self._cert_info.get('not_before', '')
172
+
173
+ @property
174
+ def valid_until(self) -> str:
175
+ """Get certificate validity end date."""
176
+ return self._cert_info.get('not_after', '')
177
+
178
+ @property
179
+ def fingerprint(self) -> str:
180
+ """Get certificate fingerprint."""
181
+ return self._cert_info.get('fingerprint', '')
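Taken together, the class supports a simple fetch-then-export flow: grab the certificate from a host, inspect a few fields, and write it out in whichever format is needed. A short usage sketch follows; the URL and file names are illustrative, and from_url returns None on any connection or parsing failure, so the result should be checked.

from crawl4ai.ssl_certificate import SSLCertificate

cert = SSLCertificate.from_url("https://example.com")  # illustrative URL
if cert is not None:
    print(cert.issuer)       # decoded issuer components
    print(cert.fingerprint)  # SHA-256 fingerprint string
    cert.to_pem("example_cert.pem")  # writes PEM to disk and returns None when a path is given
    print(cert.to_json())            # returns the JSON string when no path is given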
crawl4ai/user_agent_generator.py ADDED
@@ -0,0 +1,305 @@
1
+ import random
2
+ from typing import Optional, Literal, List, Dict, Tuple
3
+ import re
4
+
5
+
6
+ class UserAgentGenerator:
7
+ """
8
+ Generate random user agents with specified constraints.
9
+
10
+ Attributes:
11
+ desktop_platforms (dict): A dictionary of possible desktop platforms and their corresponding user agent strings.
12
+ mobile_platforms (dict): A dictionary of possible mobile platforms and their corresponding user agent strings.
13
+ browser_combinations (dict): A dictionary mapping the number of browser tokens to valid browser combinations.
14
+ rendering_engines (dict): A dictionary of possible rendering engines and their corresponding version tokens.
15
+ chrome_versions (list): A list of possible Chrome browser versions.
16
+ firefox_versions (list): A list of possible Firefox browser versions.
17
+ edge_versions (list): A list of possible Edge browser versions.
18
+ safari_versions (list): A list of possible Safari browser versions.
19
+ ios_versions (list): A list of possible iOS browser versions.
20
+ android_versions (list): A list of possible Android browser versions.
21
+
22
+ Methods:
23
+ generate(
24
+ device_type: Optional[Literal['desktop', 'mobile']] = None,
25
+ os_type: Optional[str] = None,
26
+ device_brand: Optional[str] = None,
27
+ browser_type: Optional[Literal['chrome', 'edge', 'safari', 'firefox']] = None,
28
+ num_browsers: int = 3
29
+ ): Generates a random user agent string based on the specified constraints.
30
+ generate_with_client_hints(**kwargs): Generates a user agent together with a matching Sec-CH-UA client hints value.
31
+ get_browser_stack(num_browsers: int = 1): Returns a valid combination of browser version tokens.
32
+ parse_user_agent(user_agent: str): Extracts browser names and major versions from a user agent string.
33
+ generate_client_hints(user_agent: str): Builds a Sec-CH-UA header value for a given user agent string.
34
+ """
35
+ def __init__(self):
36
+ # Platform definitions
37
+ self.desktop_platforms = {
38
+ "windows": {
39
+ "10_64": "(Windows NT 10.0; Win64; x64)",
40
+ "10_32": "(Windows NT 10.0; WOW64)",
41
+ },
42
+ "macos": {
43
+ "intel": "(Macintosh; Intel Mac OS X 10_15_7)",
44
+ "newer": "(Macintosh; Intel Mac OS X 10.15; rv:109.0)",
45
+ },
46
+ "linux": {
47
+ "generic": "(X11; Linux x86_64)",
48
+ "ubuntu": "(X11; Ubuntu; Linux x86_64)",
49
+ "chrome_os": "(X11; CrOS x86_64 14541.0.0)",
50
+ }
51
+ }
52
+
53
+ self.mobile_platforms = {
54
+ "android": {
55
+ "samsung": "(Linux; Android 13; SM-S901B)",
56
+ "pixel": "(Linux; Android 12; Pixel 6)",
57
+ "oneplus": "(Linux; Android 13; OnePlus 9 Pro)",
58
+ "xiaomi": "(Linux; Android 12; M2102J20SG)",
59
+ },
60
+ "ios": {
61
+ "iphone": "(iPhone; CPU iPhone OS 16_5 like Mac OS X)",
62
+ "ipad": "(iPad; CPU OS 16_5 like Mac OS X)",
63
+ }
64
+ }
65
+
66
+ # Browser Combinations
67
+ self.browser_combinations = {
68
+ 1: [
69
+ ["chrome"],
70
+ ["firefox"],
71
+ ["safari"],
72
+ ["edge"]
73
+ ],
74
+ 2: [
75
+ ["gecko", "firefox"],
76
+ ["chrome", "safari"],
77
+ ["webkit", "safari"]
78
+ ],
79
+ 3: [
80
+ ["chrome", "safari", "edge"],
81
+ ["webkit", "chrome", "safari"]
82
+ ]
83
+ }
84
+
85
+ # Rendering Engines with versions
86
+ self.rendering_engines = {
87
+ "chrome_webkit": "AppleWebKit/537.36",
88
+ "safari_webkit": "AppleWebKit/605.1.15",
89
+ "gecko": [ # Added Gecko versions
90
+ "Gecko/20100101",
91
+ "Gecko/20100101", # Firefox usually uses this constant version
92
+ "Gecko/2010010",
93
+ ]
94
+ }
95
+
96
+ # Browser Versions
97
+ self.chrome_versions = [
98
+ "Chrome/119.0.6045.199",
99
+ "Chrome/118.0.5993.117",
100
+ "Chrome/117.0.5938.149",
101
+ "Chrome/116.0.5845.187",
102
+ "Chrome/115.0.5790.171",
103
+ ]
104
+
105
+ self.edge_versions = [
106
+ "Edg/119.0.2151.97",
107
+ "Edg/118.0.2088.76",
108
+ "Edg/117.0.2045.47",
109
+ "Edg/116.0.1938.81",
110
+ "Edg/115.0.1901.203",
111
+ ]
112
+
113
+ self.safari_versions = [
114
+ "Safari/537.36", # For Chrome-based
115
+ "Safari/605.1.15",
116
+ "Safari/604.1",
117
+ "Safari/602.1",
118
+ "Safari/601.5.17",
119
+ ]
120
+
121
+ # Added Firefox versions
122
+ self.firefox_versions = [
123
+ "Firefox/119.0",
124
+ "Firefox/118.0.2",
125
+ "Firefox/117.0.1",
126
+ "Firefox/116.0",
127
+ "Firefox/115.0.3",
128
+ "Firefox/114.0.2",
129
+ "Firefox/113.0.1",
130
+ "Firefox/112.0",
131
+ "Firefox/111.0.1",
132
+ "Firefox/110.0",
133
+ ]
134
+
135
+ def get_browser_stack(self, num_browsers: int = 1) -> List[str]:
136
+ """
137
+ Get a valid combination of browser versions.
138
+
139
+ How it works:
140
+ 1. Check if the number of browsers is supported.
141
+ 2. Randomly choose a combination of browsers.
142
+ 3. Iterate through the combination and add browser versions.
143
+ 4. Return the browser stack.
144
+
145
+ Args:
146
+ num_browsers: Number of browser specifications (1-3)
147
+
148
+ Returns:
149
+ List[str]: A list of browser versions.
150
+ """
151
+ if num_browsers not in self.browser_combinations:
152
+ raise ValueError(f"Unsupported number of browsers: {num_browsers}")
153
+
154
+ combination = random.choice(self.browser_combinations[num_browsers])
155
+ browser_stack = []
156
+
157
+ for browser in combination:
158
+ if browser == "chrome":
159
+ browser_stack.append(random.choice(self.chrome_versions))
160
+ elif browser == "firefox":
161
+ browser_stack.append(random.choice(self.firefox_versions))
162
+ elif browser == "safari":
163
+ browser_stack.append(random.choice(self.safari_versions))
164
+ elif browser == "edge":
165
+ browser_stack.append(random.choice(self.edge_versions))
166
+ elif browser == "gecko":
167
+ browser_stack.append(random.choice(self.rendering_engines["gecko"]))
168
+ elif browser == "webkit":
169
+ browser_stack.append(self.rendering_engines["chrome_webkit"])
170
+
171
+ return browser_stack
172
+
173
+ def generate(self,
174
+ device_type: Optional[Literal['desktop', 'mobile']] = None,
175
+ os_type: Optional[str] = None,
176
+ device_brand: Optional[str] = None,
177
+ browser_type: Optional[Literal['chrome', 'edge', 'safari', 'firefox']] = None,
178
+ num_browsers: int = 3) -> str:
179
+ """
180
+ Generate a random user agent with specified constraints.
181
+
182
+ Args:
183
+ device_type: 'desktop' or 'mobile'
184
+ os_type: 'windows', 'macos', 'linux', 'android', 'ios'
185
+ device_brand: Specific device brand
186
+ browser_type: 'chrome', 'edge', 'safari', or 'firefox'
187
+ num_browsers: Number of browser specifications (1-3)
188
+ """
189
+ # Get platform string
190
+ platform = self.get_random_platform(device_type, os_type, device_brand)
191
+
192
+ # Start with Mozilla
193
+ components = ["Mozilla/5.0", platform]
194
+
195
+ # Add browser stack
196
+ browser_stack = self.get_browser_stack(num_browsers)
197
+
198
+ # Add appropriate legacy token based on browser stack
199
+ if "Firefox" in str(browser_stack):
200
+ components.append(random.choice(self.rendering_engines["gecko"]))
201
+ elif "Chrome" in str(browser_stack) or "Safari" in str(browser_stack):
202
+ components.append(self.rendering_engines["chrome_webkit"])
203
+ components.append("(KHTML, like Gecko)")
204
+
205
+ # Add browser versions
206
+ components.extend(browser_stack)
207
+
208
+ return " ".join(components)
209
+
210
+ def generate_with_client_hints(self, **kwargs) -> Tuple[str, str]:
211
+ """Generate both user agent and matching client hints"""
212
+ user_agent = self.generate(**kwargs)
213
+ client_hints = self.generate_client_hints(user_agent)
214
+ return user_agent, client_hints
215
+
216
+ def get_random_platform(self, device_type, os_type, device_brand):
217
+ """Helper method to get random platform based on constraints"""
218
+ platforms = self.desktop_platforms if device_type == 'desktop' else \
219
+ self.mobile_platforms if device_type == 'mobile' else \
220
+ {**self.desktop_platforms, **self.mobile_platforms}
221
+
222
+ if os_type:
223
+ for platform_group in [self.desktop_platforms, self.mobile_platforms]:
224
+ if os_type in platform_group:
225
+ platforms = {os_type: platform_group[os_type]}
226
+ break
227
+
228
+ os_key = random.choice(list(platforms.keys()))
229
+ if device_brand and device_brand in platforms[os_key]:
230
+ return platforms[os_key][device_brand]
231
+ return random.choice(list(platforms[os_key].values()))
232
+
233
+ def parse_user_agent(self, user_agent: str) -> Dict[str, str]:
234
+ """Parse a user agent string to extract browser and version information"""
235
+ browsers = {
236
+ 'chrome': r'Chrome/(\d+)',
237
+ 'edge': r'Edg/(\d+)',
238
+ 'safari': r'Version/(\d+)',
239
+ 'firefox': r'Firefox/(\d+)'
240
+ }
241
+
242
+ result = {}
243
+ for browser, pattern in browsers.items():
244
+ match = re.search(pattern, user_agent)
245
+ if match:
246
+ result[browser] = match.group(1)
247
+
248
+ return result
249
+
250
+ def generate_client_hints(self, user_agent: str) -> str:
251
+ """Generate Sec-CH-UA header value based on user agent string"""
252
+ browsers = self.parse_user_agent(user_agent)
253
+
254
+ # Client hints components
255
+ hints = []
256
+
257
+ # Handle different browser combinations
258
+ if 'chrome' in browsers:
259
+ hints.append(f'"Chromium";v="{browsers["chrome"]}"')
260
+ hints.append('"Not_A Brand";v="8"')
261
+
262
+ if 'edge' in browsers:
263
+ hints.append(f'"Microsoft Edge";v="{browsers["edge"]}"')
264
+ else:
265
+ hints.append(f'"Google Chrome";v="{browsers["chrome"]}"')
266
+
267
+ elif 'firefox' in browsers:
268
+ # Firefox doesn't typically send Sec-CH-UA
269
+ return '""'
270
+
271
+ elif 'safari' in browsers:
272
+ # Safari's format for client hints
273
+ hints.append(f'"Safari";v="{browsers["safari"]}"')
274
+ hints.append('"Not_A Brand";v="8"')
275
+
276
+ return ', '.join(hints)
277
+
278
+ # Example usage:
279
+ if __name__ == "__main__":
280
+ generator = UserAgentGenerator()
281
+ print(generator.generate())
282
+
283
+ print("\nSingle browser (Chrome):")
284
+ print(generator.generate(num_browsers=1, browser_type='chrome'))
285
+
286
+ print("\nTwo browsers (Gecko/Firefox):")
287
+ print(generator.generate(num_browsers=2))
288
+
289
+ print("\nThree browsers (Chrome/Safari/Edge):")
290
+ print(generator.generate(num_browsers=3))
291
+
292
+ print("\nFirefox on Linux:")
293
+ print(generator.generate(
294
+ device_type='desktop',
295
+ os_type='linux',
296
+ browser_type='firefox',
297
+ num_browsers=2
298
+ ))
299
+
300
+ print("\nChrome/Safari/Edge on Windows:")
301
+ print(generator.generate(
302
+ device_type='desktop',
303
+ os_type='windows',
304
+ num_browsers=3
305
+ ))
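The __main__ block above only exercises generate(); generate_with_client_hints() additionally returns a matching Sec-CH-UA value, which is useful when assembling request headers. A small sketch follows; the header dictionary layout is an assumption for illustration, not something this module defines.

from crawl4ai.user_agent_generator import UserAgentGenerator

generator = UserAgentGenerator()
user_agent, client_hints = generator.generate_with_client_hints(
    device_type="desktop", os_type="windows", num_browsers=3
)
headers = {
    "User-Agent": user_agent,   # e.g. Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...
    "Sec-CH-UA": client_hints,  # matching client hints derived from the same string
}
print(headers)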