amaye15 committed

Commit 03c0888 · 1 Parent(s): 1214696

test

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- .env.txt +4 -0
- .gitattributes +12 -35
- .gitignore +232 -0
- CHANGELOG.md +1089 -0
- CODE_OF_CONDUCT.md +131 -0
- CONTRIBUTORS.md +42 -0
- Dockerfile +136 -0
- LICENSE +51 -0
- MANIFEST.in +2 -0
- MISSION.md +46 -0
- README.md +560 -1
- ROADMAP.md +503 -0
- crawl4ai/__init__.py +46 -0
- crawl4ai/__version__.py +2 -0
- crawl4ai/async_configs.py +603 -0
- crawl4ai/async_crawler_strategy.py +2191 -0
- crawl4ai/async_database.py +495 -0
- crawl4ai/async_logger.py +231 -0
- crawl4ai/async_webcrawler.py +833 -0
- crawl4ai/cache_context.py +115 -0
- crawl4ai/chunking_strategy.py +231 -0
- crawl4ai/cli.py +105 -0
- crawl4ai/config.py +64 -0
- crawl4ai/content_filter_strategy.py +627 -0
- crawl4ai/content_scraping_strategy.py +723 -0
- crawl4ai/crawler_strategy.py +360 -0
- crawl4ai/database.py +135 -0
- crawl4ai/docs_manager.py +67 -0
- crawl4ai/extraction_strategy.bak.py +1440 -0
- crawl4ai/extraction_strategy.py +1052 -0
- crawl4ai/html2text/__init__.py +1141 -0
- crawl4ai/html2text/__main__.py +3 -0
- crawl4ai/html2text/_typing.py +2 -0
- crawl4ai/html2text/cli.py +330 -0
- crawl4ai/html2text/config.py +172 -0
- crawl4ai/html2text/elements.py +18 -0
- crawl4ai/html2text/utils.py +303 -0
- crawl4ai/install.py +83 -0
- crawl4ai/js_snippet/__init__.py +15 -0
- crawl4ai/js_snippet/navigator_overrider.js +25 -0
- crawl4ai/js_snippet/remove_overlay_elements.js +119 -0
- crawl4ai/js_snippet/update_image_dimensions.js +54 -0
- crawl4ai/llmtxt.py +498 -0
- crawl4ai/markdown_generation_strategy.py +225 -0
- crawl4ai/migrations.py +168 -0
- crawl4ai/model_loader.py +256 -0
- crawl4ai/models.py +61 -0
- crawl4ai/prompts.py +204 -0
- crawl4ai/ssl_certificate.py +181 -0
- crawl4ai/user_agent_generator.py +305 -0
.env.txt
ADDED
@@ -0,0 +1,4 @@
GROQ_API_KEY = "YOUR_GROQ_API"
OPENAI_API_KEY = "YOUR_OPENAI_API"
ANTHROPIC_API_KEY = "YOUR_ANTHROPIC_API"
# You can add more API keys here
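The keys above are placeholders. A minimal sketch of loading them at runtime, assuming the `python-dotenv` package is available (the package choice and file handling are not part of this commit; most tooling expects the file to be renamed to `.env`):

```python
import os

from dotenv import load_dotenv  # requires the python-dotenv package

# Load key=value pairs from the template file into the process environment.
load_dotenv(".env.txt")

for name in ("GROQ_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY"):
    value = os.getenv(name)
    print(f"{name} is {'set' if value else 'missing'}")
```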
.gitattributes
CHANGED
@@ -1,35 +1,12 @@
-… (removed lines 1–12: Git LFS filter rules, truncated in this view)
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Documentation
+*.html linguist-documentation
+docs/* linguist-documentation
+docs/examples/* linguist-documentation
+docs/md_v2/* linguist-documentation
+
+# Explicitly mark Python as the main language
+*.py linguist-detectable=true
+*.py linguist-language=Python
+
+# Exclude HTML from language statistics
+*.html linguist-detectable=false
.gitignore
ADDED
@@ -0,0 +1,232 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

Crawl4AI.egg-info/
Crawl4AI.egg-info/*
crawler_data.db
.vscode/
.tests/
.test_pads/
test_pad.py
test_pad*.py
.data/
Crawl4AI.egg-info/

requirements0.txt
a.txt

*.sh
.idea
docs/examples/.chainlit/
docs/examples/.chainlit/*
.chainlit/config.toml
.chainlit/translations/en-US.json

local/
.files/

a.txt
.lambda_function.py
ec2*

update_changelog.sh

.DS_Store
docs/.DS_Store
tmp/
test_env/
**/.DS_Store
**/.DS_Store

todo.md
todo_executor.md
git_changes.py
git_changes.md
pypi_build.sh
git_issues.py
git_issues.md

.next/
.tests/
# .issues/
.docs/
.issues/
.gitboss/
todo_executor.md
protect-all-except-feature.sh
manage-collab.sh
publish.sh
combine.sh
combined_output.txt
.local
.scripts
tree.md
tree.md
.scripts
.local
.do
/plans
plans/

# Codeium
.codeiumignore
CHANGELOG.md
ADDED
@@ -0,0 +1,1089 @@
# Changelog

All notable changes to Crawl4AI will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

---

## [0.4.267] - 2025-01-06

### Added
- **Windows Event Loop Configuration**: Introduced a utility function `configure_windows_event_loop` to resolve `NotImplementedError` for asyncio subprocesses on Windows. ([#utils.py](crawl4ai/utils.py), [#tutorials/async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md))
- **`page_need_scroll` Method**: Added a method to determine if a page requires scrolling before taking actions in `AsyncPlaywrightCrawlerStrategy`. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py))

### Changed
- **Version Bump**: Updated the version from `0.4.246` to `0.4.247`. ([#__version__.py](crawl4ai/__version__.py))
- **Improved Scrolling Logic**: Enhanced scrolling methods in `AsyncPlaywrightCrawlerStrategy` by adding a `scroll_delay` parameter for better control. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py))
- **Markdown Generation Example**: Updated the `hello_world.py` example to reflect the latest API changes and better illustrate features. ([#examples/hello_world.py](docs/examples/hello_world.py))
- **Documentation Update**:
  - Added Windows-specific instructions for handling asyncio event loops. ([#async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md))

### Removed
- **Legacy Markdown Generation Code**: Removed outdated and unused code for markdown generation in `content_scraping_strategy.py`. ([#content_scraping_strategy.py](crawl4ai/content_scraping_strategy.py))

### Fixed
- **Page Closing to Prevent Memory Leaks**:
  - **Description**: Added a `finally` block to ensure pages are closed when no `session_id` is provided.
  - **Impact**: Prevents memory leaks caused by lingering pages after a crawl.
  - **File**: [`async_crawler_strategy.py`](crawl4ai/async_crawler_strategy.py)
  - **Code**:
    ```python
    finally:
        # If no session_id is given we should close the page
        if not config.session_id:
            await page.close()
    ```
- **Multiple Element Selection**: Modified `_get_elements` in `JsonCssExtractionStrategy` to return all matching elements instead of just the first one, ensuring comprehensive extraction. ([#extraction_strategy.py](crawl4ai/extraction_strategy.py))
- **Error Handling in Scrolling**: Added robust error handling to ensure scrolling proceeds safely even if a configuration is missing. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py))

### Other
- **Git Ignore Update**: Added `/plans` to `.gitignore` for better development environment consistency. ([#.gitignore](.gitignore))
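A minimal sketch of how the Windows event-loop helper described above might be wired in; it is illustrative and not part of this commit, and assumes the function is importable from `crawl4ai.utils` as the entry indicates, with an arbitrary target URL:

```python
import asyncio
import sys

from crawl4ai import AsyncWebCrawler
from crawl4ai.utils import configure_windows_event_loop  # location per the entry above

if sys.platform == "win32":
    # Select an event loop policy that supports asyncio subprocesses on Windows.
    configure_windows_event_loop()

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown[:300])

asyncio.run(main())
```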
## [0.4.24] - 2024-12-31

### Added
- **Browser and SSL Handling**
  - SSL certificate validation options in extraction strategies
  - Custom certificate paths support
  - Configurable certificate validation skipping
  - Enhanced response status code handling with retry logic

- **Content Processing**
  - New content filtering system with regex support
  - Advanced chunking strategies for large content
  - Memory-efficient parallel processing
  - Configurable chunk size optimization

- **JSON Extraction**
  - Complex JSONPath expression support
  - JSON-CSS and Microdata extraction
  - RDFa parsing capabilities
  - Advanced data transformation pipeline

- **Field Types**
  - New field types: `computed`, `conditional`, `aggregate`, `template`
  - Field inheritance system
  - Reusable field definitions
  - Custom validation rules

### Changed
- **Performance**
  - Optimized selector compilation with caching
  - Improved HTML parsing efficiency
  - Enhanced memory management for large documents
  - Batch processing optimizations

- **Error Handling**
  - More detailed error messages and categorization
  - Enhanced debugging capabilities
  - Improved performance metrics tracking
  - Better error recovery mechanisms

### Deprecated
- Old field computation method using `eval`
- Direct browser manipulation without proper SSL handling
- Simple text-based content filtering

### Removed
- Legacy extraction patterns without proper error handling
- Unsafe eval-based field computation
- Direct DOM manipulation without sanitization

### Fixed
- Memory leaks in large document processing
- SSL certificate validation issues
- Incorrect handling of nested JSON structures
- Performance bottlenecks in parallel processing

### Security
- Improved input validation and sanitization
- Safe expression evaluation system
- Enhanced resource protection
- Rate limiting implementation
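A minimal sketch of the JSON-CSS extraction mentioned above, using the kwargs-style `arun` call that appears elsewhere in this changelog; the schema, selectors, and URL are illustrative assumptions, not taken from this commit:

```python
import asyncio
import json

from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

# Hypothetical schema: every matching <article> element yields one record.
schema = {
    "name": "articles",
    "baseSelector": "article",
    "fields": [
        {"name": "title", "selector": "h2", "type": "text"},
        {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"},
    ],
}

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com/blog",
            extraction_strategy=JsonCssExtractionStrategy(schema),
        )
        print(json.loads(result.extracted_content))

asyncio.run(main())
```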
## [0.4.1] - 2024-12-08

### **File: `crawl4ai/async_crawler_strategy.py`**

#### **New Parameters and Attributes Added**
- **`text_mode` (boolean)**: Enables text-only mode, disables images, JavaScript, and GPU-related features for faster, minimal rendering.
- **`light_mode` (boolean)**: Optimizes the browser by disabling unnecessary background processes and features for efficiency.
- **`viewport_width` and `viewport_height`**: Dynamically adjusted based on `text_mode` (default values: 800x600 for `text_mode`, 1920x1080 otherwise).
- **`extra_args`**: Adds browser-specific flags for `text_mode`.
- **`adjust_viewport_to_content`**: Dynamically adjusts the viewport to the content size for accurate rendering.

#### **Browser Context Adjustments**
- Added **`viewport` adjustments**: Dynamically computed based on `text_mode` or custom configuration.
- Enhanced support for `light_mode` and `text_mode` by adding specific browser arguments to reduce resource consumption.

#### **Dynamic Content Handling**
- **Full Page Scan Feature**:
  - Scrolls through the entire page while dynamically detecting content changes.
  - Ensures scrolling stops when no new dynamic content is loaded.

#### **Session Management**
- Added **`create_session`** method:
  - Creates a new browser session and assigns a unique ID.
  - Supports persistent and non-persistent contexts with full compatibility for cookies, headers, and proxies.

#### **Improved Content Loading and Adjustment**
- **`adjust_viewport_to_content`**:
  - Automatically adjusts viewport to match content dimensions.
  - Includes scaling via Chrome DevTools Protocol (CDP).
- Enhanced content loading:
  - Waits for images to load and ensures network activity is idle before proceeding.

#### **Error Handling and Logging**
- Improved error handling and detailed logging for:
  - Viewport adjustment (`adjust_viewport_to_content`).
  - Full page scanning (`scan_full_page`).
  - Dynamic content loading.

#### **Refactoring and Cleanup**
- Removed hardcoded viewport dimensions in multiple places, replaced with dynamic values (`self.viewport_width`, `self.viewport_height`).
- Removed commented-out and unused code for better readability.
- Added default value for `delay_before_return_html` parameter.

#### **Optimizations**
- Reduced resource usage in `light_mode` by disabling unnecessary browser features such as extensions, background timers, and sync.
- Improved compatibility for different browser types (`chrome`, `firefox`, `webkit`).

---

### **File: `docs/examples/quickstart_async.py`**

#### **Schema Adjustment**
- Changed schema reference for `LLMExtractionStrategy`:
  - **Old**: `OpenAIModelFee.schema()`
  - **New**: `OpenAIModelFee.model_json_schema()`
  - This likely ensures better compatibility with the `OpenAIModelFee` class and its JSON schema.

#### **Documentation Comments Updated**
- Improved extraction instruction for schema-based LLM strategies.

---

### **New Features Added**
1. **Text-Only Mode**:
   - Focuses on minimal resource usage by disabling non-essential browser features.
2. **Light Mode**:
   - Optimizes browser for performance by disabling background tasks and unnecessary services.
3. **Full Page Scanning**:
   - Ensures the entire content of a page is crawled, including dynamic elements loaded during scrolling.
4. **Dynamic Viewport Adjustment**:
   - Automatically resizes the viewport to match content dimensions, improving compatibility and rendering accuracy.
5. **Session Management**:
   - Simplifies session handling with better support for persistent and non-persistent contexts.

---

### **Bug Fixes**
- Fixed potential viewport mismatches by ensuring consistent use of `self.viewport_width` and `self.viewport_height` throughout the code.
- Improved robustness of dynamic content loading to avoid timeouts and failed evaluations.
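The schema-reference change above reflects Pydantic v2, where `.model_json_schema()` replaces the deprecated `.schema()`. A small sketch with an illustrative model (the field names are assumptions, not taken from `quickstart_async.py`):

```python
from pydantic import BaseModel

class OpenAIModelFee(BaseModel):
    model_name: str
    input_fee: str
    output_fee: str

# Old (Pydantic v1 style, deprecated): OpenAIModelFee.schema()
# New (Pydantic v2): model_json_schema() returns the JSON Schema as a dictionary.
schema = OpenAIModelFee.model_json_schema()
print(sorted(schema["properties"]))
```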
## [0.3.75] December 1, 2024

### PruningContentFilter

#### 1. Introduced PruningContentFilter (Dec 01, 2024)
A new content filtering strategy that removes less relevant nodes based on metrics like text and link density.

**Affected Files:**
- `crawl4ai/content_filter_strategy.py`: Enhancement of content filtering capabilities.
  ```diff
  Implemented effective pruning algorithm with comprehensive scoring.
  ```
- `README.md`: Improved documentation regarding new features.
  ```diff
  Updated to include usage and explanation for the PruningContentFilter.
  ```
- `docs/md_v2/basic/content_filtering.md`: Expanded documentation for users.
  ```diff
  Added detailed section explaining the PruningContentFilter.
  ```

#### 2. Added Unit Tests for PruningContentFilter (Dec 01, 2024)
Comprehensive tests added to ensure correct functionality of PruningContentFilter.

**Affected Files:**
- `tests/async/test_content_filter_prune.py`: Increased test coverage for content filtering strategies.
  ```diff
  Created test cases for various scenarios using the PruningContentFilter.
  ```

### Development Updates

#### 3. Enhanced BM25ContentFilter tests (Dec 01, 2024)
Extended testing to cover additional edge cases and performance metrics.

**Affected Files:**
- `tests/async/test_content_filter_bm25.py`: Improved reliability and performance assurance.
  ```diff
  Added tests for new extraction scenarios including malformed HTML.
  ```

### Infrastructure & Documentation

#### 4. Updated Examples (Dec 01, 2024)
Altered examples in documentation to promote the use of PruningContentFilter alongside existing strategies.

**Affected Files:**
- `docs/examples/quickstart_async.py`: Enhanced usability and clarity for new users.
  - Revised example to illustrate usage of PruningContentFilter.
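A minimal usage sketch for the filter introduced above, wired through `DefaultMarkdownGenerator`; the kwargs-style call, the threshold value, and the URL are assumptions based on the rest of this changelog, not on this commit:

```python
import asyncio

from crawl4ai import AsyncWebCrawler
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

async def main():
    # Threshold is illustrative; tune it against your own pages.
    md_generator = DefaultMarkdownGenerator(
        content_filter=PruningContentFilter(threshold=0.5)
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Web_scraping",
            markdown_generator=md_generator,
        )
        print(result.fit_markdown[:500])

asyncio.run(main())
```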
## [0.3.746] November 29, 2024

### Major Features
1. Enhanced Docker Support (Nov 29, 2024)
   - Improved GPU support in Docker images.
   - Dockerfile refactored for better platform-specific installations.
   - Introduced new Docker commands for different platforms:
     - `basic-amd64`, `all-amd64`, `gpu-amd64` for AMD64.
     - `basic-arm64`, `all-arm64`, `gpu-arm64` for ARM64.

### Infrastructure & Documentation
- Enhanced README.md to improve user guidance and installation instructions.
- Added installation instructions for Playwright setup in README.
- Created and updated examples in `docs/examples/quickstart_async.py` to be more useful and user-friendly.
- Updated `requirements.txt` with a new `pydantic` dependency.
- Bumped version number in `crawl4ai/__version__.py` to 0.3.746.

### Breaking Changes
- Streamlined application structure:
  - Removed static pages and related code from `main.py`, which might affect existing deployments relying on static content.

### Development Updates
- Developed `post_install` method in `crawl4ai/install.py` to streamline post-installation setup tasks.
- Refined migration processes in `crawl4ai/migrations.py` with enhanced logging for better error visibility.
- Updated `docker-compose.yml` to support local and hub services for different architectures, enhancing build and deploy capabilities.
- Refactored example test cases in `docs/examples/docker_example.py` to facilitate comprehensive testing.

### README.md
Updated README with new docker commands and setup instructions.
Enhanced installation instructions and guidance.

### crawl4ai/install.py
Added post-install script functionality.
Introduced `post_install` method for automation of post-installation tasks.

### crawl4ai/migrations.py
Improved migration logging.
Refined migration processes and added better logging.

### docker-compose.yml
Refactored docker-compose for better service management.
Updated to define services for different platforms and versions.

### requirements.txt
Updated dependencies.
Added `pydantic` to requirements file.

### crawler/__version__.py
Updated version number.
Bumped version number to 0.3.746.

### docs/examples/quickstart_async.py
Enhanced example scripts.
Uncommented example usage in async guide for user functionality.

### main.py
Refactored code to improve maintainability.
Streamlined app structure by removing static pages code.
## [0.3.743] November 27, 2024

Enhance features and documentation
- Updated version to 0.3.743
- Improved ManagedBrowser configuration with dynamic host/port
- Implemented fast HTML formatting in web crawler
- Enhanced markdown generation with a new generator class
- Improved sanitization and utility functions
- Added contributor details and pull request acknowledgments
- Updated documentation for clearer usage scenarios
- Adjusted tests to reflect class name changes

### CONTRIBUTORS.md
Added new contributors and pull request details.
Updated community contributions and acknowledged pull requests.

### crawl4ai/__version__.py
Version update.
Bumped version to 0.3.743.

### crawl4ai/async_crawler_strategy.py
Improved ManagedBrowser configuration.
Enhanced browser initialization with configurable host and debugging port; improved hook execution.

### crawl4ai/async_webcrawler.py
Optimized HTML processing.
Implemented 'fast_format_html' for optimized HTML formatting; applied it when 'prettiify' is enabled.

### crawl4ai/content_scraping_strategy.py
Enhanced markdown generation strategy.
Updated to use DefaultMarkdownGenerator and improved markdown generation with filters option.

### crawl4ai/markdown_generation_strategy.py
Refactored markdown generation class.
Renamed DefaultMarkdownGenerationStrategy to DefaultMarkdownGenerator; added content filter handling.

### crawl4ai/utils.py
Enhanced utility functions.
Improved input sanitization and enhanced HTML formatting method.

### docs/md_v2/advanced/hooks-auth.md
Improved documentation for hooks.
Updated code examples to include cookies in crawler strategy initialization.

### tests/async/test_markdown_genertor.py
Refactored tests to match class renaming.
Updated tests to use renamed DefaultMarkdownGenerator class.
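The class rename noted above only changes the import name; a short migration sketch (illustrative, not part of this commit):

```python
# Before 0.3.743:
# from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerationStrategy

# From 0.3.743 onward:
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

generator = DefaultMarkdownGenerator()  # optionally pass content_filter=...
```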
## [0.3.74] November 17, 2024
|
351 |
+
|
352 |
+
This changelog details the updates and changes introduced in Crawl4AI version 0.3.74. It's designed to inform developers about new features, modifications to existing components, removals, and other important information.
|
353 |
+
|
354 |
+
### 1. File Download Processing
|
355 |
+
|
356 |
+
- Users can now specify download folders using the `downloads_path` parameter in the `AsyncWebCrawler` constructor or the `arun` method. If not specified, downloads are saved to a "downloads" folder within the `.crawl4ai` directory.
|
357 |
+
- File download tracking is integrated into the `CrawlResult` object. Successfully downloaded files are listed in the `downloaded_files` attribute, providing their paths.
|
358 |
+
- Added `accept_downloads` parameter to the crawler strategies (defaults to `False`). If set to True you can add JS code and `wait_for` parameter for file download.
|
359 |
+
|
360 |
+
**Example:**
|
361 |
+
|
362 |
+
```python
|
363 |
+
import asyncio
|
364 |
+
import os
|
365 |
+
from pathlib import Path
|
366 |
+
from crawl4ai import AsyncWebCrawler
|
367 |
+
|
368 |
+
async def download_example():
|
369 |
+
downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
|
370 |
+
os.makedirs(downloads_path, exist_ok=True)
|
371 |
+
|
372 |
+
async with AsyncWebCrawler(
|
373 |
+
accept_downloads=True,
|
374 |
+
downloads_path=downloads_path,
|
375 |
+
verbose=True
|
376 |
+
) as crawler:
|
377 |
+
result = await crawler.arun(
|
378 |
+
url="https://www.python.org/downloads/",
|
379 |
+
js_code="""
|
380 |
+
const downloadLink = document.querySelector('a[href$=".exe"]');
|
381 |
+
if (downloadLink) { downloadLink.click(); }
|
382 |
+
""",
|
383 |
+
wait_for=5 # To ensure download has started
|
384 |
+
)
|
385 |
+
|
386 |
+
if result.downloaded_files:
|
387 |
+
print("Downloaded files:")
|
388 |
+
for file in result.downloaded_files:
|
389 |
+
print(f"- {file}")
|
390 |
+
|
391 |
+
asyncio.run(download_example())
|
392 |
+
|
393 |
+
```
|
394 |
+
|
395 |
+
### 2. Refined Content Filtering
|
396 |
+
|
397 |
+
- Introduced the `RelevanceContentFilter` strategy (and its implementation `BM25ContentFilter`) for extracting relevant content from web pages, replacing Fit Markdown and other content cleaning strategy. This new strategy leverages the BM25 algorithm to identify chunks of text relevant to the page's title, description, keywords, or a user-provided query.
|
398 |
+
- The `fit_markdown` flag in the content scraper is used to filter content based on title, meta description, and keywords.
|
399 |
+
|
400 |
+
**Example:**
|
401 |
+
|
402 |
+
```python
|
403 |
+
from crawl4ai import AsyncWebCrawler
|
404 |
+
from crawl4ai.content_filter_strategy import BM25ContentFilter
|
405 |
+
|
406 |
+
async def filter_content(url, query):
|
407 |
+
async with AsyncWebCrawler() as crawler:
|
408 |
+
content_filter = BM25ContentFilter(user_query=query)
|
409 |
+
result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True)
|
410 |
+
print(result.extracted_content) # Or result.fit_markdown for the markdown version
|
411 |
+
print(result.fit_html) # Or result.fit_html to show HTML with only the filtered content
|
412 |
+
|
413 |
+
asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health"))
|
414 |
+
```
|
415 |
+
|
416 |
+
### 3. Raw HTML and Local File Support
|
417 |
+
|
418 |
+
- Added support for crawling local files and raw HTML content directly.
|
419 |
+
- Use the `file://` prefix for local file paths.
|
420 |
+
- Use the `raw:` prefix for raw HTML strings.
|
421 |
+
|
422 |
+
**Example:**
|
423 |
+
|
424 |
+
```python
|
425 |
+
async def crawl_local_or_raw(crawler, content, content_type):
|
426 |
+
prefix = "file://" if content_type == "local" else "raw:"
|
427 |
+
url = f"{prefix}{content}"
|
428 |
+
result = await crawler.arun(url=url)
|
429 |
+
if result.success:
|
430 |
+
print(f"Markdown Content from {content_type.title()} Source:")
|
431 |
+
print(result.markdown)
|
432 |
+
|
433 |
+
# Example usage with local file and raw HTML
|
434 |
+
async def main():
|
435 |
+
async with AsyncWebCrawler() as crawler:
|
436 |
+
# Local File
|
437 |
+
await crawl_local_or_raw(
|
438 |
+
crawler, os.path.abspath('tests/async/sample_wikipedia.html'), "local"
|
439 |
+
)
|
440 |
+
# Raw HTML
|
441 |
+
await crawl_raw_html(crawler, "<h1>Raw Test</h1><p>This is raw HTML.</p>")
|
442 |
+
|
443 |
+
|
444 |
+
asyncio.run(main())
|
445 |
+
```
|
446 |
+
|
447 |
+
### 4. Browser Management
|
448 |
+
|
449 |
+
- New asynchronous crawler strategy implemented using Playwright.
|
450 |
+
- `ManagedBrowser` class introduced for improved browser session handling, offering features like persistent browser sessions between requests (using `session_id` parameter) and browser process monitoring.
|
451 |
+
- Updated to tf-playwright-stealth for enhanced stealth capabilities.
|
452 |
+
- Added `use_managed_browser`, `use_persistent_context`, and `chrome_channel` parameters to AsyncPlaywrightCrawlerStrategy.
|
453 |
+
|
454 |
+
|
455 |
+
**Example:**
|
456 |
+
```python
|
457 |
+
async def browser_management_demo():
|
458 |
+
user_data_dir = os.path.join(Path.home(), ".crawl4ai", "user-data-dir")
|
459 |
+
os.makedirs(user_data_dir, exist_ok=True) # Ensure directory exists
|
460 |
+
async with AsyncWebCrawler(
|
461 |
+
use_managed_browser=True,
|
462 |
+
user_data_dir=user_data_dir,
|
463 |
+
use_persistent_context=True,
|
464 |
+
verbose=True
|
465 |
+
) as crawler:
|
466 |
+
result1 = await crawler.arun(
|
467 |
+
url="https://example.com", session_id="my_session"
|
468 |
+
)
|
469 |
+
result2 = await crawler.arun(
|
470 |
+
url="https://example.com/anotherpage", session_id="my_session"
|
471 |
+
)
|
472 |
+
|
473 |
+
asyncio.run(browser_management_demo())
|
474 |
+
```
|
475 |
+
|
476 |
+
|
477 |
+
### 5. API Server & Cache Improvements
|
478 |
+
|
479 |
+
- Added CORS support to API server.
|
480 |
+
- Implemented static file serving.
|
481 |
+
- Enhanced root redirect functionality.
|
482 |
+
- Cache database updated to store response headers and downloaded files information. It utilizes a file system approach to manage large content efficiently.
|
483 |
+
- New, more efficient caching database built using xxhash and file system approach.
|
484 |
+
- Introduced `CacheMode` enum (`ENABLED`, `DISABLED`, `READ_ONLY`, `WRITE_ONLY`, `BYPASS`) and `always_bypass_cache` parameter in AsyncWebCrawler for fine-grained cache control. This replaces `bypass_cache`, `no_cache_read`, `no_cache_write`, and `always_by_pass_cache`.
|
485 |
+
|
486 |
+
|
487 |
+
### 🗑️ Removals
|
488 |
+
|
489 |
+
- Removed deprecated: `crawl4ai/content_cleaning_strategy.py`.
|
490 |
+
- Removed internal class ContentCleaningStrategy
|
491 |
+
- Removed legacy cache control flags: `bypass_cache`, `disable_cache`, `no_cache_read`, `no_cache_write`, and `always_by_pass_cache`. These have been superseded by `cache_mode`.
|
492 |
+
|
493 |
+
|
494 |
+
### ⚙️ Other Changes
|
495 |
+
|
496 |
+
- Moved version file to `crawl4ai/__version__.py`.
|
497 |
+
- Added `crawl4ai/cache_context.py`.
|
498 |
+
- Added `crawl4ai/version_manager.py`.
|
499 |
+
- Added `crawl4ai/migrations.py`.
|
500 |
+
- Added `crawl4ai-migrate` entry point.
|
501 |
+
- Added config `NEED_MIGRATION` and `SHOW_DEPRECATION_WARNINGS`.
|
502 |
+
- API server now requires an API token for authentication, configurable with the `CRAWL4AI_API_TOKEN` environment variable. This enhances API security.
|
503 |
+
- Added synchronous crawl endpoint `/crawl_sync` for immediate result retrieval, and direct crawl endpoint `/crawl_direct` bypassing the task queue.
|
504 |
+
|
505 |
+
|
506 |
+
### ⚠️ Deprecation Notices
|
507 |
+
|
508 |
+
- The synchronous version of `WebCrawler` is being phased out. While still available via `crawl4ai[sync]`, it will eventually be removed. Transition to `AsyncWebCrawler` is strongly recommended. Boolean cache control flags in `arun` are also deprecated, migrate to using the `cache_mode` parameter. See examples in the "New Features" section above for correct usage.
|
509 |
+
|
510 |
+
|
511 |
+
### 🐛 Bug Fixes
|
512 |
+
|
513 |
+
- Resolved issue with browser context closing unexpectedly in Docker. This significantly improves stability, particularly within containerized environments.
|
514 |
+
- Fixed memory leaks associated with incorrect asynchronous cleanup by removing the `__del__` method and ensuring the browser context is closed explicitly using context managers.
|
515 |
+
- Improved error handling in `WebScrapingStrategy`. More detailed error messages and suggestions for debugging will minimize frustration when running into unexpected issues.
|
516 |
+
- Fixed issue with incorrect text parsing in specific HTML structures.
|
517 |
+
|
518 |
+
|
519 |
+
### Example of migrating to the new CacheMode:
|
520 |
+
|
521 |
+
**Old way:**
|
522 |
+
|
523 |
+
```python
|
524 |
+
crawler = AsyncWebCrawler(always_by_pass_cache=True)
|
525 |
+
result = await crawler.arun(url="https://example.com", bypass_cache=True)
|
526 |
+
```
|
527 |
+
|
528 |
+
**New way:**
|
529 |
+
|
530 |
+
```python
|
531 |
+
from crawl4ai import CacheMode
|
532 |
+
|
533 |
+
crawler = AsyncWebCrawler(always_bypass_cache=True)
|
534 |
+
result = await crawler.arun(url="https://example.com", cache_mode=CacheMode.BYPASS)
|
535 |
+
```
|
536 |
+
|
537 |
+
|
538 |
+
## [0.3.74] - November 13, 2024
|
539 |
+
|
540 |
+
1. **File Download Processing** (Nov 14, 2024)
|
541 |
+
- Added capability for users to specify download folders
|
542 |
+
- Implemented file download tracking in the crawl result object
|
543 |
+
- Created new file: `tests/async/test_async_doanloader.py`
|
544 |
+
|
545 |
+
2. **Content Filtering Improvements** (Nov 14, 2024)
|
546 |
+
- Introduced Relevance Content Filter as an improvement over Fit Markdown
|
547 |
+
- Implemented BM25 algorithm for content relevance matching
|
548 |
+
- Added new file: `crawl4ai/content_filter_strategy.py`
|
549 |
+
- Removed deprecated: `crawl4ai/content_cleaning_strategy.py`
|
550 |
+
|
551 |
+
3. **Local File and Raw HTML Support** (Nov 13, 2024)
|
552 |
+
- Added support for processing local files
|
553 |
+
- Implemented raw HTML input handling in AsyncWebCrawler
|
554 |
+
- Enhanced `crawl4ai/async_webcrawler.py` with significant performance improvements
|
555 |
+
|
556 |
+
4. **Browser Management Enhancements** (Nov 12, 2024)
|
557 |
+
- Implemented new async crawler strategy using Playwright
|
558 |
+
- Introduced ManagedBrowser for better browser session handling
|
559 |
+
- Added support for persistent browser sessions
|
560 |
+
- Updated from playwright_stealth to tf-playwright-stealth
|
561 |
+
|
562 |
+
5. **API Server Component**
|
563 |
+
- Added CORS support
|
564 |
+
- Implemented static file serving
|
565 |
+
- Enhanced root redirect functionality
|
566 |
+
|
567 |
+
|
568 |
+
|
569 |
+
## [0.3.731] - November 13, 2024
|
570 |
+
|
571 |
+
### Added
|
572 |
+
- Support for raw HTML and local file crawling via URL prefixes ('raw:', 'file://')
|
573 |
+
- Browser process monitoring for managed browser instances
|
574 |
+
- Screenshot capability for raw HTML and local file content
|
575 |
+
- Response headers storage in cache database
|
576 |
+
- New `fit_markdown` flag for optional markdown generation
|
577 |
+
|
578 |
+
### Changed
|
579 |
+
- Switched HTML parser from 'html.parser' to 'lxml' for ~4x performance improvement
|
580 |
+
- Optimized BeautifulSoup text conversion and element selection
|
581 |
+
- Pre-compiled regular expressions for better performance
|
582 |
+
- Improved metadata extraction efficiency
|
583 |
+
- Response headers now stored alongside HTML in cache
|
584 |
+
|
585 |
+
### Removed
|
586 |
+
- `__del__` method from AsyncPlaywrightCrawlerStrategy to prevent async cleanup issues
|
587 |
+
|
588 |
+
### Fixed
|
589 |
+
- Issue #256: Added support for crawling raw HTML content
|
590 |
+
- Issue #253: Implemented file:// protocol handling
|
591 |
+
- Missing response headers in cached results
|
592 |
+
- Memory leaks from improper async cleanup
|
593 |
+
|
594 |
+
## [v0.3.731] - 2024-11-13 Changelog for Issue 256 Fix
|
595 |
+
- Fixed: Browser context unexpectedly closing in Docker environment during crawl operations.
|
596 |
+
- Removed: __del__ method from AsyncPlaywrightCrawlerStrategy to prevent unreliable asynchronous cleanup, ensuring the browser context is closed explicitly within context managers.
|
597 |
+
- Added: Monitoring for ManagedBrowser subprocess to detect and log unexpected terminations.
|
598 |
+
- Updated: Dockerfile configurations to expose debugging port (9222) and allocate additional shared memory for improved browser stability.
|
599 |
+
- Improved: Error handling and resource cleanup processes for browser lifecycle management within the Docker environment.
|
600 |
+
|
601 |
+
## [v0.3.73] - 2024-11-05
|
602 |
+
|
603 |
+
### Major Features
|
604 |
+
- **New Doctor Feature**
|
605 |
+
- Added comprehensive system diagnostics tool
|
606 |
+
- Available through package hub and CLI
|
607 |
+
- Provides automated troubleshooting and system health checks
|
608 |
+
- Includes detailed reporting of configuration issues
|
609 |
+
|
610 |
+
- **Dockerized API Server**
|
611 |
+
- Released complete Docker implementation for API server
|
612 |
+
- Added comprehensive documentation for Docker deployment
|
613 |
+
- Implemented container communication protocols
|
614 |
+
- Added environment configuration guides
|
615 |
+
|
616 |
+
- **Managed Browser Integration**
|
617 |
+
- Added support for user-controlled browser instances
|
618 |
+
- Implemented `ManagedBrowser` class for better browser lifecycle management
|
619 |
+
- Added ability to connect to existing Chrome DevTools Protocol (CDP) endpoints
|
620 |
+
- Introduced user data directory support for persistent browser profiles
|
621 |
+
|
622 |
+
- **Enhanced HTML Processing**
|
623 |
+
- Added HTML tag preservation feature during markdown conversion
|
624 |
+
- Introduced configurable tag preservation system
|
625 |
+
- Improved pre-tag and code block handling
|
626 |
+
- Added support for nested preserved tags with attribute retention
|
627 |
+
|
628 |
+
### Improvements
|
629 |
+
- **Browser Handling**
|
630 |
+
- Added flag to ignore body visibility for problematic pages
|
631 |
+
- Improved browser process cleanup and management
|
632 |
+
- Enhanced temporary directory handling for browser profiles
|
633 |
+
- Added configurable browser launch arguments
|
634 |
+
|
635 |
+
- **Database Management**
|
636 |
+
- Implemented connection pooling for better performance
|
637 |
+
- Added retry logic for database operations
|
638 |
+
- Improved error handling and logging
|
639 |
+
- Enhanced cleanup procedures for database connections
|
640 |
+
|
641 |
+
- **Resource Management**
|
642 |
+
- Added memory and CPU monitoring
|
643 |
+
- Implemented dynamic task slot allocation based on system resources
|
644 |
+
- Added configurable cleanup intervals
|
645 |
+
|
646 |
+
### Technical Improvements
|
647 |
+
- **Code Structure**
|
648 |
+
- Moved version management to dedicated _version.py file
|
649 |
+
- Improved error handling throughout the codebase
|
650 |
+
- Enhanced logging system with better error reporting
|
651 |
+
- Reorganized core components for better maintainability
|
652 |
+
|
653 |
+
### Bug Fixes
|
654 |
+
- Fixed issues with browser process termination
|
655 |
+
- Improved handling of connection timeouts
|
656 |
+
- Enhanced error recovery in database operations
|
657 |
+
- Fixed memory leaks in long-running processes
|
658 |
+
|
659 |
+
### Dependencies
|
660 |
+
- Updated Playwright to v1.47
|
661 |
+
- Updated core dependencies with more flexible version constraints
|
662 |
+
- Added new development dependencies for testing
|
663 |
+
|
664 |
+
### Breaking Changes
|
665 |
+
- Changed default browser handling behavior
|
666 |
+
- Modified database connection management approach
|
667 |
+
- Updated API response structure for better consistency
|
668 |
+
|
669 |
+
### Migration Guide
|
670 |
+
When upgrading to v0.3.73, be aware of the following changes:
|
671 |
+
|
672 |
+
1. Docker Deployment:
|
673 |
+
- Review Docker documentation for new deployment options
|
674 |
+
- Update environment configurations as needed
|
675 |
+
- Check container communication settings
|
676 |
+
|
677 |
+
2. If using custom browser management:
|
678 |
+
- Update browser initialization code to use new ManagedBrowser class
|
679 |
+
- Review browser cleanup procedures
|
680 |
+
|
681 |
+
3. For database operations:
|
682 |
+
- Check custom database queries for compatibility with new connection pooling
|
683 |
+
- Update error handling to work with new retry logic
|
684 |
+
|
685 |
+
4. Using the Doctor:
|
686 |
+
- Run doctor command for system diagnostics: `crawl4ai doctor`
|
687 |
+
- Review generated reports for potential issues
|
688 |
+
- Follow recommended fixes for any identified problems
|
689 |
+
|
690 |
+
|
691 |
+
## [v0.3.73] - 2024-11-04
|
692 |
+
This commit introduces several key enhancements, including improved error handling and robust database operations in `async_database.py`, which now features a connection pool and retry logic for better reliability. Updates to the README.md provide clearer instructions and a better user experience with links to documentation sections. The `.gitignore` file has been refined to include additional directories, while the async web crawler now utilizes a managed browser for more efficient crawling. Furthermore, multiple dependency updates and introduction of the `CustomHTML2Text` class enhance text extraction capabilities.
|
693 |
+
|
694 |
+
## [v0.3.73] - 2024-10-24
|
695 |
+
|
696 |
+
### Added
|
697 |
+
- preserve_tags: Added support for preserving specific HTML tags during markdown conversion.
|
698 |
+
- Smart overlay removal system in AsyncPlaywrightCrawlerStrategy:
|
699 |
+
- Automatic removal of popups, modals, and cookie notices
|
700 |
+
- Detection and removal of fixed/sticky position elements
|
701 |
+
- Cleaning of empty block elements
|
702 |
+
- Configurable via `remove_overlay_elements` parameter
|
703 |
+
- Enhanced screenshot capabilities:
|
704 |
+
- Added `screenshot_wait_for` parameter to control timing
|
705 |
+
- Improved screenshot handling with existing page context
|
706 |
+
- Better error handling with fallback error images
|
707 |
+
- New URL normalization utilities:
|
708 |
+
- `normalize_url` function for consistent URL formatting
|
709 |
+
- `is_external_url` function for better link classification
|
710 |
+
- Custom base directory support for cache storage:
|
711 |
+
- New `base_directory` parameter in AsyncWebCrawler
|
712 |
+
- Allows specifying alternative locations for `.crawl4ai` folder
|
713 |
+
|
714 |
+
### Enhanced
|
715 |
+
- Link handling improvements:
|
716 |
+
- Better duplicate link detection
|
717 |
+
- Enhanced internal/external link classification
|
718 |
+
- Improved handling of special URL protocols
|
719 |
+
- Support for anchor links and protocol-relative URLs
|
720 |
+
- Configuration refinements:
|
721 |
+
- Streamlined social media domain list
|
722 |
+
- More focused external content filtering
|
723 |
+
- LLM extraction strategy:
|
724 |
+
- Added support for separate API base URL via `api_base` parameter
|
725 |
+
- Better handling of base URLs in configuration
|
726 |
+
|
727 |
+
### Fixed
|
728 |
+
- Screenshot functionality:
|
729 |
+
- Resolved issues with screenshot timing and context
|
730 |
+
- Improved error handling and recovery
|
731 |
+
- Link processing:
|
732 |
+
- Fixed URL normalization edge cases
|
733 |
+
- Better handling of invalid URLs
|
734 |
+
- Improved error messages for link processing failures
|
735 |
+
|
736 |
+
### Developer Notes
|
737 |
+
- The overlay removal system uses advanced JavaScript injection for better compatibility
|
738 |
+
- URL normalization handles special cases like mailto:, tel:, and protocol-relative URLs
|
739 |
+
- Screenshot system now reuses existing page context for better performance
|
740 |
+
- Link processing maintains separate dictionaries for internal and external links to ensure uniqueness
|
741 |
+
|
742 |
+
## [v0.3.72] - 2024-10-22
|
743 |
+
|
744 |
+
### Added
|
745 |
+
- New `ContentCleaningStrategy` class:
|
746 |
+
- Smart content extraction based on text density and element scoring
|
747 |
+
- Automatic removal of boilerplate content
|
748 |
+
- DOM tree analysis for better content identification
|
749 |
+
- Configurable thresholds for content detection
|
750 |
+
- Advanced proxy support:
|
751 |
+
- Added `proxy_config` option for authenticated proxy connections
|
752 |
+
- Support for username/password in proxy configuration
|
753 |
+
- New content output formats:
|
754 |
+
- `fit_markdown`: Optimized markdown output with main content focus
|
755 |
+
- `fit_html`: Clean HTML with only essential content
|
756 |
+
|
757 |
+
### Enhanced
|
758 |
+
- Image source detection:
|
759 |
+
- Support for multiple image source attributes (`src`, `data-src`, `srcset`, etc.)
|
760 |
+
- Automatic fallback through potential source attributes
|
761 |
+
- Smart handling of srcset attribute
|
762 |
+
- External content handling:
|
763 |
+
- Made external link exclusion optional (disabled by default)
|
764 |
+
- Improved detection and handling of social media links
|
765 |
+
- Better control over external image filtering
|
766 |
+
|
767 |
+
### Fixed
|
768 |
+
- Image extraction reliability with multiple source attribute checks
|
769 |
+
- External link and image handling logic for better accuracy
|
770 |
+
|
771 |
+
### Developer Notes
|
772 |
+
- The new `ContentCleaningStrategy` uses configurable thresholds for customization
|
773 |
+
- Proxy configuration now supports more complex authentication scenarios
|
774 |
+
- Content extraction process now provides both regular and optimized outputs
|
775 |
+
|
776 |
+
## [v0.3.72] - 2024-10-20
|
777 |
+
|
778 |
+
### Fixed
|
779 |
+
- Added support for parsing Base64 encoded images in WebScrapingStrategy
|
780 |
+
|
781 |
+
### Added
|
782 |
+
- Forked and integrated a customized version of the html2text library for more control over Markdown generation
|
783 |
+
- New configuration options for controlling external content:
|
784 |
+
- Ability to exclude all external links
|
785 |
+
- Option to specify domains to exclude (default includes major social media platforms)
|
786 |
+
- Control over excluding external images
|
787 |
+
|
788 |
+
### Changed
|
789 |
+
- Improved Markdown generation process:
|
790 |
+
- Added fine-grained control over character escaping in Markdown output
|
791 |
+
- Enhanced handling of code blocks and pre-formatted text
|
792 |
+
- Updated `AsyncPlaywrightCrawlerStrategy.close()` method to use a shorter sleep time (0.5 seconds instead of 500)
|
793 |
+
- Enhanced flexibility in `CosineStrategy` with a more generic `load_HF_embedding_model` function
|
794 |
+
|
795 |
+
### Improved
|
796 |
+
- Optimized content scraping and processing for better efficiency
|
797 |
+
- Enhanced error handling and logging in various components
|
798 |
+
|
799 |
+
### Developer Notes
|
800 |
+
- The customized html2text library is now located within the crawl4ai package
|
801 |
+
- New configuration options are available in the `config.py` file for external content handling
|
802 |
+
- The `WebScrapingStrategy` class has been updated to accommodate new external content exclusion options
|
803 |
+
|
804 |
+
## [v0.3.71] - 2024-10-19
|
805 |
+
|
806 |
+
### Added
|
807 |
+
- New chunking strategies:
|
808 |
+
- `OverlappingWindowChunking`: Allows for overlapping chunks of text, useful for maintaining context between chunks.
|
809 |
+
- Enhanced `SlidingWindowChunking`: Improved to handle edge cases and last chunks more effectively.
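A short sketch of how the chunking strategies above might be used; the constructor arguments (`window_size`, `overlap`) and their units are assumptions and may differ in your version:

```python
from crawl4ai.chunking_strategy import OverlappingWindowChunking

text = "Crawl4AI turns web pages into LLM-ready text. " * 50

# Overlapping windows keep shared context between neighbouring chunks
chunker = OverlappingWindowChunking(window_size=100, overlap=20)  # assumed to be word counts
chunks = chunker.chunk(text)
print(len(chunks), chunks[0][:80])
```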
|
810 |
+
|
811 |
+
### Changed
|
812 |
+
- Updated `CHUNK_TOKEN_THRESHOLD` in config to 2048 tokens (2^11) for better compatibility with most LLM models.
|
813 |
+
- Improved `AsyncPlaywrightCrawlerStrategy.close()` method to use a shorter sleep time (0.5 seconds instead of 500), significantly reducing wait time when closing the crawler.
|
814 |
+
- Enhanced flexibility in `CosineStrategy`:
|
815 |
+
- Now uses a more generic `load_HF_embedding_model` function, allowing for easier swapping of embedding models.
|
816 |
+
- Updated `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy` for better JSON-based extraction.
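As a reference point, a minimal schema-based extraction sketch for `JsonCssExtractionStrategy`; the selectors and field names here are purely illustrative:

```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

# Illustrative schema: each element matching baseSelector yields one JSON object
schema = {
    "name": "Articles",
    "baseSelector": "article.post",
    "fields": [
        {"name": "title", "selector": "h2", "type": "text"},
        {"name": "url", "selector": "a", "type": "attribute", "attribute": "href"},
    ],
}

strategy = JsonCssExtractionStrategy(schema, verbose=True)
# Pass it to crawler.arun(..., extraction_strategy=strategy) to receive JSON output
```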
|
817 |
+
|
818 |
+
### Fixed
|
819 |
+
- Addressed potential issues with the sliding window chunking strategy to ensure all text is properly chunked.
|
820 |
+
|
821 |
+
### Developer Notes
|
822 |
+
- Added more comprehensive docstrings to chunking strategies for better code documentation.
|
823 |
+
- Removed hardcoded device setting in `CosineStrategy`, now using the automatically detected device.
|
824 |
+
- Added a new example in `quickstart_async.py` for generating a knowledge graph from crawled content.
|
825 |
+
|
826 |
+
These updates aim to provide more flexibility in text processing, improve performance, and enhance the overall capabilities of the crawl4ai library. The new chunking strategies, in particular, offer more options for handling large texts in various scenarios.
|
827 |
+
|
828 |
+
## [v0.3.71] - 2024-10-18
|
829 |
+
|
830 |
+
### Changes
|
831 |
+
1. **Version Update**:
|
832 |
+
- Updated version number from 0.3.7 to 0.3.71.
|
833 |
+
|
834 |
+
2. **Crawler Enhancements**:
|
835 |
+
- Added `sleep_on_close` option to AsyncPlaywrightCrawlerStrategy for delayed browser closure.
|
836 |
+
- Improved context creation with additional options:
|
837 |
+
- Enabled `accept_downloads` and `java_script_enabled`.
|
838 |
+
- Added a cookie to enable cookies by default.
|
839 |
+
|
840 |
+
3. **Error Handling Improvements**:
|
841 |
+
- Enhanced error messages in AsyncWebCrawler's `arun` method.
|
842 |
+
- Updated error reporting format for better visibility and consistency.
|
843 |
+
|
844 |
+
4. **Performance Optimization**:
|
845 |
+
- Commented out automatic page and context closure in `crawl` method to potentially improve performance in certain scenarios.
|
846 |
+
|
847 |
+
### Documentation
|
848 |
+
- Updated quickstart notebook:
|
849 |
+
- Changed installation command to use the released package instead of GitHub repository.
|
850 |
+
- Updated kernel display name.
|
851 |
+
|
852 |
+
### Developer Notes
|
853 |
+
- Minor code refactoring and cleanup.
|
854 |
+
|
855 |
+
## [v0.3.7] - 2024-10-17
|
856 |
+
|
857 |
+
### New Features
|
858 |
+
1. **Enhanced Browser Stealth**:
|
859 |
+
- Implemented `playwright_stealth` for improved bot detection avoidance.
|
860 |
+
- Added `StealthConfig` for fine-tuned control over stealth parameters.
|
861 |
+
|
862 |
+
2. **User Simulation**:
|
863 |
+
- New `simulate_user` option to mimic human-like interactions (mouse movements, clicks, keyboard presses).
|
864 |
+
|
865 |
+
3. **Navigator Override**:
|
866 |
+
- Added `override_navigator` option to modify navigator properties, further improving bot detection evasion.
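A hedged sketch combining the `override_navigator` flag with the `simulate_user` option from the previous item; both are assumed to be keyword arguments to `arun()`, and exact behaviour depends on your version:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com",
            simulate_user=True,       # mimic mouse/keyboard activity
            override_navigator=True,  # patch navigator properties to look less like a bot
        )
        print(result.success)

asyncio.run(main())
```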
|
867 |
+
|
868 |
+
4. **Improved iframe Handling**:
|
869 |
+
- New `process_iframes` parameter to extract and integrate iframe content into the main page.
|
870 |
+
|
871 |
+
5. **Flexible Browser Selection**:
|
872 |
+
- Support for choosing between Chromium, Firefox, and WebKit browsers.
|
873 |
+
|
874 |
+
6. **Include Links in Markdown**:
|
875 |
+
- Added support for including links in Markdown content by defining a new flag, `include_links_on_markdown`, in the `crawl` method.
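A minimal sketch of the flag described above, shown here with `arun()`; the flag name is taken from this changelog entry and may behave differently in later versions:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com",
            include_links_on_markdown=True,  # keep hyperlinks in the generated Markdown
        )
        print(result.markdown[:500])

asyncio.run(main())
```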
|
876 |
+
|
877 |
+
### Improvements
|
878 |
+
1. **Better Error Handling**:
|
879 |
+
- Enhanced error reporting in WebScrapingStrategy with detailed error messages and suggestions.
|
880 |
+
- Added console message and error logging for better debugging.
|
881 |
+
|
882 |
+
2. **Image Processing Enhancements**:
|
883 |
+
- Improved image dimension updating and filtering logic.
|
884 |
+
|
885 |
+
3. **Crawling Flexibility**:
|
886 |
+
- Added support for custom viewport sizes.
|
887 |
+
- Implemented delayed content retrieval with `delay_before_return_html` parameter.
|
888 |
+
|
889 |
+
4. **Performance Optimization**:
|
890 |
+
- Adjusted default semaphore count for parallel crawling.
|
891 |
+
|
892 |
+
### Bug Fixes
|
893 |
+
- Fixed an issue where the HTML content could be empty after processing.
|
894 |
+
|
895 |
+
### Examples
|
896 |
+
- Added new example `crawl_with_user_simulation()` demonstrating the use of user simulation and navigator override features.
|
897 |
+
|
898 |
+
### Developer Notes
|
899 |
+
- Refactored code for better maintainability and readability.
|
900 |
+
- Updated browser launch arguments for improved compatibility and performance.
|
901 |
+
|
902 |
+
## [v0.3.6] - 2024-10-12
|
903 |
+
|
904 |
+
### 1. Improved Crawling Control
|
905 |
+
- **New Hook**: Added `before_retrieve_html` hook in `AsyncPlaywrightCrawlerStrategy`.
|
906 |
+
- **Delayed HTML Retrieval**: Introduced `delay_before_return_html` parameter to allow waiting before retrieving HTML content.
|
907 |
+
- Useful for pages with delayed content loading.
|
908 |
+
- **Flexible Timeout**: `smart_wait` function now uses `page_timeout` (default 60 seconds) instead of a fixed 30-second timeout.
|
909 |
+
- Provides better handling for slow-loading pages.
|
910 |
+
- **How to use**: Set `page_timeout=your_desired_timeout` (in milliseconds) when calling `crawler.arun()`.
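For example, a minimal sketch (the URL is a placeholder):

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        # Allow up to 90 seconds for slow-loading pages (value is in milliseconds)
        result = await crawler.arun(url="https://example.com", page_timeout=90000)
        print(result.success)

asyncio.run(main())
```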
|
911 |
+
|
912 |
+
### 2. Browser Type Selection
|
913 |
+
- Added support for different browser types (Chromium, Firefox, WebKit).
|
914 |
+
- Users can now specify the browser type when initializing AsyncWebCrawler.
|
915 |
+
- **How to use**: Set `browser_type="firefox"` or `browser_type="webkit"` when initializing AsyncWebCrawler.
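A minimal sketch of switching browsers (Firefox must already be installed via Playwright for this to work):

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    # Use Firefox instead of the default Chromium
    async with AsyncWebCrawler(browser_type="firefox") as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown[:200])

asyncio.run(main())
```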
|
916 |
+
|
917 |
+
### 3. Screenshot Capture
|
918 |
+
- Added ability to capture screenshots during crawling.
|
919 |
+
- Useful for debugging and content verification.
|
920 |
+
- **How to use**: Set `screenshot=True` when calling `crawler.arun()`.
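A small sketch that saves the capture to disk; `result.screenshot` is assumed here to hold base64-encoded image data:

```python
import asyncio
import base64
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", screenshot=True)
        if result.screenshot:
            # Assumption: the screenshot is returned as a base64 string
            with open("page.png", "wb") as f:
                f.write(base64.b64decode(result.screenshot))

asyncio.run(main())
```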
|
921 |
+
|
922 |
+
### 4. Enhanced LLM Extraction Strategy
|
923 |
+
- Added support for multiple LLM providers (OpenAI, Hugging Face, Ollama).
|
924 |
+
- **Custom Arguments**: Added support for passing extra arguments to LLM providers via `extra_args` parameter.
|
925 |
+
- **Custom Headers**: Users can now pass custom headers to the extraction strategy.
|
926 |
+
- **How to use**: Specify the desired provider and custom arguments when using `LLMExtractionStrategy`.
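A hedged sketch of provider selection with extra arguments; the provider string and parameter names follow this changelog and common usage, but the exact signature may differ in your version:

```python
import asyncio
import os
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy

async def main():
    strategy = LLMExtractionStrategy(
        provider="openai/gpt-4o-mini",           # or e.g. "ollama/llama3" for a local model
        api_token=os.getenv("OPENAI_API_KEY"),
        instruction="Extract the page title and a one-sentence summary as JSON.",
        extra_args={"temperature": 0.0},         # forwarded to the LLM provider
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", extraction_strategy=strategy)
        print(result.extracted_content)

asyncio.run(main())
```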
|
927 |
+
|
928 |
+
### 5. iframe Content Extraction
|
929 |
+
- New feature to process and extract content from iframes.
|
930 |
+
- **How to use**: Set `process_iframes=True` in the crawl method.
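For example, a minimal sketch:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        # Merge iframe content into the main page before extraction
        result = await crawler.arun(url="https://example.com", process_iframes=True)
        print(result.markdown[:300])

asyncio.run(main())
```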
|
931 |
+
|
932 |
+
### 6. Delayed Content Retrieval
|
933 |
+
- Introduced `get_delayed_content` method in `AsyncCrawlResponse`.
|
934 |
+
- Allows retrieval of content after a specified delay, useful for dynamically loaded content.
|
935 |
+
- **How to use**: Access `result.get_delayed_content(delay_in_seconds)` after crawling.
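A sketch of delayed retrieval; `get_delayed_content` is assumed here to be awaitable and to return the page HTML after the given delay:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        # Assumption: awaiting re-reads the page content after ~5 seconds
        late_html = await result.get_delayed_content(5)
        print(len(late_html))

asyncio.run(main())
```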
|
936 |
+
|
937 |
+
### Improvements and Optimizations
|
938 |
+
|
939 |
+
#### 1. AsyncWebCrawler Enhancements
|
940 |
+
- **Flexible Initialization**: Now accepts arbitrary keyword arguments, passed directly to the crawler strategy.
|
941 |
+
- Allows for more customized setups.
|
942 |
+
|
943 |
+
#### 2. Image Processing Optimization
|
944 |
+
- Enhanced image handling in WebScrapingStrategy.
|
945 |
+
- Added filtering for small, invisible, or irrelevant images.
|
946 |
+
- Improved image scoring system for better content relevance.
|
947 |
+
- Implemented JavaScript-based image dimension updating for more accurate representation.
|
948 |
+
|
949 |
+
#### 3. Database Schema Auto-updates
|
950 |
+
- Automatic database schema updates ensure compatibility with the latest version.
|
951 |
+
|
952 |
+
#### 4. Enhanced Error Handling and Logging
|
953 |
+
- Improved error messages and logging for easier debugging.
|
954 |
+
|
955 |
+
#### 5. Content Extraction Refinements
|
956 |
+
- Refined HTML sanitization process.
|
957 |
+
- Improved handling of base64 encoded images.
|
958 |
+
- Enhanced Markdown conversion process.
|
959 |
+
- Optimized content extraction algorithms.
|
960 |
+
|
961 |
+
#### 6. Utility Function Enhancements
|
962 |
+
- `perform_completion_with_backoff` function now supports additional arguments for more customized API calls to LLM providers.
|
963 |
+
|
964 |
+
### Bug Fixes
|
965 |
+
- Fixed an issue where image tags were being prematurely removed during content extraction.
|
966 |
+
|
967 |
+
### Examples and Documentation
|
968 |
+
- Updated `quickstart_async.py` with examples of:
|
969 |
+
- Using custom headers in LLM extraction.
|
970 |
+
- Different LLM provider usage (OpenAI, Hugging Face, Ollama).
|
971 |
+
- Custom browser type usage.
|
972 |
+
|
973 |
+
### Developer Notes
|
974 |
+
- Refactored code for better maintainability, flexibility, and performance.
|
975 |
+
- Enhanced type hinting throughout the codebase for improved development experience.
|
976 |
+
- Expanded error handling for more robust operation.
|
977 |
+
|
978 |
+
These updates significantly enhance the flexibility, accuracy, and robustness of crawl4ai, providing users with more control and options for their web crawling and content extraction tasks.
|
979 |
+
|
980 |
+
## [v0.3.5] - 2024-09-02
|
981 |
+
|
982 |
+
Enhance AsyncWebCrawler with smart waiting and screenshot capabilities
|
983 |
+
|
984 |
+
- Implement smart_wait function in AsyncPlaywrightCrawlerStrategy
|
985 |
+
- Add screenshot support to AsyncCrawlResponse and AsyncWebCrawler
|
986 |
+
- Improve error handling and timeout management in crawling process
|
987 |
+
- Fix typo in CrawlResult model (responser_headers -> response_headers)
|
988 |
+
|
989 |
+
## [v0.2.77] - 2024-08-04
|
990 |
+
|
991 |
+
Significant improvements in text processing and performance:
|
992 |
+
|
993 |
+
- 🚀 **Dependency reduction**: Removed dependency on spaCy model for text chunk labeling in cosine extraction strategy.
|
994 |
+
- 🤖 **Transformer upgrade**: Implemented text sequence classification using a transformer model for labeling text chunks.
|
995 |
+
- ⚡ **Performance enhancement**: Improved model loading speed due to removal of spaCy dependency.
|
996 |
+
- 🔧 **Future-proofing**: Laid groundwork for potential complete removal of spaCy dependency in future versions.
|
997 |
+
|
998 |
+
These changes address issue #68 and provide a foundation for faster, more efficient text processing in Crawl4AI.
|
999 |
+
|
1000 |
+
## [v0.2.76] - 2024-08-02
|
1001 |
+
|
1002 |
+
Major improvements in functionality, performance, and cross-platform compatibility! 🚀
|
1003 |
+
|
1004 |
+
- 🐳 **Docker enhancements**: Significantly improved Dockerfile for easy installation on Linux, Mac, and Windows.
|
1005 |
+
- 🌐 **Official Docker Hub image**: Launched our first official image on Docker Hub for streamlined deployment.
|
1006 |
+
- 🔧 **Selenium upgrade**: Removed dependency on ChromeDriver, now using Selenium's built-in capabilities for better compatibility.
|
1007 |
+
- 🖼️ **Image description**: Implemented ability to generate textual descriptions for extracted images from web pages.
|
1008 |
+
- ⚡ **Performance boost**: Various improvements to enhance overall speed and performance.
|
1009 |
+
|
1010 |
+
A big shoutout to our amazing community contributors:
|
1011 |
+
- [@aravindkarnam](https://github.com/aravindkarnam) for developing the textual description extraction feature.
|
1012 |
+
- [@FractalMind](https://github.com/FractalMind) for creating the first official Docker Hub image and fixing Dockerfile errors.
|
1013 |
+
- [@ketonkss4](https://github.com/ketonkss4) for identifying Selenium's new capabilities, helping us reduce dependencies.
|
1014 |
+
|
1015 |
+
Your contributions are driving Crawl4AI forward! 🙌
|
1016 |
+
|
1017 |
+
## [v0.2.75] - 2024-07-19
|
1018 |
+
|
1019 |
+
Minor improvements for a more maintainable codebase:
|
1020 |
+
|
1021 |
+
- 🔄 Fixed typos in `chunking_strategy.py` and `crawler_strategy.py` to improve code readability
|
1022 |
+
- 🔄 Removed `.test_pads/` directory from `.gitignore` to keep our repository clean and organized
|
1023 |
+
|
1024 |
+
These changes may seem small, but they contribute to a more stable and sustainable codebase. By fixing typos and updating our `.gitignore` settings, we're ensuring that our code is easier to maintain and scale in the long run.
|
1025 |
+
|
1026 |
+
## [v0.2.74] - 2024-07-08
|
1027 |
+
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
|
1028 |
+
|
1029 |
+
- 💻 **UTF encoding fix**: Resolved the Windows "charmap" error by adding UTF encoding.
|
1030 |
+
- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
|
1031 |
+
- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
|
1032 |
+
- 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
|
1033 |
+
|
1034 |
+
|
1035 |
+
## [v0.2.73] - 2024-07-03
|
1036 |
+
|
1037 |
+
💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
|
1038 |
+
|
1039 |
+
* Added support for sites that require "with-head" (non-headless) mode, crawling with a visible browser head.
* Fixed installation issues in setup.py and the Dockerfile.
* Resolved multiple issues.
|
1042 |
+
|
1043 |
+
## [v0.2.72] - 2024-06-30
|
1044 |
+
|
1045 |
+
This release brings exciting updates and improvements to our project! 🎉
|
1046 |
+
|
1047 |
+
* 📚 **Documentation Updates**: Our documentation has been revamped to reflect the latest changes and additions.
|
1048 |
+
* 🚀 **New Modes in setup.py**: We've added support for three new modes in setup.py: default, torch, and transformers. This enhances the project's flexibility and usability.
|
1049 |
+
* 🐳 **Docker File Updates**: The Docker file has been updated to ensure seamless compatibility with the new modes and improvements.
|
1050 |
+
* 🕷️ **Temporary Solution for Headless Crawling**: We've implemented a temporary solution to overcome issues with crawling websites in headless mode.
|
1051 |
+
|
1052 |
+
These changes aim to improve the overall user experience, provide more flexibility, and enhance the project's performance. We're thrilled to share these updates with you and look forward to continuing to evolve and improve our project!
|
1053 |
+
|
1054 |
+
## [0.2.71] - 2024-06-26
|
1055 |
+
|
1056 |
+
**Improved Error Handling and Performance** 🚧
|
1057 |
+
|
1058 |
+
* 🚫 Refactored `crawler_strategy.py` to handle exceptions and provide better error messages, making it more robust and reliable.
|
1059 |
+
* 💻 Optimized the `get_content_of_website_optimized` function in `utils.py` for improved performance, reducing potential bottlenecks.
|
1060 |
+
* 💻 Updated `utils.py` with the latest changes, ensuring consistency and accuracy.
|
1061 |
+
* 🚫 Migrated to `ChromeDriverManager` to resolve Chrome driver download issues, providing a smoother user experience.
|
1062 |
+
|
1063 |
+
These changes focus on refining the existing codebase, resulting in a more stable, efficient, and user-friendly experience. With these improvements, you can expect fewer errors and better performance in the crawler strategy and utility functions.
|
1064 |
+
|
1065 |
+
## [0.2.71] - 2024-06-25
|
1066 |
+
### Fixed
|
1067 |
+
- Sped up the extraction function by 2x.
|
1068 |
+
|
1069 |
+
|
1070 |
+
## [0.2.6] - 2024-06-22
|
1071 |
+
### Fixed
|
1072 |
+
- Fix issue #19: Update Dockerfile to ensure compatibility across multiple platforms.
|
1073 |
+
|
1074 |
+
## [0.2.5] - 2024-06-18
|
1075 |
+
### Added
|
1076 |
+
- Added five important hooks to the crawler:
|
1077 |
+
- on_driver_created: Called when the driver is ready for initialization.
|
1078 |
+
- before_get_url: Called right before Selenium fetches the URL.
|
1079 |
+
- after_get_url: Called after Selenium fetches the URL.
|
1080 |
+
- before_return_html: Called when the data is parsed and ready.
|
1081 |
+
- on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize.
|
1082 |
+
- Added an example in `quickstart.py` in the example folder under the docs.
|
1083 |
+
- Enhancement issue #24: Replaced inline HTML tags (e.g., DEL, INS, SUB, ABBR) with textual format for better context handling in LLM.
|
1084 |
+
- Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness.
|
1085 |
+
- Updated Dockerfile to ensure compatibility across multiple platforms (Hopefully!).
|
1086 |
+
|
1087 |
+
## [v0.2.4] - 2024-06-17
|
1088 |
+
### Fixed
|
1089 |
+
- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
|
CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,131 @@
# Crawl4AI Code of Conduct
|
2 |
+
|
3 |
+
## Our Pledge
|
4 |
+
|
5 |
+
We as members, contributors, and leaders pledge to make participation in our
|
6 |
+
community a harassment-free experience for everyone, regardless of age, body
|
7 |
+
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
8 |
+
identity and expression, level of experience, education, socio-economic status,
|
9 |
+
nationality, personal appearance, race, caste, color, religion, or sexual
|
10 |
+
identity and orientation.
|
11 |
+
|
12 |
+
We pledge to act and interact in ways that contribute to an open, welcoming,
|
13 |
+
diverse, inclusive, and healthy community.
|
14 |
+
|
15 |
+
## Our Standards
|
16 |
+
|
17 |
+
Examples of behavior that contributes to a positive environment for our
|
18 |
+
community include:
|
19 |
+
|
20 |
+
* Demonstrating empathy and kindness toward other people
|
21 |
+
* Being respectful of differing opinions, viewpoints, and experiences
|
22 |
+
* Giving and gracefully accepting constructive feedback
|
23 |
+
* Accepting responsibility and apologizing to those affected by our mistakes,
|
24 |
+
and learning from the experience
|
25 |
+
* Focusing on what is best not just for us as individuals, but for the overall
|
26 |
+
community
|
27 |
+
|
28 |
+
Examples of unacceptable behavior include:
|
29 |
+
|
30 |
+
* The use of sexualized language or imagery, and sexual attention or advances of
|
31 |
+
any kind
|
32 |
+
* Trolling, insulting or derogatory comments, and personal or political attacks
|
33 |
+
* Public or private harassment
|
34 |
+
* Publishing others' private information, such as a physical or email address,
|
35 |
+
without their explicit permission
|
36 |
+
* Other conduct which could reasonably be considered inappropriate in a
|
37 |
+
professional setting
|
38 |
+
|
39 |
+
## Enforcement Responsibilities
|
40 |
+
|
41 |
+
Community leaders are responsible for clarifying and enforcing our standards of
|
42 |
+
acceptable behavior and will take appropriate and fair corrective action in
|
43 |
+
response to any behavior that they deem inappropriate, threatening, offensive,
|
44 |
+
or harmful.
|
45 |
+
|
46 |
+
Community leaders have the right and responsibility to remove, edit, or reject
|
47 |
+
comments, commits, code, wiki edits, issues, and other contributions that are
|
48 |
+
not aligned to this Code of Conduct, and will communicate reasons for moderation
|
49 |
+
decisions when appropriate.
|
50 |
+
|
51 |
+
## Scope
|
52 |
+
|
53 |
+
This Code of Conduct applies within all community spaces, and also applies when
|
54 |
+
an individual is officially representing the community in public spaces.
|
55 |
+
Examples of representing our community include using an official email address,
|
56 |
+
posting via an official social media account, or acting as an appointed
|
57 |
+
representative at an online or offline event.
|
58 |
+
|
59 |
+
## Enforcement
|
60 |
+
|
61 |
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
62 |
+
reported to the community leaders responsible for enforcement at
|
63 |
+
[email protected]. All complaints will be reviewed and investigated promptly and fairly.
|
64 |
+
|
65 |
+
All community leaders are obligated to respect the privacy and security of the
|
66 |
+
reporter of any incident.
|
67 |
+
|
68 |
+
## Enforcement Guidelines
|
69 |
+
|
70 |
+
Community leaders will follow these Community Impact Guidelines in determining
|
71 |
+
the consequences for any action they deem in violation of this Code of Conduct:
|
72 |
+
|
73 |
+
### 1. Correction
|
74 |
+
|
75 |
+
**Community Impact**: Use of inappropriate language or other behavior deemed
|
76 |
+
unprofessional or unwelcome in the community.
|
77 |
+
|
78 |
+
**Consequence**: A private, written warning from community leaders, providing
|
79 |
+
clarity around the nature of the violation and an explanation of why the
|
80 |
+
behavior was inappropriate. A public apology may be requested.
|
81 |
+
|
82 |
+
### 2. Warning
|
83 |
+
|
84 |
+
**Community Impact**: A violation through a single incident or series of
|
85 |
+
actions.
|
86 |
+
|
87 |
+
**Consequence**: A warning with consequences for continued behavior. No
|
88 |
+
interaction with the people involved, including unsolicited interaction with
|
89 |
+
those enforcing the Code of Conduct, for a specified period of time. This
|
90 |
+
includes avoiding interactions in community spaces as well as external channels
|
91 |
+
like social media. Violating these terms may lead to a temporary or permanent
|
92 |
+
ban.
|
93 |
+
|
94 |
+
### 3. Temporary Ban
|
95 |
+
|
96 |
+
**Community Impact**: A serious violation of community standards, including
|
97 |
+
sustained inappropriate behavior.
|
98 |
+
|
99 |
+
**Consequence**: A temporary ban from any sort of interaction or public
|
100 |
+
communication with the community for a specified period of time. No public or
|
101 |
+
private interaction with the people involved, including unsolicited interaction
|
102 |
+
with those enforcing the Code of Conduct, is allowed during this period.
|
103 |
+
Violating these terms may lead to a permanent ban.
|
104 |
+
|
105 |
+
### 4. Permanent Ban
|
106 |
+
|
107 |
+
**Community Impact**: Demonstrating a pattern of violation of community
|
108 |
+
standards, including sustained inappropriate behavior, harassment of an
|
109 |
+
individual, or aggression toward or disparagement of classes of individuals.
|
110 |
+
|
111 |
+
**Consequence**: A permanent ban from any sort of public interaction within the
|
112 |
+
community.
|
113 |
+
|
114 |
+
## Attribution
|
115 |
+
|
116 |
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
117 |
+
version 2.1, available at
|
118 |
+
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
|
119 |
+
|
120 |
+
Community Impact Guidelines were inspired by
|
121 |
+
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
|
122 |
+
|
123 |
+
For answers to common questions about this code of conduct, see the FAQ at
|
124 |
+
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
|
125 |
+
[https://www.contributor-covenant.org/translations][translations].
|
126 |
+
|
127 |
+
[homepage]: https://www.contributor-covenant.org
|
128 |
+
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
|
129 |
+
[Mozilla CoC]: https://github.com/mozilla/diversity
|
130 |
+
[FAQ]: https://www.contributor-covenant.org/faq
|
131 |
+
[translations]: https://www.contributor-covenant.org/translations
|
CONTRIBUTORS.md
ADDED
@@ -0,0 +1,42 @@
# Contributors to Crawl4AI
|
2 |
+
|
3 |
+
We would like to thank the following people for their contributions to Crawl4AI:
|
4 |
+
|
5 |
+
## Core Team
|
6 |
+
|
7 |
+
- [Unclecode](https://github.com/unclecode) - Project Creator and Main Developer
|
8 |
+
- [Nasrin](https://github.com/ntohidi) - Project Manager and Developer
|
9 |
+
- [Aravind Karnam](https://github.com/aravindkarnam) - Developer
|
10 |
+
|
11 |
+
## Community Contributors
|
12 |
+
|
13 |
+
- [aadityakanjolia4](https://github.com/aadityakanjolia4) - Fix for `CustomHTML2Text` is not defined.
|
14 |
+
- [FractalMind](https://github.com/FractalMind) - Created the first official Docker Hub image and fixed Dockerfile errors
|
15 |
+
- [ketonkss4](https://github.com/ketonkss4) - Identified Selenium's new capabilities, helping reduce dependencies
|
16 |
+
- [jonymusky](https://github.com/jonymusky) - JavaScript execution documentation and `wait_for` support
|
17 |
+
- [datehoer](https://github.com/datehoer) - Added browser proxy support
|
18 |
+
|
19 |
+
## Pull Requests
|
20 |
+
|
21 |
+
- [dvschuyl](https://github.com/dvschuyl) - AsyncPlaywrightCrawlerStrategy page-evaluate context destroyed by navigation [#304](https://github.com/unclecode/crawl4ai/pull/304)
|
22 |
+
- [nelzomal](https://github.com/nelzomal) - Enhance development installation instructions [#286](https://github.com/unclecode/crawl4ai/pull/286)
|
23 |
+
- [HamzaFarhan](https://github.com/HamzaFarhan) - Handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined [#293](https://github.com/unclecode/crawl4ai/pull/293)
|
24 |
+
- [NanmiCoder](https://github.com/NanmiCoder) - fix: crawler strategy exception handling and fixes [#271](https://github.com/unclecode/crawl4ai/pull/271)
|
25 |
+
- [paulokuong](https://github.com/paulokuong) - fix: CRAWL4_AI_BASE_DIRECTORY should be Path object instead of string [#298](https://github.com/unclecode/crawl4ai/pull/298)
|
26 |
+
|
27 |
+
|
28 |
+
## Other Contributors
|
29 |
+
|
30 |
+
- [Gokhan](https://github.com/gkhngyk)
|
31 |
+
- [Shiv Kumar](https://github.com/shivkumar0757)
|
32 |
+
- [QIN2DIM](https://github.com/QIN2DIM)
|
33 |
+
|
34 |
+
## Acknowledgements
|
35 |
+
|
36 |
+
We also want to thank all the users who have reported bugs, suggested features, or helped in any other way to make Crawl4AI better.
|
37 |
+
|
38 |
+
---
|
39 |
+
|
40 |
+
If you've contributed to Crawl4AI and your name isn't on this list, please [open a pull request](https://github.com/unclecode/crawl4ai/pulls) with your name, link, and contribution, and we'll review it promptly.
|
41 |
+
|
42 |
+
Thank you all for your contributions!
|
Dockerfile
ADDED
@@ -0,0 +1,136 @@
# syntax=docker/dockerfile:1.4
|
2 |
+
|
3 |
+
ARG TARGETPLATFORM
|
4 |
+
ARG BUILDPLATFORM
|
5 |
+
|
6 |
+
# Other build arguments
|
7 |
+
ARG PYTHON_VERSION=3.10
|
8 |
+
|
9 |
+
# Base stage with system dependencies
|
10 |
+
FROM python:${PYTHON_VERSION}-slim as base
|
11 |
+
|
12 |
+
# Declare ARG variables again within the build stage
|
13 |
+
ARG INSTALL_TYPE=basic
|
14 |
+
ARG ENABLE_GPU=false
|
15 |
+
|
16 |
+
# Platform-specific labels
|
17 |
+
LABEL maintainer="unclecode"
|
18 |
+
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
|
19 |
+
LABEL version="1.0"
|
20 |
+
|
21 |
+
# Environment setup
|
22 |
+
ENV PYTHONUNBUFFERED=1 \
|
23 |
+
PYTHONDONTWRITEBYTECODE=1 \
|
24 |
+
PIP_NO_CACHE_DIR=1 \
|
25 |
+
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
26 |
+
PIP_DEFAULT_TIMEOUT=100 \
|
27 |
+
DEBIAN_FRONTEND=noninteractive
|
28 |
+
|
29 |
+
# Install system dependencies
|
30 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
31 |
+
build-essential \
|
32 |
+
curl \
|
33 |
+
wget \
|
34 |
+
gnupg \
|
35 |
+
git \
|
36 |
+
cmake \
|
37 |
+
pkg-config \
|
38 |
+
python3-dev \
|
39 |
+
libjpeg-dev \
|
40 |
+
libpng-dev \
|
41 |
+
&& rm -rf /var/lib/apt/lists/*
|
42 |
+
|
43 |
+
# Playwright system dependencies for Linux
|
44 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
45 |
+
libglib2.0-0 \
|
46 |
+
libnss3 \
|
47 |
+
libnspr4 \
|
48 |
+
libatk1.0-0 \
|
49 |
+
libatk-bridge2.0-0 \
|
50 |
+
libcups2 \
|
51 |
+
libdrm2 \
|
52 |
+
libdbus-1-3 \
|
53 |
+
libxcb1 \
|
54 |
+
libxkbcommon0 \
|
55 |
+
libx11-6 \
|
56 |
+
libxcomposite1 \
|
57 |
+
libxdamage1 \
|
58 |
+
libxext6 \
|
59 |
+
libxfixes3 \
|
60 |
+
libxrandr2 \
|
61 |
+
libgbm1 \
|
62 |
+
libpango-1.0-0 \
|
63 |
+
libcairo2 \
|
64 |
+
libasound2 \
|
65 |
+
libatspi2.0-0 \
|
66 |
+
&& rm -rf /var/lib/apt/lists/*
|
67 |
+
|
68 |
+
# GPU support if enabled and architecture is supported
|
69 |
+
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \
|
70 |
+
apt-get update && apt-get install -y --no-install-recommends \
|
71 |
+
nvidia-cuda-toolkit \
|
72 |
+
&& rm -rf /var/lib/apt/lists/* ; \
|
73 |
+
else \
|
74 |
+
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
|
75 |
+
fi
|
76 |
+
|
77 |
+
# Create and set working directory
|
78 |
+
WORKDIR /app
|
79 |
+
|
80 |
+
# Copy the entire project
|
81 |
+
COPY . .
|
82 |
+
|
83 |
+
# Install base requirements
|
84 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
85 |
+
|
86 |
+
# Install required library for FastAPI
|
87 |
+
RUN pip install fastapi uvicorn psutil
|
88 |
+
|
89 |
+
# Install ML dependencies first for better layer caching
|
90 |
+
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
|
91 |
+
pip install --no-cache-dir \
|
92 |
+
torch \
|
93 |
+
torchvision \
|
94 |
+
torchaudio \
|
95 |
+
scikit-learn \
|
96 |
+
nltk \
|
97 |
+
transformers \
|
98 |
+
tokenizers && \
|
99 |
+
python -m nltk.downloader punkt stopwords ; \
|
100 |
+
fi
|
101 |
+
|
102 |
+
# Install the package
|
103 |
+
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
|
104 |
+
pip install ".[all]" && \
|
105 |
+
python -m crawl4ai.model_loader ; \
|
106 |
+
elif [ "$INSTALL_TYPE" = "torch" ] ; then \
|
107 |
+
pip install ".[torch]" ; \
|
108 |
+
elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
|
109 |
+
pip install ".[transformer]" && \
|
110 |
+
python -m crawl4ai.model_loader ; \
|
111 |
+
else \
|
112 |
+
pip install "." ; \
|
113 |
+
fi
|
114 |
+
|
115 |
+
# Install MkDocs and required plugins
|
116 |
+
RUN pip install --no-cache-dir \
|
117 |
+
mkdocs \
|
118 |
+
mkdocs-material \
|
119 |
+
mkdocs-terminal \
|
120 |
+
pymdown-extensions
|
121 |
+
|
122 |
+
# Build MkDocs documentation
|
123 |
+
RUN mkdocs build
|
124 |
+
|
125 |
+
# Install Playwright and browsers
|
126 |
+
RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
|
127 |
+
playwright install chromium; \
|
128 |
+
elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
129 |
+
playwright install chromium; \
|
130 |
+
fi
|
131 |
+
|
132 |
+
# Expose port
|
133 |
+
EXPOSE 8000 11235 9222 8080
|
134 |
+
|
135 |
+
# Start the FastAPI server
|
136 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"]
|
LICENSE
ADDED
@@ -0,0 +1,51 @@
Apache License
|
2 |
+
Version 2.0, January 2004
|
3 |
+
http://www.apache.org/licenses/
|
4 |
+
|
5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
6 |
+
|
7 |
+
1. Definitions.
|
8 |
+
|
9 |
+
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
|
10 |
+
|
11 |
+
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
|
12 |
+
|
13 |
+
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
|
14 |
+
|
15 |
+
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
|
16 |
+
|
17 |
+
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
|
18 |
+
|
19 |
+
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
|
20 |
+
|
21 |
+
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
|
22 |
+
|
23 |
+
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
|
24 |
+
|
25 |
+
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
|
26 |
+
|
27 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
|
28 |
+
|
29 |
+
2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
|
30 |
+
|
31 |
+
3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
|
32 |
+
|
33 |
+
4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
|
34 |
+
|
35 |
+
You must give any other recipients of the Work or Derivative Works a copy of this License; and
|
36 |
+
You must cause any modified files to carry prominent notices stating that You changed the files; and
|
37 |
+
You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
|
38 |
+
If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
|
39 |
+
You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
|
40 |
+
|
41 |
+
5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
|
42 |
+
|
43 |
+
6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
|
44 |
+
|
45 |
+
7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
|
46 |
+
|
47 |
+
8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
|
48 |
+
|
49 |
+
9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
|
50 |
+
|
51 |
+
END OF TERMS AND CONDITIONS
|
MANIFEST.in
ADDED
@@ -0,0 +1,2 @@
include requirements.txt
|
2 |
+
recursive-include crawl4ai/js_snippet *.js
|
MISSION.md
ADDED
@@ -0,0 +1,46 @@
# Mission
|
2 |
+
|
3 |
+
![Mission Diagram](./docs/assets/pitch-dark.svg)
|
4 |
+
|
5 |
+
### 1. The Data Capitalization Opportunity
|
6 |
+
|
7 |
+
We live in an unprecedented era of digital wealth creation. Every day, individuals and enterprises generate massive amounts of valuable digital footprints across various platforms, social media channels, messenger apps, and cloud services. While people can interact with their data within these platforms, there's an immense untapped opportunity to transform this data into true capital assets. Just as physical property became a foundational element of wealth creation, personal and enterprise data has the potential to become a new form of capital on balance sheets.
|
8 |
+
|
9 |
+
For individuals, this represents an opportunity to transform their digital activities into valuable assets. For enterprises, their internal communications, team discussions, and collaborative documents contain rich insights that could be structured and valued as intellectual capital. This wealth of information represents an unprecedented opportunity for value creation in the digital age.
|
10 |
+
|
11 |
+
### 2. The Potential of Authentic Data
|
12 |
+
|
13 |
+
While synthetic data has played a crucial role in AI development, there's an enormous untapped potential in the authentic data generated by individuals and organizations. Every message, document, and interaction contains unique insights and patterns that could enhance AI development. The challenge isn't a lack of data - it's that most authentic human-generated data remains inaccessible for productive use.
|
14 |
+
|
15 |
+
By enabling willing participation in data sharing, we can unlock this vast reservoir of authentic human knowledge. This represents an opportunity to enhance AI development with diverse, real-world data that reflects the full spectrum of human experience and knowledge.
|
16 |
+
|
17 |
+
## Our Pathway to Data Democracy
|
18 |
+
|
19 |
+
### 1. Open-Source Foundation
|
20 |
+
|
21 |
+
Our first step is creating an open-source data extraction engine that empowers developers and innovators to build tools for data structuring and organization. This foundation ensures transparency, security, and community-driven development. By making these tools openly available, we enable the technical infrastructure needed for true data ownership and capitalization.
|
22 |
+
|
23 |
+
### 2. Data Capitalization Platform
|
24 |
+
|
25 |
+
Building on this open-source foundation, we're developing a platform that helps individuals and enterprises transform their digital footprints into structured, valuable assets. This platform will provide the tools and frameworks needed to organize, understand, and value personal and organizational data as true capital assets.
|
26 |
+
|
27 |
+
### 3. Creating a Data Marketplace
|
28 |
+
|
29 |
+
The final piece is establishing a marketplace where individuals and organizations can willingly share their data assets. This creates opportunities for:
|
30 |
+
- Individuals to earn equity, revenue, or other forms of value from their data
|
31 |
+
- Enterprises to access diverse, high-quality data for AI development
|
32 |
+
- Researchers to work with authentic human-generated data
|
33 |
+
- Startups to build innovative solutions using real-world data
|
34 |
+
|
35 |
+
## Economic Vision: A Shared Data Economy
|
36 |
+
|
37 |
+
We envision a future where data becomes a fundamental asset class in a thriving shared economy. This transformation will democratize AI development by enabling willing participation in data sharing, ensuring that the benefits of AI advancement flow back to data creators. Just as property rights revolutionized economic systems, establishing data as a capital asset will create new opportunities for wealth creation and economic participation.
|
38 |
+
|
39 |
+
This shared data economy will:
|
40 |
+
- Enable individuals to capitalize on their digital footprints
|
41 |
+
- Create new revenue streams for data creators
|
42 |
+
- Provide AI developers with access to diverse, authentic data
|
43 |
+
- Foster innovation through broader access to real-world data
|
44 |
+
- Ensure more equitable distribution of AI's economic benefits
|
45 |
+
|
46 |
+
Our vision is to facilitate this transformation from the ground up - starting with open-source tools, progressing to data capitalization platforms, and ultimately creating a thriving marketplace where data becomes a true asset class in a shared economy. This approach ensures that the future of AI is built on a foundation of authentic human knowledge, with benefits flowing back to the individuals and organizations who create and share their valuable data.
|
README.md
CHANGED
@@ -6,6 +6,565 @@ colorTo: pink
|
|
6 |
sdk: docker
|
7 |
pinned: false
|
8 |
license: mit
|
|
|
9 |
---
|
10 |
|
11 |
-
6 |
sdk: docker
|
7 |
pinned: false
|
8 |
license: mit
|
9 |
+
port: 11235
|
10 |
---
|
11 |
|
12 |
+
|
13 |
+
|
14 |
+
# 🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper.
|
15 |
+
|
16 |
+
<div align="center">
|
17 |
+
|
18 |
+
<a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
19 |
+
|
20 |
+
[![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
|
21 |
+
[![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)
|
22 |
+
|
23 |
+
[![PyPI version](https://badge.fury.io/py/crawl4ai.svg)](https://badge.fury.io/py/crawl4ai)
|
24 |
+
[![Python Version](https://img.shields.io/pypi/pyversions/crawl4ai)](https://pypi.org/project/crawl4ai/)
|
25 |
+
[![Downloads](https://static.pepy.tech/badge/crawl4ai/month)](https://pepy.tech/project/crawl4ai)
|
26 |
+
|
27 |
+
<!-- [![Documentation Status](https://readthedocs.org/projects/crawl4ai/badge/?version=latest)](https://crawl4ai.readthedocs.io/) -->
|
28 |
+
[![License](https://img.shields.io/github/license/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/blob/main/LICENSE)
|
29 |
+
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
|
30 |
+
[![Security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit)
|
31 |
+
[![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)](code_of_conduct.md)
|
32 |
+
|
33 |
+
</div>
|
34 |
+
|
35 |
+
Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease.
|
36 |
+
|
37 |
+
[✨ Check out latest update v0.4.24x](#-recent-updates)
|
38 |
+
|
39 |
+
🎉 **Version 0.4.24x is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog)
|
40 |
+
|
41 |
+
## 🧐 Why Crawl4AI?
|
42 |
+
|
43 |
+
1. **Built for LLMs**: Creates smart, concise Markdown optimized for RAG and fine-tuning applications.
|
44 |
+
2. **Lightning Fast**: Delivers results 6x faster with real-time, cost-efficient performance.
|
45 |
+
3. **Flexible Browser Control**: Offers session management, proxies, and custom hooks for seamless data access.
|
46 |
+
4. **Heuristic Intelligence**: Uses advanced algorithms for efficient extraction, reducing reliance on costly models.
|
47 |
+
5. **Open Source & Deployable**: Fully open-source with no API keys—ready for Docker and cloud integration.
|
48 |
+
6. **Thriving Community**: Actively maintained by a vibrant community and the #1 trending GitHub repository.
|
49 |
+
|
50 |
+
## 🚀 Quick Start
|
51 |
+
|
52 |
+
1. Install Crawl4AI:
|
53 |
+
```bash
|
54 |
+
# Install the package
|
55 |
+
pip install -U crawl4ai
|
56 |
+
|
57 |
+
# Run post-installation setup
|
58 |
+
crawl4ai-setup
|
59 |
+
|
60 |
+
# Verify your installation
|
61 |
+
crawl4ai-doctor
|
62 |
+
```
|
63 |
+
|
64 |
+
If you encounter any browser-related issues, you can install them manually:
|
65 |
+
```bash
|
66 |
+
python -m playwright install --with-deps chromium
|
67 |
+
```
|
68 |
+
|
69 |
+
2. Run a simple web crawl:
|
70 |
+
```python
|
71 |
+
import asyncio
|
72 |
+
from crawl4ai import *
|
73 |
+
|
74 |
+
async def main():
|
75 |
+
async with AsyncWebCrawler() as crawler:
|
76 |
+
result = await crawler.arun(
|
77 |
+
url="https://www.nbcnews.com/business",
|
78 |
+
)
|
79 |
+
print(result.markdown)
|
80 |
+
|
81 |
+
if __name__ == "__main__":
|
82 |
+
asyncio.run(main())
|
83 |
+
```
|
84 |
+
|
85 |
+
## ✨ Features
|
86 |
+
|
87 |
+
<details>
|
88 |
+
<summary>📝 <strong>Markdown Generation</strong></summary>
|
89 |
+
|
90 |
+
- 🧹 **Clean Markdown**: Generates clean, structured Markdown with accurate formatting.
|
91 |
+
- 🎯 **Fit Markdown**: Heuristic-based filtering to remove noise and irrelevant parts for AI-friendly processing.
|
92 |
+
- 🔗 **Citations and References**: Converts page links into a numbered reference list with clean citations.
|
93 |
+
- 🛠️ **Custom Strategies**: Users can create their own Markdown generation strategies tailored to specific needs.
|
94 |
+
- 📚 **BM25 Algorithm**: Employs BM25-based filtering for extracting core information and removing irrelevant content.
|
95 |
+
</details>
|
96 |
+
|
97 |
+
<details>
|
98 |
+
<summary>📊 <strong>Structured Data Extraction</strong></summary>
|
99 |
+
|
100 |
+
- 🤖 **LLM-Driven Extraction**: Supports all LLMs (open-source and proprietary) for structured data extraction.
|
101 |
+
- 🧱 **Chunking Strategies**: Implements chunking (topic-based, regex, sentence-level) for targeted content processing.
|
102 |
+
- 🌌 **Cosine Similarity**: Find relevant content chunks based on user queries for semantic extraction.
|
103 |
+
- 🔎 **CSS-Based Extraction**: Fast schema-based data extraction using XPath and CSS selectors.
|
104 |
+
- 🔧 **Schema Definition**: Define custom schemas for extracting structured JSON from repetitive patterns.
|
105 |
+
|
106 |
+
</details>
|
107 |
+
|
108 |
+
<details>
|
109 |
+
<summary>🌐 <strong>Browser Integration</strong></summary>
|
110 |
+
|
111 |
+
- 🖥️ **Managed Browser**: Use user-owned browsers with full control, avoiding bot detection.
|
112 |
+
- 🔄 **Remote Browser Control**: Connect to Chrome Developer Tools Protocol for remote, large-scale data extraction.
|
113 |
+
- 🔒 **Session Management**: Preserve browser states and reuse them for multi-step crawling.
|
114 |
+
- 🧩 **Proxy Support**: Seamlessly connect to proxies with authentication for secure access.
|
115 |
+
- ⚙️ **Full Browser Control**: Modify headers, cookies, user agents, and more for tailored crawling setups.
|
116 |
+
- 🌍 **Multi-Browser Support**: Compatible with Chromium, Firefox, and WebKit.
|
117 |
+
- 📐 **Dynamic Viewport Adjustment**: Automatically adjusts the browser viewport to match page content, ensuring complete rendering and capturing of all elements.
|
118 |
+
|
119 |
+
</details>
|
120 |
+
|
121 |
+
<details>
|
122 |
+
<summary>🔎 <strong>Crawling & Scraping</strong></summary>
|
123 |
+
|
124 |
+
- 🖼️ **Media Support**: Extract images, audio, videos, and responsive image formats like `srcset` and `picture`.
|
125 |
+
- 🚀 **Dynamic Crawling**: Execute JavaScript and use async or sync waits to extract dynamic content.
|
126 |
+
- 📸 **Screenshots**: Capture page screenshots during crawling for debugging or analysis.
|
127 |
+
- 📂 **Raw Data Crawling**: Directly process raw HTML (`raw:`) or local files (`file://`).
|
128 |
+
- 🔗 **Comprehensive Link Extraction**: Extracts internal, external links, and embedded iframe content.
|
129 |
+
- 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior.
|
130 |
+
- 💾 **Caching**: Cache data for improved speed and to avoid redundant fetches.
|
131 |
+
- 📄 **Metadata Extraction**: Retrieve structured metadata from web pages.
|
132 |
+
- 📡 **IFrame Content Extraction**: Seamless extraction from embedded iframe content.
|
133 |
+
- 🕵️ **Lazy Load Handling**: Waits for images to fully load, ensuring no content is missed due to lazy loading.
|
134 |
+
- 🔄 **Full-Page Scanning**: Simulates scrolling to load and capture all dynamic content, perfect for infinite scroll pages.
|
135 |
+
|
136 |
+
</details>
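The `raw:` and `file://` prefixes mentioned above can be exercised without touching the network. A minimal sketch, assuming the prefixes behave as described in the feature list and using a placeholder local path:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    raw_html = "<html><body><h1>Hello</h1><p>Inline HTML, no network fetch.</p></body></html>"

    async with AsyncWebCrawler() as crawler:
        # Crawl raw HTML passed directly with the "raw:" prefix.
        result_raw = await crawler.arun(url=f"raw:{raw_html}")
        print(result_raw.markdown)

        # Crawl a local file with the "file://" prefix (path is a placeholder).
        result_file = await crawler.arun(url="file:///tmp/page.html")
        print(result_file.markdown)

if __name__ == "__main__":
    asyncio.run(main())
```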
|
137 |
+
|
138 |
+
<details>
|
139 |
+
<summary>🚀 <strong>Deployment</strong></summary>
|
140 |
+
|
141 |
+
- 🐳 **Dockerized Setup**: Optimized Docker image with API server for easy deployment.
|
142 |
+
- 🔄 **API Gateway**: One-click deployment with secure token authentication for API-based workflows.
|
143 |
+
- 🌐 **Scalable Architecture**: Designed for mass-scale production and optimized server performance.
|
144 |
+
- ⚙️ **DigitalOcean Deployment**: Ready-to-deploy configurations for DigitalOcean and similar platforms.
|
145 |
+
|
146 |
+
</details>
|
147 |
+
|
148 |
+
<details>
|
149 |
+
<summary>🎯 <strong>Additional Features</strong></summary>
|
150 |
+
|
151 |
+
- 🕶️ **Stealth Mode**: Avoid bot detection by mimicking real users.
|
152 |
+
- 🏷️ **Tag-Based Content Extraction**: Refine crawling based on custom tags, headers, or metadata.
|
153 |
+
- 🔗 **Link Analysis**: Extract and analyze all links for detailed data exploration.
|
154 |
+
- 🛡️ **Error Handling**: Robust error management for seamless execution.
|
155 |
+
- 🔐 **CORS & Static Serving**: Supports filesystem-based caching and cross-origin requests.
|
156 |
+
- 📖 **Clear Documentation**: Simplified and updated guides for onboarding and advanced usage.
|
157 |
+
- 🙌 **Community Recognition**: Acknowledges contributors and pull requests for transparency.
|
158 |
+
|
159 |
+
</details>
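Several of these behaviours are toggled per crawl. A rough sketch of enabling the anti-bot helpers, assuming the `magic` flag shown in the later browser-profile example and the `simulate_user` / `override_navigator` options documented in `CrawlerRunConfig`:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

async def main():
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        simulate_user=True,        # simulate mouse movements and clicks
        override_navigator=True,   # patch navigator properties for a more human-like fingerprint
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=run_config,
            magic=True,            # automatic overlay/popup handling
        )
        print(result.markdown[:300])

if __name__ == "__main__":
    asyncio.run(main())
```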
|
160 |
+
|
161 |
+
## Try it Now!
|
162 |
+
|
163 |
+
✨ Play around with this [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing)
|
164 |
+
|
165 |
+
✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/)
|
166 |
+
|
167 |
+
## Installation 🛠️
|
168 |
+
|
169 |
+
Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker.
|
170 |
+
|
171 |
+
<details>
|
172 |
+
<summary>🐍 <strong>Using pip</strong></summary>
|
173 |
+
|
174 |
+
Choose the installation option that best fits your needs:
|
175 |
+
|
176 |
+
### Basic Installation
|
177 |
+
|
178 |
+
For basic web crawling and scraping tasks:
|
179 |
+
|
180 |
+
```bash
|
181 |
+
pip install crawl4ai
|
182 |
+
crawl4ai-setup # Setup the browser
|
183 |
+
```
|
184 |
+
|
185 |
+
By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling.
|
186 |
+
|
187 |
+
👉 **Note**: When you install Crawl4AI, the `crawl4ai-setup` command should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can install it manually using one of these methods:
|
188 |
+
|
189 |
+
1. Through the command line:
|
190 |
+
|
191 |
+
```bash
|
192 |
+
playwright install
|
193 |
+
```
|
194 |
+
|
195 |
+
2. If the above doesn't work, try this more specific command:
|
196 |
+
|
197 |
+
```bash
|
198 |
+
python -m playwright install chromium
|
199 |
+
```
|
200 |
+
|
201 |
+
This second method has proven to be more reliable in some cases.
|
202 |
+
|
203 |
+
---
|
204 |
+
|
205 |
+
### Installation with Synchronous Version
|
206 |
+
|
207 |
+
The sync version is deprecated and will be removed in future versions. If you need the synchronous version using Selenium:
|
208 |
+
|
209 |
+
```bash
|
210 |
+
pip install crawl4ai[sync]
|
211 |
+
```
|
212 |
+
|
213 |
+
---
|
214 |
+
|
215 |
+
### Development Installation
|
216 |
+
|
217 |
+
For contributors who plan to modify the source code:
|
218 |
+
|
219 |
+
```bash
|
220 |
+
git clone https://github.com/unclecode/crawl4ai.git
|
221 |
+
cd crawl4ai
|
222 |
+
pip install -e . # Basic installation in editable mode
|
223 |
+
```
|
224 |
+
|
225 |
+
Install optional features:
|
226 |
+
|
227 |
+
```bash
|
228 |
+
pip install -e ".[torch]" # With PyTorch features
|
229 |
+
pip install -e ".[transformer]" # With Transformer features
|
230 |
+
pip install -e ".[cosine]" # With cosine similarity features
|
231 |
+
pip install -e ".[sync]" # With synchronous crawling (Selenium)
|
232 |
+
pip install -e ".[all]" # Install all optional features
|
233 |
+
```
|
234 |
+
|
235 |
+
</details>
|
236 |
+
|
237 |
+
<details>
|
238 |
+
<summary>🐳 <strong>Docker Deployment</strong></summary>
|
239 |
+
|
240 |
+
> 🚀 **Major Changes Coming!** We're developing a completely new Docker implementation that will make deployment even more efficient and seamless. The current Docker setup is being deprecated in favor of this new solution.
|
241 |
+
|
242 |
+
### Current Docker Support
|
243 |
+
|
244 |
+
The existing Docker implementation is being deprecated and will be replaced soon. If you still need to use Docker with the current version:
|
245 |
+
|
246 |
+
- 📚 [Deprecated Docker Setup](./docs/deprecated/docker-deployment.md) - Instructions for the current Docker implementation
|
247 |
+
- ⚠️ Note: This setup will be replaced in the next major release
|
248 |
+
|
249 |
+
### What's Coming Next?
|
250 |
+
|
251 |
+
Our new Docker implementation will bring:
|
252 |
+
- Improved performance and resource efficiency
|
253 |
+
- Streamlined deployment process
|
254 |
+
- Better integration with Crawl4AI features
|
255 |
+
- Enhanced scalability options
|
256 |
+
|
257 |
+
Stay connected with our [GitHub repository](https://github.com/unclecode/crawl4ai) for updates!
|
258 |
+
|
259 |
+
</details>
|
260 |
+
|
261 |
+
---
|
262 |
+
|
263 |
+
### Quick Test
|
264 |
+
|
265 |
+
Run a quick test (works for both Docker options):
|
266 |
+
|
267 |
+
```python
|
268 |
+
import requests
|
269 |
+
|
270 |
+
# Submit a crawl job
|
271 |
+
response = requests.post(
|
272 |
+
"http://localhost:11235/crawl",
|
273 |
+
json={"urls": "https://example.com", "priority": 10}
|
274 |
+
)
|
275 |
+
task_id = response.json()["task_id"]
|
276 |
+
|
277 |
+
# Continue polling until the task is complete (status="completed")
|
278 |
+
result = requests.get(f"http://localhost:11235/task/{task_id}")
|
279 |
+
```
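The snippet above leaves the polling loop implicit. One way to wait for completion, as a rough sketch that assumes the task payload exposes a `status` field which becomes `"completed"` when the crawl finishes:

```python
import time
import requests

# Poll the task endpoint until the job reports completion.
while True:
    task = requests.get(f"http://localhost:11235/task/{task_id}").json()
    if task.get("status") == "completed":
        break
    time.sleep(1)

print(task)  # Full task payload, including the crawl output
```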
|
280 |
+
|
281 |
+
For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://crawl4ai.com/mkdocs/basic/docker-deployment/).
|
282 |
+
|
283 |
+
</details>
|
284 |
+
|
285 |
+
|
286 |
+
## 🔬 Advanced Usage Examples 🔬
|
287 |
+
|
288 |
+
You can browse a variety of examples in the repository's [docs/examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) directory; a few popular ones are shared below.
|
289 |
+
|
290 |
+
<details>
|
291 |
+
<summary>📝 <strong>Heuristic Markdown Generation with Clean and Fit Markdown</strong></summary>
|
292 |
+
|
293 |
+
```python
|
294 |
+
import asyncio
|
295 |
+
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
296 |
+
from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter
|
297 |
+
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
298 |
+
|
299 |
+
async def main():
|
300 |
+
browser_config = BrowserConfig(
|
301 |
+
headless=True,
|
302 |
+
verbose=True,
|
303 |
+
)
|
304 |
+
run_config = CrawlerRunConfig(
|
305 |
+
cache_mode=CacheMode.ENABLED,
|
306 |
+
markdown_generator=DefaultMarkdownGenerator(
|
307 |
+
content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
|
308 |
+
),
|
309 |
+
# markdown_generator=DefaultMarkdownGenerator(
|
310 |
+
# content_filter=BM25ContentFilter(user_query="WHEN_WE_FOCUS_BASED_ON_A_USER_QUERY", bm25_threshold=1.0)
|
311 |
+
# ),
|
312 |
+
)
|
313 |
+
|
314 |
+
async with AsyncWebCrawler(config=browser_config) as crawler:
|
315 |
+
result = await crawler.arun(
|
316 |
+
url="https://docs.micronaut.io/4.7.6/guide/",
|
317 |
+
config=run_config
|
318 |
+
)
|
319 |
+
print(len(result.markdown))
|
320 |
+
print(len(result.fit_markdown))
|
321 |
+
print(len(result.markdown_v2.fit_markdown))
|
322 |
+
|
323 |
+
if __name__ == "__main__":
|
324 |
+
asyncio.run(main())
|
325 |
+
```
|
326 |
+
|
327 |
+
</details>
|
328 |
+
|
329 |
+
<details>
|
330 |
+
<summary>🖥️ <strong>Executing JavaScript & Extracting Structured Data without LLMs</strong></summary>
|
331 |
+
|
332 |
+
```python
|
333 |
+
import asyncio
|
334 |
+
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
335 |
+
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
336 |
+
import json
|
337 |
+
|
338 |
+
async def main():
|
339 |
+
schema = {
|
340 |
+
"name": "KidoCode Courses",
|
341 |
+
"baseSelector": "section.charge-methodology .w-tab-content > div",
|
342 |
+
"fields": [
|
343 |
+
{
|
344 |
+
"name": "section_title",
|
345 |
+
"selector": "h3.heading-50",
|
346 |
+
"type": "text",
|
347 |
+
},
|
348 |
+
{
|
349 |
+
"name": "section_description",
|
350 |
+
"selector": ".charge-content",
|
351 |
+
"type": "text",
|
352 |
+
},
|
353 |
+
{
|
354 |
+
"name": "course_name",
|
355 |
+
"selector": ".text-block-93",
|
356 |
+
"type": "text",
|
357 |
+
},
|
358 |
+
{
|
359 |
+
"name": "course_description",
|
360 |
+
"selector": ".course-content-text",
|
361 |
+
"type": "text",
|
362 |
+
},
|
363 |
+
{
|
364 |
+
"name": "course_icon",
|
365 |
+
"selector": ".image-92",
|
366 |
+
"type": "attribute",
|
367 |
+
"attribute": "src"
|
368 |
+
}
|
369 |
+
]
|
370 |
+
}
|
371 |
+
|
372 |
+
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
|
373 |
+
|
374 |
+
browser_config = BrowserConfig(
|
375 |
+
headless=False,
|
376 |
+
verbose=True
|
377 |
+
)
|
378 |
+
run_config = CrawlerRunConfig(
|
379 |
+
extraction_strategy=extraction_strategy,
|
380 |
+
js_code=["""(async () => {const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");for(let tab of tabs) {tab.scrollIntoView();tab.click();await new Promise(r => setTimeout(r, 500));}})();"""],
|
381 |
+
cache_mode=CacheMode.BYPASS
|
382 |
+
)
|
383 |
+
|
384 |
+
async with AsyncWebCrawler(config=browser_config) as crawler:
|
385 |
+
|
386 |
+
result = await crawler.arun(
|
387 |
+
url="https://www.kidocode.com/degrees/technology",
|
388 |
+
config=run_config
|
389 |
+
)
|
390 |
+
|
391 |
+
companies = json.loads(result.extracted_content)
|
392 |
+
print(f"Successfully extracted {len(companies)} companies")
|
393 |
+
print(json.dumps(companies[0], indent=2))
|
394 |
+
|
395 |
+
|
396 |
+
if __name__ == "__main__":
|
397 |
+
asyncio.run(main())
|
398 |
+
```
|
399 |
+
|
400 |
+
</details>
|
401 |
+
|
402 |
+
<details>
|
403 |
+
<summary>📚 <strong>Extracting Structured Data with LLMs</strong></summary>
|
404 |
+
|
405 |
+
```python
|
406 |
+
import os
|
407 |
+
import asyncio
|
408 |
+
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
409 |
+
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
410 |
+
from pydantic import BaseModel, Field
|
411 |
+
|
412 |
+
class OpenAIModelFee(BaseModel):
|
413 |
+
model_name: str = Field(..., description="Name of the OpenAI model.")
|
414 |
+
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
|
415 |
+
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
|
416 |
+
|
417 |
+
async def main():
|
418 |
+
browser_config = BrowserConfig(verbose=True)
|
419 |
+
run_config = CrawlerRunConfig(
|
420 |
+
word_count_threshold=1,
|
421 |
+
extraction_strategy=LLMExtractionStrategy(
|
422 |
+
# Here you can use any provider that Litellm library supports, for instance: ollama/qwen2
|
423 |
+
# provider="ollama/qwen2", api_token="no-token",
|
424 |
+
provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'),
|
425 |
+
schema=OpenAIModelFee.schema(),
|
426 |
+
extraction_type="schema",
|
427 |
+
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
428 |
+
Do not miss any models in the entire content. One extracted model JSON format should look like this:
|
429 |
+
{"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}."""
|
430 |
+
),
|
431 |
+
cache_mode=CacheMode.BYPASS,
|
432 |
+
)
|
433 |
+
|
434 |
+
async with AsyncWebCrawler(config=browser_config) as crawler:
|
435 |
+
result = await crawler.arun(
|
436 |
+
url='https://openai.com/api/pricing/',
|
437 |
+
config=run_config
|
438 |
+
)
|
439 |
+
print(result.extracted_content)
|
440 |
+
|
441 |
+
if __name__ == "__main__":
|
442 |
+
asyncio.run(main())
|
443 |
+
```
|
444 |
+
|
445 |
+
</details>
|
446 |
+
|
447 |
+
<details>
|
448 |
+
<summary>🤖 <strong>Using Your Own Browser with a Custom User Profile</strong></summary>
|
449 |
+
|
450 |
+
```python
|
451 |
+
import os, sys
|
452 |
+
from pathlib import Path
|
453 |
+
import asyncio, time
|
454 |
+
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
455 |
+
|
456 |
+
async def test_news_crawl():
|
457 |
+
# Create a persistent user data directory
|
458 |
+
user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile")
|
459 |
+
os.makedirs(user_data_dir, exist_ok=True)
|
460 |
+
|
461 |
+
browser_config = BrowserConfig(
|
462 |
+
verbose=True,
|
463 |
+
headless=True,
|
464 |
+
user_data_dir=user_data_dir,
|
465 |
+
use_persistent_context=True,
|
466 |
+
)
|
467 |
+
run_config = CrawlerRunConfig(
|
468 |
+
cache_mode=CacheMode.BYPASS
|
469 |
+
)
|
470 |
+
|
471 |
+
async with AsyncWebCrawler(config=browser_config) as crawler:
|
472 |
+
url = "ADDRESS_OF_A_CHALLENGING_WEBSITE"
|
473 |
+
|
474 |
+
result = await crawler.arun(
|
475 |
+
url,
|
476 |
+
config=run_config,
|
477 |
+
magic=True,
|
478 |
+
)
|
479 |
+
|
480 |
+
print(f"Successfully crawled {url}")
|
481 |
+
print(f"Content length: {len(result.markdown)}")
|
482 |
+
```
|
483 |
+
|
484 |
+
</details>
|
485 |
+
|
486 |
+
|
487 |
+
## ✨ Recent Updates
|
488 |
+
|
489 |
+
- 🔒 **Enhanced SSL & Security**: New SSL certificate handling with custom paths and validation options for secure crawling
|
490 |
+
- 🔍 **Smart Content Filtering**: Advanced filtering system with regex support and efficient chunking strategies
|
491 |
+
- 📦 **Improved JSON Extraction**: Support for complex JSONPath, JSON-CSS, and Microdata extraction
|
492 |
+
- 🏗️ **New Field Types**: Added `computed`, `conditional`, `aggregate`, and `template` field types
|
493 |
+
- ⚡ **Performance Boost**: Optimized caching, parallel processing, and memory management
|
494 |
+
- 🐛 **Better Error Handling**: Enhanced debugging capabilities with detailed error tracking
|
495 |
+
- 🔐 **Security Features**: Improved input validation and safe expression evaluation
|
496 |
+
|
497 |
+
Read the full details of this release in our [0.4.24 Release Notes](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md).
|
498 |
+
|
499 |
+
## 📖 Documentation & Roadmap
|
500 |
+
|
501 |
+
> 🚨 **Documentation Update Alert**: We're undertaking a major documentation overhaul next week to reflect recent updates and improvements. Stay tuned for a more comprehensive and up-to-date guide!
|
502 |
+
|
503 |
+
For current documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/).
|
504 |
+
|
505 |
+
To check our development plans and upcoming features, visit our [Roadmap](https://github.com/unclecode/crawl4ai/blob/main/ROADMAP.md).
|
506 |
+
|
507 |
+
<details>
|
508 |
+
<summary>📈 <strong>Development TODOs</strong></summary>
|
509 |
+
|
510 |
+
- [x] 0. Graph Crawler: Smart website traversal using graph search algorithms for comprehensive nested page extraction
|
511 |
+
- [ ] 1. Question-Based Crawler: Natural language driven web discovery and content extraction
|
512 |
+
- [ ] 2. Knowledge-Optimal Crawler: Smart crawling that maximizes knowledge while minimizing data extraction
|
513 |
+
- [ ] 3. Agentic Crawler: Autonomous system for complex multi-step crawling operations
|
514 |
+
- [ ] 4. Automated Schema Generator: Convert natural language to extraction schemas
|
515 |
+
- [ ] 5. Domain-Specific Scrapers: Pre-configured extractors for common platforms (academic, e-commerce)
|
516 |
+
- [ ] 6. Web Embedding Index: Semantic search infrastructure for crawled content
|
517 |
+
- [ ] 7. Interactive Playground: Web UI for testing, comparing strategies with AI assistance
|
518 |
+
- [ ] 8. Performance Monitor: Real-time insights into crawler operations
|
519 |
+
- [ ] 9. Cloud Integration: One-click deployment solutions across cloud providers
|
520 |
+
- [ ] 10. Sponsorship Program: Structured support system with tiered benefits
|
521 |
+
- [ ] 11. Educational Content: "How to Crawl" video series and interactive tutorials
|
522 |
+
|
523 |
+
</details>
|
524 |
+
|
525 |
+
## 🤝 Contributing
|
526 |
+
|
527 |
+
We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md) for more information.
|
528 |
+
|
529 |
+
## 📄 License
|
530 |
+
|
531 |
+
Crawl4AI is released under the [Apache 2.0 License](https://github.com/unclecode/crawl4ai/blob/main/LICENSE).
|
532 |
+
|
533 |
+
## 📧 Contact
|
534 |
+
|
535 |
+
For questions, suggestions, or feedback, feel free to reach out:
|
536 |
+
|
537 |
+
- GitHub: [unclecode](https://github.com/unclecode)
|
538 |
+
- Twitter: [@unclecode](https://twitter.com/unclecode)
|
539 |
+
- Website: [crawl4ai.com](https://crawl4ai.com)
|
540 |
+
|
541 |
+
Happy Crawling! 🕸️🚀
|
542 |
+
|
543 |
+
## 🗾 Mission
|
544 |
+
|
545 |
+
Our mission is to unlock the value of personal and enterprise data by transforming digital footprints into structured, tradeable assets. Crawl4AI empowers individuals and organizations with open-source tools to extract and structure data, fostering a shared data economy.
|
546 |
+
|
547 |
+
We envision a future where AI is powered by real human knowledge, ensuring data creators directly benefit from their contributions. By democratizing data and enabling ethical sharing, we are laying the foundation for authentic AI advancement.
|
548 |
+
|
549 |
+
<details>
|
550 |
+
<summary>🔑 <strong>Key Opportunities</strong></summary>
|
551 |
+
|
552 |
+
- **Data Capitalization**: Transform digital footprints into measurable, valuable assets.
|
553 |
+
- **Authentic AI Data**: Provide AI systems with real human insights.
|
554 |
+
- **Shared Economy**: Create a fair data marketplace that benefits data creators.
|
555 |
+
|
556 |
+
</details>
|
557 |
+
|
558 |
+
<details>
|
559 |
+
<summary>🚀 <strong>Development Pathway</strong></summary>
|
560 |
+
|
561 |
+
1. **Open-Source Tools**: Community-driven platforms for transparent data extraction.
|
562 |
+
2. **Digital Asset Structuring**: Tools to organize and value digital knowledge.
|
563 |
+
3. **Ethical Data Marketplace**: A secure, fair platform for exchanging structured data.
|
564 |
+
|
565 |
+
For more details, see our [full mission statement](./MISSION.md).
|
566 |
+
</details>
|
567 |
+
|
568 |
+
## Star History
|
569 |
+
|
570 |
+
[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date)
|
ROADMAP.md
ADDED
@@ -0,0 +1,503 @@
1 |
+
# Crawl4AI Strategic Roadmap
|
2 |
+
|
3 |
+
```mermaid
|
4 |
+
%%{init: {'themeVariables': { 'fontSize': '14px'}}}%%
|
5 |
+
graph TD
|
6 |
+
subgraph A1[Advanced Crawling Systems 🔧]
|
7 |
+
A["`
|
8 |
+
• Graph Crawler ✓
|
9 |
+
• Question-Based Crawler
|
10 |
+
• Knowledge-Optimal Crawler
|
11 |
+
• Agentic Crawler
|
12 |
+
`"]
|
13 |
+
end
|
14 |
+
|
15 |
+
subgraph A2[Specialized Features 🛠️]
|
16 |
+
B["`
|
17 |
+
• Automated Schema Generator
|
18 |
+
• Domain-Specific Scrapers
|
19 |
+
•
|
20 |
+
•
|
21 |
+
`"]
|
22 |
+
end
|
23 |
+
|
24 |
+
subgraph A3[Development Tools 🔨]
|
25 |
+
C["`
|
26 |
+
• Interactive Playground
|
27 |
+
• Performance Monitor
|
28 |
+
• Cloud Integration
|
29 |
+
•
|
30 |
+
`"]
|
31 |
+
end
|
32 |
+
|
33 |
+
subgraph A4[Community & Growth 🌱]
|
34 |
+
D["`
|
35 |
+
• Sponsorship Program
|
36 |
+
• Educational Content
|
37 |
+
•
|
38 |
+
•
|
39 |
+
`"]
|
40 |
+
end
|
41 |
+
|
42 |
+
classDef default fill:#f9f9f9,stroke:#333,stroke-width:2px
|
43 |
+
classDef section fill:#f0f0f0,stroke:#333,stroke-width:4px,rx:10
|
44 |
+
class A1,A2,A3,A4 section
|
45 |
+
|
46 |
+
%% Layout hints
|
47 |
+
A1 --> A2[" "]
|
48 |
+
A3 --> A4[" "]
|
49 |
+
linkStyle 0,1 stroke:none
|
50 |
+
```
|
51 |
+
|
52 |
+
Crawl4AI is evolving to provide more intelligent, efficient, and versatile web crawling capabilities. This roadmap outlines the key developments and features planned for the project, organized into strategic sections that build upon our current foundation.
|
53 |
+
|
54 |
+
## 1. Advanced Crawling Systems 🔧
|
55 |
+
|
56 |
+
This section introduces three powerful crawling systems that extend Crawl4AI's capabilities from basic web crawling to intelligent, purpose-driven data extraction.
|
57 |
+
|
58 |
+
### 1.1 Question-Based Crawler
|
59 |
+
The Question-Based Crawler enhances our core engine by enabling automatic discovery and extraction of relevant web content based on natural language questions.
|
60 |
+
|
61 |
+
Key Features:
|
62 |
+
- SerpApi integration for intelligent web search
|
63 |
+
- Relevancy scoring for search results
|
64 |
+
- Automatic URL discovery and prioritization
|
65 |
+
- Cross-source validation
|
66 |
+
|
67 |
+
```python
|
68 |
+
from crawl4ai import AsyncWebCrawler
|
69 |
+
from crawl4ai.discovery import QuestionBasedDiscovery
|
70 |
+
|
71 |
+
async with AsyncWebCrawler() as crawler:
|
72 |
+
discovery = QuestionBasedDiscovery(crawler)
|
73 |
+
results = await discovery.arun(
|
74 |
+
question="What are the system requirements for major cloud providers' GPU instances?",
|
75 |
+
max_urls=5,
|
76 |
+
relevance_threshold=0.7
|
77 |
+
)
|
78 |
+
|
79 |
+
for result in results:
|
80 |
+
print(f"Source: {result.url} (Relevance: {result.relevance_score})")
|
81 |
+
print(f"Content: {result.markdown}\n")
|
82 |
+
```
|
83 |
+
|
84 |
+
### 1.2 Knowledge-Optimal Crawler
|
85 |
+
An intelligent crawling system that solves the optimization problem of minimizing data extraction while maximizing knowledge acquisition for specific objectives.
|
86 |
+
|
87 |
+
Key Features:
|
88 |
+
- Smart content prioritization
|
89 |
+
- Minimal data extraction for maximum knowledge
|
90 |
+
- Probabilistic relevance assessment
|
91 |
+
- Objective-driven crawling paths
|
92 |
+
|
93 |
+
```python
|
94 |
+
from crawl4ai import AsyncWebCrawler
|
95 |
+
from crawl4ai.optimization import KnowledgeOptimizer
|
96 |
+
|
97 |
+
async with AsyncWebCrawler() as crawler:
|
98 |
+
optimizer = KnowledgeOptimizer(
|
99 |
+
objective="Understand GPU instance pricing and limitations across cloud providers",
|
100 |
+
required_knowledge=[
|
101 |
+
"pricing structure",
|
102 |
+
"GPU specifications",
|
103 |
+
"usage limits",
|
104 |
+
"availability zones"
|
105 |
+
],
|
106 |
+
confidence_threshold=0.85
|
107 |
+
)
|
108 |
+
|
109 |
+
result = await crawler.arun(
|
110 |
+
urls=[
|
111 |
+
"https://aws.amazon.com/ec2/pricing/",
|
112 |
+
"https://cloud.google.com/gpu",
|
113 |
+
"https://azure.microsoft.com/pricing/"
|
114 |
+
],
|
115 |
+
optimizer=optimizer,
|
116 |
+
optimization_mode="minimal_extraction"
|
117 |
+
)
|
118 |
+
|
119 |
+
print(f"Knowledge Coverage: {result.knowledge_coverage}")
|
120 |
+
print(f"Data Efficiency: {result.efficiency_ratio}")
|
121 |
+
print(f"Extracted Content: {result.optimal_content}")
|
122 |
+
```
|
123 |
+
|
124 |
+
### 1.3 Agentic Crawler
|
125 |
+
An autonomous system capable of understanding complex goals and automatically planning and executing multi-step crawling operations.
|
126 |
+
|
127 |
+
Key Features:
|
128 |
+
- Autonomous goal interpretation
|
129 |
+
- Dynamic step planning
|
130 |
+
- Interactive navigation capabilities
|
131 |
+
- Visual recognition and interaction
|
132 |
+
- Automatic error recovery
|
133 |
+
|
134 |
+
```python
|
135 |
+
from crawl4ai import AsyncWebCrawler
|
136 |
+
from crawl4ai.agents import CrawlerAgent
|
137 |
+
|
138 |
+
async with AsyncWebCrawler() as crawler:
|
139 |
+
agent = CrawlerAgent(crawler)
|
140 |
+
|
141 |
+
# Automatic planning and execution
|
142 |
+
result = await agent.arun(
|
143 |
+
goal="Find research papers about quantum computing published in 2023 with more than 50 citations",
|
144 |
+
auto_retry=True
|
145 |
+
)
|
146 |
+
print("Generated Plan:", result.executed_steps)
|
147 |
+
print("Extracted Data:", result.data)
|
148 |
+
|
149 |
+
# Using custom steps with automatic execution
|
150 |
+
result = await agent.arun(
|
151 |
+
goal="Extract conference deadlines from ML conferences",
|
152 |
+
custom_plan=[
|
153 |
+
"Navigate to conference page",
|
154 |
+
"Find important dates section",
|
155 |
+
"Extract submission deadlines",
|
156 |
+
"Verify dates are for 2024"
|
157 |
+
]
|
158 |
+
)
|
159 |
+
|
160 |
+
# Monitoring execution
|
161 |
+
print("Step Completion:", result.step_status)
|
162 |
+
print("Execution Time:", result.execution_time)
|
163 |
+
print("Success Rate:", result.success_rate)
|
164 |
+
```
|
165 |
+
|
166 |
+
## 2. Specialized Features 🛠️
|
167 |
+
|
168 |
+
This section introduces specialized tools and features that enhance Crawl4AI's capabilities for specific use cases and data extraction needs.
|
169 |
+
|
170 |
+
### 2.1 Automated Schema Generator
|
171 |
+
A system that automatically generates JsonCssExtractionStrategy schemas from natural language descriptions, making structured data extraction accessible to all users.
|
172 |
+
|
173 |
+
Key Features:
|
174 |
+
- Natural language schema generation
|
175 |
+
- Automatic pattern detection
|
176 |
+
- Predefined schema templates
|
177 |
+
- Chrome extension for visual schema building
|
178 |
+
|
179 |
+
```python
|
180 |
+
from crawl4ai import AsyncWebCrawler
|
181 |
+
from crawl4ai.schema import SchemaGenerator
|
182 |
+
|
183 |
+
# Generate schema from natural language description
|
184 |
+
generator = SchemaGenerator()
|
185 |
+
schema = await generator.generate(
|
186 |
+
url="https://news-website.com",
|
187 |
+
description="For each news article on the page, I need the headline, publication date, and main image"
|
188 |
+
)
|
189 |
+
|
190 |
+
# Use generated schema with crawler
|
191 |
+
async with AsyncWebCrawler() as crawler:
|
192 |
+
result = await crawler.arun(
|
193 |
+
url="https://news-website.com",
|
194 |
+
extraction_strategy=schema
|
195 |
+
)
|
196 |
+
|
197 |
+
# Example of generated schema:
|
198 |
+
"""
|
199 |
+
{
|
200 |
+
"name": "News Article Extractor",
|
201 |
+
"baseSelector": "article.news-item",
|
202 |
+
"fields": [
|
203 |
+
{
|
204 |
+
"name": "headline",
|
205 |
+
"selector": "h2.article-title",
|
206 |
+
"type": "text"
|
207 |
+
},
|
208 |
+
{
|
209 |
+
"name": "date",
|
210 |
+
"selector": "span.publish-date",
|
211 |
+
"type": "text"
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"name": "image",
|
215 |
+
"selector": "img.article-image",
|
216 |
+
"type": "attribute",
|
217 |
+
"attribute": "src"
|
218 |
+
}
|
219 |
+
]
|
220 |
+
}
|
221 |
+
"""
|
222 |
+
```
|
223 |
+
|
224 |
+
### 2.2 Domain Specific Scrapers
|
225 |
+
Specialized extraction strategies optimized for common website types and platforms, providing consistent and reliable data extraction without additional configuration.
|
226 |
+
|
227 |
+
Key Features:
|
228 |
+
- Pre-configured extractors for popular platforms
|
229 |
+
- Academic site specialization (arXiv, NCBI)
|
230 |
+
- E-commerce standardization
|
231 |
+
- Documentation site handling
|
232 |
+
|
233 |
+
```python
|
234 |
+
from crawl4ai import AsyncWebCrawler
|
235 |
+
from crawl4ai.extractors import AcademicExtractor, EcommerceExtractor
|
236 |
+
|
237 |
+
async with AsyncWebCrawler() as crawler:
|
238 |
+
# Academic paper extraction
|
239 |
+
papers = await crawler.arun(
|
240 |
+
url="https://arxiv.org/list/cs.AI/recent",
|
241 |
+
extractor="academic", # Built-in extractor type
|
242 |
+
site_type="arxiv", # Specific site optimization
|
243 |
+
extract_fields=[
|
244 |
+
"title",
|
245 |
+
"authors",
|
246 |
+
"abstract",
|
247 |
+
"citations"
|
248 |
+
]
|
249 |
+
)
|
250 |
+
|
251 |
+
# E-commerce product data
|
252 |
+
products = await crawler.arun(
|
253 |
+
url="https://store.example.com/products",
|
254 |
+
extractor="ecommerce",
|
255 |
+
extract_fields=[
|
256 |
+
"name",
|
257 |
+
"price",
|
258 |
+
"availability",
|
259 |
+
"reviews"
|
260 |
+
]
|
261 |
+
)
|
262 |
+
```
|
263 |
+
|
264 |
+
### 2.3 Web Embedding Index
|
265 |
+
Creates and maintains a semantic search infrastructure for crawled content, enabling efficient retrieval and querying of web content through vector embeddings.
|
266 |
+
|
267 |
+
Key Features:
|
268 |
+
- Automatic embedding generation
|
269 |
+
- Intelligent content chunking
|
270 |
+
- Efficient vector storage and indexing
|
271 |
+
- Semantic search capabilities
|
272 |
+
|
273 |
+
```python
|
274 |
+
from crawl4ai import AsyncWebCrawler
|
275 |
+
from crawl4ai.indexing import WebIndex
|
276 |
+
|
277 |
+
# Initialize and build index
|
278 |
+
index = WebIndex(model="efficient-mini")
|
279 |
+
|
280 |
+
async with AsyncWebCrawler() as crawler:
|
281 |
+
# Crawl and index content
|
282 |
+
await index.build(
|
283 |
+
urls=["https://docs.example.com"],
|
284 |
+
crawler=crawler,
|
285 |
+
options={
|
286 |
+
"chunk_method": "semantic",
|
287 |
+
"update_policy": "incremental",
|
288 |
+
"embedding_batch_size": 100
|
289 |
+
}
|
290 |
+
)
|
291 |
+
|
292 |
+
# Search through indexed content
|
293 |
+
results = await index.search(
|
294 |
+
query="How to implement OAuth authentication?",
|
295 |
+
filters={
|
296 |
+
"content_type": "technical",
|
297 |
+
"recency": "6months"
|
298 |
+
},
|
299 |
+
top_k=5
|
300 |
+
)
|
301 |
+
|
302 |
+
# Get similar content
|
303 |
+
similar = await index.find_similar(
|
304 |
+
url="https://docs.example.com/auth/oauth",
|
305 |
+
threshold=0.85
|
306 |
+
)
|
307 |
+
```
|
308 |
+
|
309 |
+
Each of these specialized features builds upon Crawl4AI's core functionality while providing targeted solutions for specific use cases. They can be used independently or combined for more complex data extraction and processing needs.
|
310 |
+
|
311 |
+
## 3. Development Tools 🔨
|
312 |
+
|
313 |
+
This section covers tools designed to enhance the development experience, monitoring, and deployment of Crawl4AI applications.
|
314 |
+
|
315 |
+
### 3.1 Crawl4AI Playground 🎮
|
316 |
+
|
317 |
+
The Crawl4AI Playground is an interactive web-based development environment that simplifies web scraping experimentation, development, and deployment. With its intuitive interface and AI-powered assistance, users can quickly prototype, test, and deploy web scraping solutions.
|
318 |
+
|
319 |
+
#### Key Features 🌟
|
320 |
+
|
321 |
+
##### Visual Strategy Builder
|
322 |
+
- Interactive point-and-click interface for building extraction strategies
|
323 |
+
- Real-time preview of selected elements
|
324 |
+
- Side-by-side comparison of different extraction approaches
|
325 |
+
- Visual validation of CSS selectors and XPath queries
|
326 |
+
|
327 |
+
##### AI Assistant Integration
|
328 |
+
- Strategy recommendations based on target website analysis
|
329 |
+
- Parameter optimization suggestions
|
330 |
+
- Best practices guidance for specific use cases
|
331 |
+
- Automated error detection and resolution
|
332 |
+
- Performance optimization tips
|
333 |
+
|
334 |
+
##### Real-Time Testing & Validation
|
335 |
+
- Live preview of extraction results
|
336 |
+
- Side-by-side comparison of multiple strategies
|
337 |
+
- Performance metrics visualization
|
338 |
+
- Automatic validation of extracted data
|
339 |
+
- Error detection and debugging tools
|
340 |
+
|
341 |
+
##### Project Management
|
342 |
+
- Save and organize multiple scraping projects
|
343 |
+
- Version control for configurations
|
344 |
+
- Export/import project settings
|
345 |
+
- Share configurations with team members
|
346 |
+
- Project templates for common use cases
|
347 |
+
|
348 |
+
##### Deployment Pipeline
|
349 |
+
- One-click deployment to various environments
|
350 |
+
- Docker container generation
|
351 |
+
- Cloud deployment templates (AWS, GCP, Azure)
|
352 |
+
- Scaling configuration management
|
353 |
+
- Monitoring setup automation
|
354 |
+
|
355 |
+
|
356 |
+
### 3.2 Performance Monitoring System
|
357 |
+
A comprehensive monitoring solution providing real-time insights into crawler operations, resource usage, and system health through both CLI and GUI interfaces.
|
358 |
+
|
359 |
+
Key Features:
|
360 |
+
- Real-time resource tracking
|
361 |
+
- Active crawl monitoring
|
362 |
+
- Performance statistics
|
363 |
+
- Customizable alerting system
|
364 |
+
|
365 |
+
```python
|
366 |
+
from crawl4ai import AsyncWebCrawler
|
367 |
+
from crawl4ai.monitor import CrawlMonitor
|
368 |
+
|
369 |
+
# Initialize monitoring
|
370 |
+
monitor = CrawlMonitor()
|
371 |
+
|
372 |
+
# Start monitoring with CLI interface
|
373 |
+
await monitor.start(
|
374 |
+
mode="cli", # or "gui"
|
375 |
+
refresh_rate="1s",
|
376 |
+
metrics={
|
377 |
+
"resources": ["cpu", "memory", "network"],
|
378 |
+
"crawls": ["active", "queued", "completed"],
|
379 |
+
"performance": ["success_rate", "response_times"]
|
380 |
+
}
|
381 |
+
)
|
382 |
+
|
383 |
+
# Example CLI output:
|
384 |
+
"""
|
385 |
+
Crawl4AI Monitor (Live) - Press Q to exit
|
386 |
+
────────────────────────────────────────
|
387 |
+
System Usage:
|
388 |
+
├─ CPU: ███████░░░ 70%
|
389 |
+
└─ Memory: ████░░░░░ 2.1GB/8GB
|
390 |
+
|
391 |
+
Active Crawls:
|
392 |
+
ID URL Status Progress
|
393 |
+
001 docs.example.com 🟢 Active 75%
|
394 |
+
002 api.service.com 🟡 Queue -
|
395 |
+
|
396 |
+
Metrics (Last 5min):
|
397 |
+
├─ Success Rate: 98%
|
398 |
+
├─ Avg Response: 0.6s
|
399 |
+
└─ Pages/sec: 8.5
|
400 |
+
"""
|
401 |
+
```
|
402 |
+
|
403 |
+
### 3.3 Cloud Integration
|
404 |
+
Streamlined deployment tools for setting up Crawl4AI in various cloud environments, with support for scaling and monitoring.
|
405 |
+
|
406 |
+
Key Features:
|
407 |
+
- One-click deployment solutions
|
408 |
+
- Auto-scaling configuration
|
409 |
+
- Load balancing setup
|
410 |
+
- Cloud-specific optimizations
|
411 |
+
- Monitoring integration
|
412 |
+
|
413 |
+
```python
|
414 |
+
from crawl4ai import AsyncWebCrawler
|
415 |
+
from crawl4ai.deploy import CloudDeployer
|
416 |
+
|
417 |
+
# Initialize deployer
|
418 |
+
deployer = CloudDeployer()
|
419 |
+
|
420 |
+
# Deploy crawler service
|
421 |
+
deployment = await deployer.deploy(
|
422 |
+
service_name="crawler-cluster",
|
423 |
+
platform="aws", # or "gcp", "azure"
|
424 |
+
config={
|
425 |
+
"instance_type": "compute-optimized",
|
426 |
+
"auto_scaling": {
|
427 |
+
"min_instances": 2,
|
428 |
+
"max_instances": 10,
|
429 |
+
"scale_based_on": "cpu_usage"
|
430 |
+
},
|
431 |
+
"region": "us-east-1",
|
432 |
+
"monitoring": True
|
433 |
+
}
|
434 |
+
)
|
435 |
+
|
436 |
+
# Get deployment status and endpoints
|
437 |
+
print(f"Service Status: {deployment.status}")
|
438 |
+
print(f"API Endpoint: {deployment.endpoint}")
|
439 |
+
print(f"Monitor URL: {deployment.monitor_url}")
|
440 |
+
```
|
441 |
+
|
442 |
+
These development tools work together to provide a comprehensive environment for developing, testing, monitoring, and deploying Crawl4AI applications. The Playground helps users experiment and generate optimal configurations, the Performance Monitor ensures smooth operation, and the Cloud Integration tools simplify deployment and scaling.
|
443 |
+
|
444 |
+
## 4. Community & Growth 🌱
|
445 |
+
|
446 |
+
This section outlines initiatives designed to build and support the Crawl4AI community, provide educational resources, and ensure sustainable project growth.
|
447 |
+
|
448 |
+
### 4.1 Sponsorship Program
|
449 |
+
A structured program to support ongoing development and maintenance of Crawl4AI while providing valuable benefits to sponsors.
|
450 |
+
|
451 |
+
Key Features:
|
452 |
+
- Multiple sponsorship tiers
|
453 |
+
- Sponsor recognition system
|
454 |
+
- Priority support for sponsors
|
455 |
+
- Early access to new features
|
456 |
+
- Custom feature development opportunities
|
457 |
+
|
458 |
+
Program Structure (not yet finalized):
|
459 |
+
```
|
460 |
+
Sponsorship Tiers:
|
461 |
+
|
462 |
+
🥉 Bronze Supporter
|
463 |
+
- GitHub Sponsor badge
|
464 |
+
- Priority issue response
|
465 |
+
- Community Discord role
|
466 |
+
|
467 |
+
🥈 Silver Supporter
|
468 |
+
- All Bronze benefits
|
469 |
+
- Technical support channel
|
470 |
+
- Vote on roadmap priorities
|
471 |
+
- Early access to beta features
|
472 |
+
|
473 |
+
🥇 Gold Supporter
|
474 |
+
- All Silver benefits
|
475 |
+
- Custom feature requests
|
476 |
+
- Direct developer access
|
477 |
+
- Private support sessions
|
478 |
+
|
479 |
+
💎 Diamond Partner
|
480 |
+
- All Gold benefits
|
481 |
+
- Custom development
|
482 |
+
- On-demand consulting
|
483 |
+
- Integration support
|
484 |
+
```
|
485 |
+
|
486 |
+
### 4.2 "How to Crawl" Video Series
|
487 |
+
A comprehensive educational resource teaching users how to effectively use Crawl4AI for various web scraping and data extraction scenarios.
|
488 |
+
|
489 |
+
Key Features:
|
490 |
+
- Step-by-step tutorials
|
491 |
+
- Real-world use cases
|
492 |
+
- Best practices
|
493 |
+
- Integration guides
|
494 |
+
- Advanced feature deep-dives
|
495 |
+
|
496 |
+
These community initiatives are designed to:
|
497 |
+
- Provide comprehensive learning resources
|
498 |
+
- Foster a supportive user community
|
499 |
+
- Ensure sustainable project development
|
500 |
+
- Share knowledge and best practices
|
501 |
+
- Create opportunities for collaboration
|
502 |
+
|
503 |
+
The combination of structured support through sponsorship, educational content through video series, and interactive learning through the playground creates a robust ecosystem for both new and experienced users of Crawl4AI.
|
crawl4ai/__init__.py
ADDED
@@ -0,0 +1,46 @@
1 |
+
# __init__.py
|
2 |
+
|
3 |
+
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
4 |
+
from .async_configs import BrowserConfig, CrawlerRunConfig
|
5 |
+
from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy
|
6 |
+
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
7 |
+
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
8 |
+
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter
|
9 |
+
from .models import CrawlResult
|
10 |
+
from .__version__ import __version__
|
11 |
+
|
12 |
+
__all__ = [
|
13 |
+
"AsyncWebCrawler",
|
14 |
+
"CrawlResult",
|
15 |
+
"CacheMode",
|
16 |
+
'BrowserConfig',
|
17 |
+
'CrawlerRunConfig',
|
18 |
+
'ExtractionStrategy',
|
19 |
+
'LLMExtractionStrategy',
|
20 |
+
'CosineStrategy',
|
21 |
+
'JsonCssExtractionStrategy',
|
22 |
+
'ChunkingStrategy',
|
23 |
+
'RegexChunking',
|
24 |
+
'DefaultMarkdownGenerator',
|
25 |
+
'PruningContentFilter',
|
26 |
+
'BM25ContentFilter',
|
27 |
+
]
|
28 |
+
|
29 |
+
def is_sync_version_installed():
|
30 |
+
try:
|
31 |
+
import selenium
|
32 |
+
return True
|
33 |
+
except ImportError:
|
34 |
+
return False
|
35 |
+
|
36 |
+
if is_sync_version_installed():
|
37 |
+
try:
|
38 |
+
from .web_crawler import WebCrawler
|
39 |
+
__all__.append("WebCrawler")
|
40 |
+
except ImportError:
|
41 |
+
import warnings
|
42 |
+
print("Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies.")
|
43 |
+
else:
|
44 |
+
WebCrawler = None
|
45 |
+
# import warnings
|
46 |
+
# print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
|
crawl4ai/__version__.py
ADDED
@@ -0,0 +1,2 @@
1 |
+
# crawl4ai/_version.py
|
2 |
+
__version__ = "0.4.247"
|
crawl4ai/async_configs.py
ADDED
@@ -0,0 +1,603 @@
1 |
+
from .config import (
|
2 |
+
MIN_WORD_THRESHOLD,
|
3 |
+
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
4 |
+
SCREENSHOT_HEIGHT_TRESHOLD,
|
5 |
+
PAGE_TIMEOUT,
|
6 |
+
IMAGE_SCORE_THRESHOLD,
|
7 |
+
SOCIAL_MEDIA_DOMAINS,
|
8 |
+
|
9 |
+
)
|
10 |
+
from .user_agent_generator import UserAgentGenerator
|
11 |
+
from .extraction_strategy import ExtractionStrategy
|
12 |
+
from .chunking_strategy import ChunkingStrategy
|
13 |
+
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
14 |
+
from typing import Union, List
|
15 |
+
|
16 |
+
|
17 |
+
class BrowserConfig:
|
18 |
+
"""
|
19 |
+
Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
|
20 |
+
|
21 |
+
This class centralizes all parameters that affect browser and context creation. Instead of passing
|
22 |
+
scattered keyword arguments, users can instantiate and modify this configuration object. The crawler
|
23 |
+
code will then reference these settings to initialize the browser in a consistent, documented manner.
|
24 |
+
|
25 |
+
Attributes:
|
26 |
+
browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
|
27 |
+
Default: "chromium".
|
28 |
+
headless (bool): Whether to run the browser in headless mode (no visible GUI).
|
29 |
+
Default: True.
|
30 |
+
use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
|
31 |
+
advanced manipulation. Default: False.
|
32 |
+
debugging_port (int): Port for the browser debugging protocol. Default: 9222.
|
33 |
+
use_persistent_context (bool): Use a persistent browser context (like a persistent profile).
|
34 |
+
Automatically sets use_managed_browser=True. Default: False.
|
35 |
+
user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
|
36 |
+
temporary directory may be used. Default: None.
|
37 |
+
chrome_channel (str): The Chrome channel to launch (e.g., "chrome", "msedge"). Only applies if browser_type
|
38 |
+
is "chromium". Default: "chromium".
|
39 |
+
channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if browser_type
|
40 |
+
is "chromium". Default: "chromium".
|
41 |
+
proxy (str or None): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used.
|
42 |
+
Default: None.
|
43 |
+
proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
44 |
+
If None, no additional proxy config. Default: None.
|
45 |
+
viewport_width (int): Default viewport width for pages. Default: 1080.
|
46 |
+
viewport_height (int): Default viewport height for pages. Default: 600.
|
47 |
+
verbose (bool): Enable verbose logging.
|
48 |
+
Default: True.
|
49 |
+
accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path.
|
50 |
+
Default: False.
|
51 |
+
downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True,
|
52 |
+
a default path will be created. Default: None.
|
53 |
+
storage_state (str or dict or None): Path or object describing storage state (cookies, localStorage).
|
54 |
+
Default: None.
|
55 |
+
ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True.
|
56 |
+
java_script_enabled (bool): Enable JavaScript execution in pages. Default: True.
|
57 |
+
cookies (list): List of cookies to add to the browser context. Each cookie is a dict with fields like
|
58 |
+
{"name": "...", "value": "...", "url": "..."}.
|
59 |
+
Default: [].
|
60 |
+
headers (dict): Extra HTTP headers to apply to all requests in this context.
|
61 |
+
Default: {}.
|
62 |
+
user_agent (str): Custom User-Agent string to use. Default: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
63 |
+
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36".
|
64 |
+
user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided
|
65 |
+
user_agent as-is. Default: None.
|
66 |
+
user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
|
67 |
+
Default: None.
|
68 |
+
text_mode (bool): If True, disables images and other rich content for potentially faster load times.
|
69 |
+
Default: False.
|
70 |
+
light_mode (bool): Disables certain background features for performance gains. Default: False.
|
71 |
+
extra_args (list): Additional command-line arguments passed to the browser.
|
72 |
+
Default: [].
|
73 |
+
"""
|
74 |
+
|
75 |
+
def __init__(
|
76 |
+
self,
|
77 |
+
browser_type: str = "chromium",
|
78 |
+
headless: bool = True,
|
79 |
+
use_managed_browser: bool = False,
|
80 |
+
use_persistent_context: bool = False,
|
81 |
+
user_data_dir: str = None,
|
82 |
+
chrome_channel: str = "chromium",
|
83 |
+
channel: str = "chromium",
|
84 |
+
proxy: str = None,
|
85 |
+
proxy_config: dict = None,
|
86 |
+
viewport_width: int = 1080,
|
87 |
+
viewport_height: int = 600,
|
88 |
+
accept_downloads: bool = False,
|
89 |
+
downloads_path: str = None,
|
90 |
+
storage_state=None,
|
91 |
+
ignore_https_errors: bool = True,
|
92 |
+
java_script_enabled: bool = True,
|
93 |
+
sleep_on_close: bool = False,
|
94 |
+
verbose: bool = True,
|
95 |
+
cookies: list = None,
|
96 |
+
headers: dict = None,
|
97 |
+
user_agent: str = (
|
98 |
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 "
|
99 |
+
"(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
|
100 |
+
),
|
101 |
+
user_agent_mode: str = None,
|
102 |
+
user_agent_generator_config: dict = None,
|
103 |
+
text_mode: bool = False,
|
104 |
+
light_mode: bool = False,
|
105 |
+
extra_args: list = None,
|
106 |
+
debugging_port : int = 9222,
|
107 |
+
):
|
108 |
+
self.browser_type = browser_type
|
109 |
+
self.headless = headless
|
110 |
+
self.use_managed_browser = use_managed_browser
|
111 |
+
self.use_persistent_context = use_persistent_context
|
112 |
+
self.user_data_dir = user_data_dir
|
113 |
+
self.chrome_channel = chrome_channel or self.browser_type or "chromium"
|
114 |
+
self.channel = channel or self.browser_type or "chromium"
|
115 |
+
self.proxy = proxy
|
116 |
+
self.proxy_config = proxy_config
|
117 |
+
self.viewport_width = viewport_width
|
118 |
+
self.viewport_height = viewport_height
|
119 |
+
self.accept_downloads = accept_downloads
|
120 |
+
self.downloads_path = downloads_path
|
121 |
+
self.storage_state = storage_state
|
122 |
+
self.ignore_https_errors = ignore_https_errors
|
123 |
+
self.java_script_enabled = java_script_enabled
|
124 |
+
self.cookies = cookies if cookies is not None else []
|
125 |
+
self.headers = headers if headers is not None else {}
|
126 |
+
self.user_agent = user_agent
|
127 |
+
self.user_agent_mode = user_agent_mode
|
128 |
+
self.user_agent_generator_config = user_agent_generator_config
|
129 |
+
self.text_mode = text_mode
|
130 |
+
self.light_mode = light_mode
|
131 |
+
self.extra_args = extra_args if extra_args is not None else []
|
132 |
+
self.sleep_on_close = sleep_on_close
|
133 |
+
self.verbose = verbose
|
134 |
+
self.debugging_port = debugging_port
|
135 |
+
|
136 |
+
user_agent_generator = UserAgentGenerator()
|
137 |
+
if self.user_agent_mode != "random" and self.user_agent_generator_config:
|
138 |
+
self.user_agent = user_agent_generator.generate(
|
139 |
+
**(self.user_agent_generator_config or {})
|
140 |
+
)
|
141 |
+
elif self.user_agent_mode == "random":
|
142 |
+
self.user_agent = user_agent_generator.generate()
|
143 |
+
else:
|
144 |
+
pass
|
145 |
+
|
146 |
+
self.browser_hint = user_agent_generator.generate_client_hints(self.user_agent)
|
147 |
+
self.headers.setdefault("sec-ch-ua", self.browser_hint)
|
148 |
+
|
149 |
+
# If persistent context is requested, ensure managed browser is enabled
|
150 |
+
if self.use_persistent_context:
|
151 |
+
self.use_managed_browser = True
|
152 |
+
|
153 |
+
@staticmethod
|
154 |
+
def from_kwargs(kwargs: dict) -> "BrowserConfig":
|
155 |
+
return BrowserConfig(
|
156 |
+
browser_type=kwargs.get("browser_type", "chromium"),
|
157 |
+
headless=kwargs.get("headless", True),
|
158 |
+
use_managed_browser=kwargs.get("use_managed_browser", False),
|
159 |
+
use_persistent_context=kwargs.get("use_persistent_context", False),
|
160 |
+
user_data_dir=kwargs.get("user_data_dir"),
|
161 |
+
chrome_channel=kwargs.get("chrome_channel", "chromium"),
|
162 |
+
channel=kwargs.get("channel", "chromium"),
|
163 |
+
proxy=kwargs.get("proxy"),
|
164 |
+
proxy_config=kwargs.get("proxy_config"),
|
165 |
+
viewport_width=kwargs.get("viewport_width", 1080),
|
166 |
+
viewport_height=kwargs.get("viewport_height", 600),
|
167 |
+
accept_downloads=kwargs.get("accept_downloads", False),
|
168 |
+
downloads_path=kwargs.get("downloads_path"),
|
169 |
+
storage_state=kwargs.get("storage_state"),
|
170 |
+
ignore_https_errors=kwargs.get("ignore_https_errors", True),
|
171 |
+
java_script_enabled=kwargs.get("java_script_enabled", True),
|
172 |
+
cookies=kwargs.get("cookies", []),
|
173 |
+
headers=kwargs.get("headers", {}),
|
174 |
+
user_agent=kwargs.get(
|
175 |
+
"user_agent",
|
176 |
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
177 |
+
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
|
178 |
+
),
|
179 |
+
user_agent_mode=kwargs.get("user_agent_mode"),
|
180 |
+
user_agent_generator_config=kwargs.get("user_agent_generator_config"),
|
181 |
+
text_mode=kwargs.get("text_mode", False),
|
182 |
+
light_mode=kwargs.get("light_mode", False),
|
183 |
+
extra_args=kwargs.get("extra_args", []),
|
184 |
+
)
|
185 |
+
|
186 |
+
|
187 |
+
class CrawlerRunConfig:
    """
    Configuration class for controlling how the crawler runs each crawl operation.
    This includes parameters for content extraction, page manipulation, waiting conditions,
    caching, and other runtime behaviors.

    This centralizes parameters that were previously scattered as kwargs to `arun()` and related methods.
    By using this class, you have a single place to understand and adjust the crawling options.

    Attributes:
        # Content Processing Parameters
        word_count_threshold (int): Minimum word count threshold before processing content.
            Default: MIN_WORD_THRESHOLD (typically 200).
        extraction_strategy (ExtractionStrategy or None): Strategy to extract structured data from crawled pages.
            Default: None (NoExtractionStrategy is used if None).
        chunking_strategy (ChunkingStrategy): Strategy to chunk content before extraction.
            Default: RegexChunking().
        markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown.
            Default: None.
        content_filter (RelevantContentFilter or None): Optional filter to prune irrelevant content.
            Default: None.
        only_text (bool): If True, attempt to extract text-only content where applicable.
            Default: False.
        css_selector (str or None): CSS selector to extract a specific portion of the page.
            Default: None.
        excluded_tags (list of str or None): List of HTML tags to exclude from processing.
            Default: None.
        excluded_selector (str or None): CSS selector to exclude from processing.
            Default: None.
        keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes.
            Default: False.
        remove_forms (bool): If True, remove all `<form>` elements from the HTML.
            Default: False.
        prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
            Default: False.
        parser_type (str): Type of parser to use for HTML parsing.
            Default: "lxml".

        # Caching Parameters
        cache_mode (CacheMode or None): Defines how caching is handled.
            If None, defaults to CacheMode.ENABLED internally.
            Default: None.
        session_id (str or None): Optional session ID to persist the browser context and the created
            page instance. If the ID already exists, the crawler does not
            create a new page and uses the current page to preserve the state.
        bypass_cache (bool): Legacy parameter, if True acts like CacheMode.BYPASS.
            Default: False.
        disable_cache (bool): Legacy parameter, if True acts like CacheMode.DISABLED.
            Default: False.
        no_cache_read (bool): Legacy parameter, if True acts like CacheMode.WRITE_ONLY.
            Default: False.
        no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY.
            Default: False.

        # Page Navigation and Timing Parameters
        wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded".
            Default: "domcontentloaded".
        page_timeout (int): Timeout in ms for page operations like navigation.
            Default: 60000 (60 seconds).
        wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
            Default: None.
        wait_for_images (bool): If True, wait for images to load before extracting content.
            Default: False.
        delay_before_return_html (float): Delay in seconds before retrieving final HTML.
            Default: 0.1.
        mean_delay (float): Mean base delay between requests when calling arun_many.
            Default: 0.1.
        max_range (float): Max random additional delay range for requests in arun_many.
            Default: 0.3.
        semaphore_count (int): Number of concurrent operations allowed.
            Default: 5.

        # Page Interaction Parameters
        js_code (str or list of str or None): JavaScript code/snippets to run on the page.
            Default: None.
        js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads.
            Default: False.
        ignore_body_visibility (bool): If True, ignore whether the body is visible before proceeding.
            Default: True.
        scan_full_page (bool): If True, scroll through the entire page to load all content.
            Default: False.
        scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True.
            Default: 0.2.
        process_iframes (bool): If True, attempts to process and inline iframe content.
            Default: False.
        remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
            Default: False.
        simulate_user (bool): If True, simulate user interactions (mouse moves, clicks) for anti-bot measures.
            Default: False.
        override_navigator (bool): If True, overrides navigator properties for more human-like behavior.
            Default: False.
        magic (bool): If True, attempts automatic handling of overlays/popups.
            Default: False.
        adjust_viewport_to_content (bool): If True, adjust viewport according to the page content dimensions.
            Default: False.

        # Media Handling Parameters
        screenshot (bool): Whether to take a screenshot after crawling.
            Default: False.
        screenshot_wait_for (float or None): Additional wait time before taking a screenshot.
            Default: None.
        screenshot_height_threshold (int): Threshold for page height to decide screenshot strategy.
            Default: SCREENSHOT_HEIGHT_TRESHOLD (from config, e.g. 20000).
        pdf (bool): Whether to generate a PDF of the page.
            Default: False.
        image_description_min_word_threshold (int): Minimum words for image description extraction.
            Default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD (e.g., 50).
        image_score_threshold (int): Minimum score threshold for processing an image.
            Default: IMAGE_SCORE_THRESHOLD (e.g., 3).
        exclude_external_images (bool): If True, exclude all external images from processing.
            Default: False.

        # Link and Domain Handling Parameters
        exclude_social_media_domains (list of str): List of domains to exclude for social media links.
            Default: SOCIAL_MEDIA_DOMAINS (from config).
        exclude_external_links (bool): If True, exclude all external links from the results.
            Default: False.
        exclude_social_media_links (bool): If True, exclude links pointing to social media domains.
            Default: False.
        exclude_domains (list of str): List of specific domains to exclude from results.
            Default: [].

        # Debugging and Logging Parameters
        verbose (bool): Enable verbose logging.
            Default: True.
        log_console (bool): If True, log console messages from the page.
            Default: False.
    """

    def __init__(
        self,
        # Content Processing Parameters
        word_count_threshold: int = MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = None,
        markdown_generator: MarkdownGenerationStrategy = None,
        content_filter=None,
        only_text: bool = False,
        css_selector: str = None,
        excluded_tags: list = None,
        excluded_selector: str = None,
        keep_data_attributes: bool = False,
        remove_forms: bool = False,
        prettiify: bool = False,
        parser_type: str = "lxml",

        # SSL Parameters
        fetch_ssl_certificate: bool = False,

        # Caching Parameters
        cache_mode=None,
        session_id: str = None,
        bypass_cache: bool = False,
        disable_cache: bool = False,
        no_cache_read: bool = False,
        no_cache_write: bool = False,

        # Page Navigation and Timing Parameters
        wait_until: str = "domcontentloaded",
        page_timeout: int = PAGE_TIMEOUT,
        wait_for: str = None,
        wait_for_images: bool = False,
        delay_before_return_html: float = 0.1,
        mean_delay: float = 0.1,
        max_range: float = 0.3,
        semaphore_count: int = 5,

        # Page Interaction Parameters
        js_code: Union[str, List[str]] = None,
        js_only: bool = False,
        ignore_body_visibility: bool = True,
        scan_full_page: bool = False,
        scroll_delay: float = 0.2,
        process_iframes: bool = False,
        remove_overlay_elements: bool = False,
        simulate_user: bool = False,
        override_navigator: bool = False,
        magic: bool = False,
        adjust_viewport_to_content: bool = False,

        # Media Handling Parameters
        screenshot: bool = False,
        screenshot_wait_for: float = None,
        screenshot_height_threshold: int = SCREENSHOT_HEIGHT_TRESHOLD,
        pdf: bool = False,
        image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
        image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
        exclude_external_images: bool = False,

        # Link and Domain Handling Parameters
        exclude_social_media_domains: list = None,
        exclude_external_links: bool = False,
        exclude_social_media_links: bool = False,
        exclude_domains: list = None,

        # Debugging and Logging Parameters
        verbose: bool = True,
        log_console: bool = False,

        url: str = None,
    ):
        self.url = url

        # Content Processing Parameters
        self.word_count_threshold = word_count_threshold
        self.extraction_strategy = extraction_strategy
        self.chunking_strategy = chunking_strategy
        self.markdown_generator = markdown_generator
        self.content_filter = content_filter
        self.only_text = only_text
        self.css_selector = css_selector
        self.excluded_tags = excluded_tags or []
        self.excluded_selector = excluded_selector or ""
        self.keep_data_attributes = keep_data_attributes
        self.remove_forms = remove_forms
        self.prettiify = prettiify
        self.parser_type = parser_type

        # SSL Parameters
        self.fetch_ssl_certificate = fetch_ssl_certificate

        # Caching Parameters
        self.cache_mode = cache_mode
        self.session_id = session_id
        self.bypass_cache = bypass_cache
        self.disable_cache = disable_cache
        self.no_cache_read = no_cache_read
        self.no_cache_write = no_cache_write

        # Page Navigation and Timing Parameters
        self.wait_until = wait_until
        self.page_timeout = page_timeout
        self.wait_for = wait_for
        self.wait_for_images = wait_for_images
        self.delay_before_return_html = delay_before_return_html
        self.mean_delay = mean_delay
        self.max_range = max_range
        self.semaphore_count = semaphore_count

        # Page Interaction Parameters
        self.js_code = js_code
        self.js_only = js_only
        self.ignore_body_visibility = ignore_body_visibility
        self.scan_full_page = scan_full_page
        self.scroll_delay = scroll_delay
        self.process_iframes = process_iframes
        self.remove_overlay_elements = remove_overlay_elements
        self.simulate_user = simulate_user
        self.override_navigator = override_navigator
        self.magic = magic
        self.adjust_viewport_to_content = adjust_viewport_to_content

        # Media Handling Parameters
        self.screenshot = screenshot
        self.screenshot_wait_for = screenshot_wait_for
        self.screenshot_height_threshold = screenshot_height_threshold
        self.pdf = pdf
        self.image_description_min_word_threshold = image_description_min_word_threshold
        self.image_score_threshold = image_score_threshold
        self.exclude_external_images = exclude_external_images

        # Link and Domain Handling Parameters
        self.exclude_social_media_domains = exclude_social_media_domains or SOCIAL_MEDIA_DOMAINS
        self.exclude_external_links = exclude_external_links
        self.exclude_social_media_links = exclude_social_media_links
        self.exclude_domains = exclude_domains or []

        # Debugging and Logging Parameters
        self.verbose = verbose
        self.log_console = log_console

        # Validate type of extraction strategy and chunking strategy if they are provided
        if self.extraction_strategy is not None and not isinstance(
            self.extraction_strategy, ExtractionStrategy
        ):
            raise ValueError("extraction_strategy must be an instance of ExtractionStrategy")
        if self.chunking_strategy is not None and not isinstance(
            self.chunking_strategy, ChunkingStrategy
        ):
            raise ValueError("chunking_strategy must be an instance of ChunkingStrategy")

        # Set default chunking strategy if None
        if self.chunking_strategy is None:
            from .chunking_strategy import RegexChunking
            self.chunking_strategy = RegexChunking()

    @staticmethod
    def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
        return CrawlerRunConfig(
            # Content Processing Parameters
            word_count_threshold=kwargs.get("word_count_threshold", 200),
            extraction_strategy=kwargs.get("extraction_strategy"),
            chunking_strategy=kwargs.get("chunking_strategy"),
            markdown_generator=kwargs.get("markdown_generator"),
            content_filter=kwargs.get("content_filter"),
            only_text=kwargs.get("only_text", False),
            css_selector=kwargs.get("css_selector"),
            excluded_tags=kwargs.get("excluded_tags", []),
            excluded_selector=kwargs.get("excluded_selector", ""),
            keep_data_attributes=kwargs.get("keep_data_attributes", False),
            remove_forms=kwargs.get("remove_forms", False),
            prettiify=kwargs.get("prettiify", False),
            parser_type=kwargs.get("parser_type", "lxml"),

            # SSL Parameters
            fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),

            # Caching Parameters
            cache_mode=kwargs.get("cache_mode"),
            session_id=kwargs.get("session_id"),
            bypass_cache=kwargs.get("bypass_cache", False),
            disable_cache=kwargs.get("disable_cache", False),
            no_cache_read=kwargs.get("no_cache_read", False),
            no_cache_write=kwargs.get("no_cache_write", False),

            # Page Navigation and Timing Parameters
            wait_until=kwargs.get("wait_until", "domcontentloaded"),
            page_timeout=kwargs.get("page_timeout", 60000),
            wait_for=kwargs.get("wait_for"),
            wait_for_images=kwargs.get("wait_for_images", False),
            delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
            mean_delay=kwargs.get("mean_delay", 0.1),
            max_range=kwargs.get("max_range", 0.3),
            semaphore_count=kwargs.get("semaphore_count", 5),

            # Page Interaction Parameters
            js_code=kwargs.get("js_code"),
            js_only=kwargs.get("js_only", False),
            ignore_body_visibility=kwargs.get("ignore_body_visibility", True),
            scan_full_page=kwargs.get("scan_full_page", False),
            scroll_delay=kwargs.get("scroll_delay", 0.2),
            process_iframes=kwargs.get("process_iframes", False),
            remove_overlay_elements=kwargs.get("remove_overlay_elements", False),
            simulate_user=kwargs.get("simulate_user", False),
            override_navigator=kwargs.get("override_navigator", False),
            magic=kwargs.get("magic", False),
            adjust_viewport_to_content=kwargs.get("adjust_viewport_to_content", False),

            # Media Handling Parameters
            screenshot=kwargs.get("screenshot", False),
            screenshot_wait_for=kwargs.get("screenshot_wait_for"),
            screenshot_height_threshold=kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD),
            pdf=kwargs.get("pdf", False),
            image_description_min_word_threshold=kwargs.get("image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD),
            image_score_threshold=kwargs.get("image_score_threshold", IMAGE_SCORE_THRESHOLD),
            exclude_external_images=kwargs.get("exclude_external_images", False),

            # Link and Domain Handling Parameters
            exclude_social_media_domains=kwargs.get("exclude_social_media_domains", SOCIAL_MEDIA_DOMAINS),
            exclude_external_links=kwargs.get("exclude_external_links", False),
            exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
            exclude_domains=kwargs.get("exclude_domains", []),

            # Debugging and Logging Parameters
            verbose=kwargs.get("verbose", True),
            log_console=kwargs.get("log_console", False),

            url=kwargs.get("url"),
        )

    # Return a dict representation of the config object
    def to_dict(self):
        return {
            "word_count_threshold": self.word_count_threshold,
            "extraction_strategy": self.extraction_strategy,
            "chunking_strategy": self.chunking_strategy,
            "markdown_generator": self.markdown_generator,
            "content_filter": self.content_filter,
            "only_text": self.only_text,
            "css_selector": self.css_selector,
            "excluded_tags": self.excluded_tags,
            "excluded_selector": self.excluded_selector,
            "keep_data_attributes": self.keep_data_attributes,
            "remove_forms": self.remove_forms,
            "prettiify": self.prettiify,
            "parser_type": self.parser_type,
            "fetch_ssl_certificate": self.fetch_ssl_certificate,
            "cache_mode": self.cache_mode,
            "session_id": self.session_id,
            "bypass_cache": self.bypass_cache,
            "disable_cache": self.disable_cache,
            "no_cache_read": self.no_cache_read,
            "no_cache_write": self.no_cache_write,
            "wait_until": self.wait_until,
            "page_timeout": self.page_timeout,
            "wait_for": self.wait_for,
            "wait_for_images": self.wait_for_images,
            "delay_before_return_html": self.delay_before_return_html,
            "mean_delay": self.mean_delay,
            "max_range": self.max_range,
            "semaphore_count": self.semaphore_count,
            "js_code": self.js_code,
            "js_only": self.js_only,
            "ignore_body_visibility": self.ignore_body_visibility,
            "scan_full_page": self.scan_full_page,
            "scroll_delay": self.scroll_delay,
            "process_iframes": self.process_iframes,
            "remove_overlay_elements": self.remove_overlay_elements,
            "simulate_user": self.simulate_user,
            "override_navigator": self.override_navigator,
            "magic": self.magic,
            "adjust_viewport_to_content": self.adjust_viewport_to_content,
            "screenshot": self.screenshot,
            "screenshot_wait_for": self.screenshot_wait_for,
            "screenshot_height_threshold": self.screenshot_height_threshold,
            "pdf": self.pdf,
            "image_description_min_word_threshold": self.image_description_min_word_threshold,
            "image_score_threshold": self.image_score_threshold,
            "exclude_external_images": self.exclude_external_images,
            "exclude_social_media_domains": self.exclude_social_media_domains,
            "exclude_external_links": self.exclude_external_links,
            "exclude_social_media_links": self.exclude_social_media_links,
            "exclude_domains": self.exclude_domains,
            "verbose": self.verbose,
            "log_console": self.log_console,
            "url": self.url,
        }
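
# --- Usage sketch (illustrative editor's addition, not part of this module) ---
# A hedged example of the intended round trip for CrawlerRunConfig: build it
# directly, serialize it with to_dict(), and rebuild it via from_kwargs(). The
# URL, selector, and session id below are placeholders.
#
#     run_cfg = CrawlerRunConfig(
#         url="https://example.com",
#         css_selector="article",
#         screenshot=True,
#         session_id="news-session",
#     )
#     same_cfg = CrawlerRunConfig.from_kwargs(run_cfg.to_dict())
#     assert same_cfg.css_selector == "article"
# --- end usage sketch ---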
crawl4ai/async_crawler_strategy.py
ADDED
@@ -0,0 +1,2191 @@
import asyncio
import base64
import time
from abc import ABC, abstractmethod
from typing import Callable, Dict, Any, List, Optional, Awaitable, Union
import os, sys, shutil
import tempfile, subprocess
from playwright.async_api import async_playwright, Page, Browser, Error, BrowserContext
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from io import BytesIO
from PIL import Image, ImageDraw, ImageFont
from pathlib import Path
from playwright.async_api import ProxySettings
from pydantic import BaseModel
import hashlib
import json
import uuid
from .js_snippet import load_js_script
from .models import AsyncCrawlResponse
from .utils import get_error_context
from .user_agent_generator import UserAgentGenerator
from .config import SCREENSHOT_HEIGHT_TRESHOLD, DOWNLOAD_PAGE_TIMEOUT
from .async_configs import BrowserConfig, CrawlerRunConfig
from .async_logger import AsyncLogger
from playwright_stealth import StealthConfig, stealth_async
from .ssl_certificate import SSLCertificate

stealth_config = StealthConfig(
    webdriver=True,
    chrome_app=True,
    chrome_csi=True,
    chrome_load_times=True,
    chrome_runtime=True,
    navigator_languages=True,
    navigator_plugins=True,
    navigator_permissions=True,
    webgl_vendor=True,
    outerdimensions=True,
    navigator_hardware_concurrency=True,
    media_codecs=True,
)

BROWSER_DISABLE_OPTIONS = [
    "--disable-background-networking",
    "--disable-background-timer-throttling",
    "--disable-backgrounding-occluded-windows",
    "--disable-breakpad",
    "--disable-client-side-phishing-detection",
    "--disable-component-extensions-with-background-pages",
    "--disable-default-apps",
    "--disable-extensions",
    "--disable-features=TranslateUI",
    "--disable-hang-monitor",
    "--disable-ipc-flooding-protection",
    "--disable-popup-blocking",
    "--disable-prompt-on-repost",
    "--disable-sync",
    "--force-color-profile=srgb",
    "--metrics-recording-only",
    "--no-first-run",
    "--password-store=basic",
    "--use-mock-keychain",
]

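# --- Usage sketch (illustrative editor's addition, not part of this module) ---
# stealth_config above enables playwright_stealth's evasions (webdriver flag,
# navigator properties, WebGL vendor, and so on). A hedged example of how such
# a config is typically applied to a Playwright page via stealth_async(); the
# page object is a placeholder obtained elsewhere.
#
#     async def harden(page):
#         await stealth_async(page, stealth_config)
# --- end usage sketch ---
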
class ManagedBrowser:
    """
    Manages the browser process and context. This class allows connecting to the browser over the CDP protocol.

    Attributes:
        browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
            Default: "chromium".
        user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
            temporary directory may be used. Default: None.
        headless (bool): Whether to run the browser in headless mode (no visible GUI).
            Default: True.
        browser_process (subprocess.Popen): The process object for the browser.
        temp_dir (str): Temporary directory for user data if not provided.
        debugging_port (int): Port for debugging the browser.
        host (str): Host for debugging the browser.

    Methods:
        start(): Starts the browser process and returns the CDP endpoint URL.
        _get_browser_path(): Returns the browser executable path based on OS and browser type.
        _get_browser_args(): Returns browser-specific command line arguments.
        _get_user_data_dir(): Returns the user data directory path.
        _cleanup(): Terminates the browser process and removes the temporary directory.
    """

    browser_type: str
    user_data_dir: str
    headless: bool
    browser_process: subprocess.Popen
    temp_dir: str
    debugging_port: int
    host: str

    def __init__(
        self,
        browser_type: str = "chromium",
        user_data_dir: Optional[str] = None,
        headless: bool = False,
        logger=None,
        host: str = "localhost",
        debugging_port: int = 9222,
    ):
        """
        Initialize the ManagedBrowser instance.

        Args:
            browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
                Default: "chromium".
            user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
                temporary directory may be used. Default: None.
            headless (bool): Whether to run the browser in headless mode (no visible GUI).
                Default: True.
            logger (logging.Logger): Logger instance for logging messages. Default: None.
            host (str): Host for debugging the browser. Default: "localhost".
            debugging_port (int): Port for debugging the browser. Default: 9222.
        """
        self.browser_type = browser_type
        self.user_data_dir = user_data_dir
        self.headless = headless
        self.browser_process = None
        self.temp_dir = None
        self.debugging_port = debugging_port
        self.host = host
        self.logger = logger
        self.shutting_down = False

    async def start(self) -> str:
        """
        Starts the browser process and returns the CDP endpoint URL.
        If user_data_dir is not provided, creates a temporary directory.
        """

        # Create temp dir if needed
        if not self.user_data_dir:
            self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-")
            self.user_data_dir = self.temp_dir

        # Get browser path and args based on OS and browser type
        browser_path = self._get_browser_path()
        args = self._get_browser_args()

        # Start browser process
        try:
            self.browser_process = subprocess.Popen(
                args, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            # Monitor browser process output for errors
            asyncio.create_task(self._monitor_browser_process())
            await asyncio.sleep(2)  # Give browser time to start
            return f"http://{self.host}:{self.debugging_port}"
        except Exception as e:
            await self.cleanup()
            raise Exception(f"Failed to start browser: {e}")

    async def _monitor_browser_process(self):
        """
        Monitor the browser process for unexpected termination.

        How it works:
        1. Read stdout and stderr from the browser process.
        2. If the process has terminated, log the error message and terminate the browser.
        3. If the shutting_down flag is set, log the normal termination message.
        4. If any other error occurs, log the error message.

        Note: This method should be called in a separate task to avoid blocking the main event loop.
        """
        if self.browser_process:
            try:
                stdout, stderr = await asyncio.gather(
                    asyncio.to_thread(self.browser_process.stdout.read),
                    asyncio.to_thread(self.browser_process.stderr.read),
                )

                # Check shutting_down flag BEFORE logging anything
                if self.browser_process.poll() is not None:
                    if not self.shutting_down:
                        self.logger.error(
                            message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
                            tag="ERROR",
                            params={
                                "code": self.browser_process.returncode,
                                "stdout": stdout.decode(),
                                "stderr": stderr.decode(),
                            },
                        )
                        await self.cleanup()
                    else:
                        self.logger.info(
                            message="Browser process terminated normally | Code: {code}",
                            tag="INFO",
                            params={"code": self.browser_process.returncode},
                        )
            except Exception as e:
                if not self.shutting_down:
                    self.logger.error(
                        message="Error monitoring browser process: {error}",
                        tag="ERROR",
                        params={"error": str(e)},
                    )

    def _get_browser_path(self) -> str:
        """Returns the browser executable path based on OS and browser type"""
        if sys.platform == "darwin":  # macOS
            paths = {
                "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
                "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox",
                "webkit": "/Applications/Safari.app/Contents/MacOS/Safari",
            }
        elif sys.platform == "win32":  # Windows
            paths = {
                "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
                "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe",
                "webkit": None,  # WebKit not supported on Windows
            }
        else:  # Linux
            paths = {
                "chromium": "google-chrome",
                "firefox": "firefox",
                "webkit": None,  # WebKit not supported on Linux
            }

        return paths.get(self.browser_type)

    def _get_browser_args(self) -> List[str]:
        """Returns browser-specific command line arguments"""
        base_args = [self._get_browser_path()]

        if self.browser_type == "chromium":
            args = [
                f"--remote-debugging-port={self.debugging_port}",
                f"--user-data-dir={self.user_data_dir}",
            ]
            if self.headless:
                args.append("--headless=new")
        elif self.browser_type == "firefox":
            args = [
                "--remote-debugging-port",
                str(self.debugging_port),
                "--profile",
                self.user_data_dir,
            ]
            if self.headless:
                args.append("--headless")
        else:
            raise NotImplementedError(f"Browser type {self.browser_type} not supported")

        return base_args + args

    async def cleanup(self):
        """Clean up the browser process and temporary directory"""
        # Set shutting_down flag BEFORE any termination actions
        self.shutting_down = True

        if self.browser_process:
            try:
                self.browser_process.terminate()
                # Wait for process to end gracefully
                for _ in range(10):  # 10 attempts, 100ms each
                    if self.browser_process.poll() is not None:
                        break
                    await asyncio.sleep(0.1)

                # Force kill if still running
                if self.browser_process.poll() is None:
                    self.browser_process.kill()
                    await asyncio.sleep(0.1)  # Brief wait for kill to take effect

            except Exception as e:
                self.logger.error(
                    message="Error terminating browser: {error}",
                    tag="ERROR",
                    params={"error": str(e)},
                )

        if self.temp_dir and os.path.exists(self.temp_dir):
            try:
                shutil.rmtree(self.temp_dir)
            except Exception as e:
                self.logger.error(
                    message="Error removing temporary directory: {error}",
                    tag="ERROR",
                    params={"error": str(e)},
                )

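# --- Usage sketch (illustrative editor's addition, not part of this module) ---
# A minimal, hedged example of driving ManagedBrowser above directly: start()
# launches the local browser binary with a remote-debugging port and returns a
# CDP endpoint URL, which Playwright can attach to with connect_over_cdp().
# The URL and the omitted logger are assumptions for the sketch; in crawl4ai
# this wiring is normally performed by BrowserManager below.
#
#     import asyncio
#     from playwright.async_api import async_playwright
#
#     async def demo():
#         managed = ManagedBrowser(browser_type="chromium", headless=True)
#         cdp_url = await managed.start()  # e.g. "http://localhost:9222"
#         async with async_playwright() as p:
#             browser = await p.chromium.connect_over_cdp(cdp_url)
#             page = await browser.new_page()
#             await page.goto("https://example.com")
#         await managed.cleanup()
#
#     asyncio.run(demo())
# --- end usage sketch ---
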
class BrowserManager:
    """
    Manages the browser instance and context.

    Attributes:
        config (BrowserConfig): Configuration object containing all browser settings
        logger: Logger instance for recording events and errors
        browser (Browser): The browser instance
        default_context (BrowserContext): The default browser context
        managed_browser (ManagedBrowser): The managed browser instance
        playwright (Playwright): The Playwright instance
        sessions (dict): Dictionary to store session information
        session_ttl (int): Session timeout in seconds
    """

    def __init__(self, browser_config: BrowserConfig, logger=None):
        """
        Initialize the BrowserManager with a browser configuration.

        Args:
            browser_config (BrowserConfig): Configuration object containing all browser settings
            logger: Logger instance for recording events and errors
        """
        self.config: BrowserConfig = browser_config
        self.logger = logger

        # Browser state
        self.browser = None
        self.default_context = None
        self.managed_browser = None
        self.playwright = None

        # Session management
        self.sessions = {}
        self.session_ttl = 1800  # 30 minutes

        # Initialize ManagedBrowser if needed
        if self.config.use_managed_browser:
            self.managed_browser = ManagedBrowser(
                browser_type=self.config.browser_type,
                user_data_dir=self.config.user_data_dir,
                headless=self.config.headless,
                logger=self.logger,
                debugging_port=self.config.debugging_port,
            )

    async def start(self):
        """
        Start the browser instance and set up the default context.

        How it works:
        1. Check if Playwright is already initialized.
        2. If not, initialize Playwright.
        3. If a managed browser is used, start it and connect to the CDP endpoint.
        4. Otherwise, launch the browser and set up the default context.

        Note: This method should be called in a separate task to avoid blocking the main event loop.
        """
        if self.playwright is None:
            from playwright.async_api import async_playwright

            self.playwright = await async_playwright().start()

        if self.config.use_managed_browser:
            cdp_url = await self.managed_browser.start()
            self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
            contexts = self.browser.contexts
            if contexts:
                self.default_context = contexts[0]
            else:
                self.default_context = await self.create_browser_context()
                # self.default_context = await self.browser.new_context(
                #     viewport={
                #         "width": self.config.viewport_width,
                #         "height": self.config.viewport_height,
                #     },
                #     storage_state=self.config.storage_state,
                #     user_agent=self.config.headers.get(
                #         "User-Agent", self.config.user_agent
                #     ),
                #     accept_downloads=self.config.accept_downloads,
                #     ignore_https_errors=self.config.ignore_https_errors,
                #     java_script_enabled=self.config.java_script_enabled,
                # )
            await self.setup_context(self.default_context)
        else:
            browser_args = self._build_browser_args()

            # Launch appropriate browser type
            if self.config.browser_type == "firefox":
                self.browser = await self.playwright.firefox.launch(**browser_args)
            elif self.config.browser_type == "webkit":
                self.browser = await self.playwright.webkit.launch(**browser_args)
            else:
                self.browser = await self.playwright.chromium.launch(**browser_args)

            self.default_context = self.browser

    def _build_browser_args(self) -> dict:
        """Build browser launch arguments from config."""
        args = [
            "--disable-gpu",
            "--disable-gpu-compositing",
            "--disable-software-rasterizer",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--no-first-run",
            "--no-default-browser-check",
            "--disable-infobars",
            "--window-position=0,0",
            "--ignore-certificate-errors",
            "--ignore-certificate-errors-spki-list",
            "--disable-blink-features=AutomationControlled",
            "--window-position=400,0",
            "--disable-renderer-backgrounding",
            "--disable-ipc-flooding-protection",
            "--force-color-profile=srgb",
            "--mute-audio",
            "--disable-background-timer-throttling",
            # "--single-process",
            f"--window-size={self.config.viewport_width},{self.config.viewport_height}",
        ]

        if self.config.light_mode:
            args.extend(BROWSER_DISABLE_OPTIONS)

        if self.config.text_mode:
            args.extend(
                [
                    "--blink-settings=imagesEnabled=false",
                    "--disable-remote-fonts",
                    "--disable-images",
                    "--disable-javascript",
                    "--disable-software-rasterizer",
                    "--disable-dev-shm-usage",
                ]
            )

        if self.config.extra_args:
            args.extend(self.config.extra_args)

        browser_args = {"headless": self.config.headless, "args": args}

        if self.config.chrome_channel:
            browser_args["channel"] = self.config.chrome_channel

        if self.config.accept_downloads:
            browser_args["downloads_path"] = self.config.downloads_path or os.path.join(
                os.getcwd(), "downloads"
            )
            os.makedirs(browser_args["downloads_path"], exist_ok=True)

        if self.config.proxy or self.config.proxy_config:
            from playwright.async_api import ProxySettings

            proxy_settings = (
                ProxySettings(server=self.config.proxy)
                if self.config.proxy
                else ProxySettings(
                    server=self.config.proxy_config.get("server"),
                    username=self.config.proxy_config.get("username"),
                    password=self.config.proxy_config.get("password"),
                )
            )
            browser_args["proxy"] = proxy_settings

        return browser_args

    async def setup_context(
        self,
        context: BrowserContext,
        crawlerRunConfig: CrawlerRunConfig,
        is_default=False,
    ):
        """
        Set up a browser context with the configured options.

        How it works:
        1. Set extra HTTP headers if provided.
        2. Add cookies if provided.
        3. Load storage state if provided.
        4. Accept downloads if enabled.
        5. Set default timeouts for navigation and download.
        6. Set user agent if provided.
        7. Set browser hints if provided.
        8. Set proxy if provided.
        9. Set downloads path if provided.
        10. Set cache if provided.

        Args:
            context (BrowserContext): The browser context to set up
            crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings
            is_default (bool): Flag indicating if this is the default context
        Returns:
            None
        """
        if self.config.headers:
            await context.set_extra_http_headers(self.config.headers)

        if self.config.cookies:
            await context.add_cookies(self.config.cookies)

        if self.config.storage_state:
            await context.storage_state(path=None)

        if self.config.accept_downloads:
            context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT)
            context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT)
            if self.config.downloads_path:
                context._impl_obj._options["accept_downloads"] = True
                context._impl_obj._options["downloads_path"] = (
                    self.config.downloads_path
                )

        # Handle user agent and browser hints
        if self.config.user_agent:
            combined_headers = {
                "User-Agent": self.config.user_agent,
                "sec-ch-ua": self.config.browser_hint,
            }
            combined_headers.update(self.config.headers)
            await context.set_extra_http_headers(combined_headers)

        # Add default cookie
        await context.add_cookies(
            [{"name": "cookiesEnabled", "value": "true", "url": crawlerRunConfig.url}]
        )

        # Handle navigator overrides
        if (
            crawlerRunConfig.override_navigator
            or crawlerRunConfig.simulate_user
            or crawlerRunConfig.magic
        ):
            await context.add_init_script(load_js_script("navigator_overrider"))

    async def create_browser_context(self):
        """
        Creates and returns a new browser context with configured settings.
        Applies text-only mode settings if text_mode is enabled in config.

        Returns:
            Context: Browser context object with the specified configurations
        """
        # Base settings
        user_agent = self.config.headers.get("User-Agent", self.config.user_agent)
        viewport_settings = {
            "width": self.config.viewport_width,
            "height": self.config.viewport_height,
        }
        proxy_settings = {"server": self.config.proxy} if self.config.proxy else None

        blocked_extensions = [
            # Images
            'jpg', 'jpeg', 'png', 'gif', 'webp', 'svg', 'ico', 'bmp', 'tiff', 'psd',
            # Fonts
            'woff', 'woff2', 'ttf', 'otf', 'eot',
            # Styles
            # 'css', 'less', 'scss', 'sass',
            # Media
            'mp4', 'webm', 'ogg', 'avi', 'mov', 'wmv', 'flv', 'm4v',
            'mp3', 'wav', 'aac', 'm4a', 'opus', 'flac',
            # Documents
            'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
            # Archives
            'zip', 'rar', '7z', 'tar', 'gz',
            # Scripts and data
            'xml', 'swf', 'wasm'
        ]

        # Common context settings
        context_settings = {
            "user_agent": user_agent,
            "viewport": viewport_settings,
            "proxy": proxy_settings,
            "accept_downloads": self.config.accept_downloads,
            "storage_state": self.config.storage_state,
            "ignore_https_errors": self.config.ignore_https_errors,
            "device_scale_factor": 1.0,
            "java_script_enabled": self.config.java_script_enabled,
        }

        if self.config.text_mode:
            text_mode_settings = {
                "has_touch": False,
                "is_mobile": False,
            }
            # Update context settings with text mode settings
            context_settings.update(text_mode_settings)

        # Create and return the context with all settings
        context = await self.browser.new_context(**context_settings)

        # Apply text mode settings if enabled
        if self.config.text_mode:
            # Create and apply route patterns for each extension
            for ext in blocked_extensions:
                await context.route(f"**/*.{ext}", lambda route: route.abort())
        return context

    # async def get_page(self, session_id: Optional[str], user_agent: str):
    async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
        """
        Get a page for the given session ID, creating a new one if needed.

        Args:
            crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings

        Returns:
            Page: The page object for the given session ID.
            BrowserContext: The browser context for the given session ID.
        """
        self._cleanup_expired_sessions()

        if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions:
            context, page, _ = self.sessions[crawlerRunConfig.session_id]
            self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
            return page, context

        if self.config.use_managed_browser:
            context = self.default_context
            page = await context.new_page()
        else:
            context = await self.create_browser_context()
            await self.setup_context(context, crawlerRunConfig)
            page = await context.new_page()

        if crawlerRunConfig.session_id:
            self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())

        return page, context

    async def kill_session(self, session_id: str):
        """
        Kill a browser session and clean up resources.

        Args:
            session_id (str): The session ID to kill.
        """
        if session_id in self.sessions:
            context, page, _ = self.sessions[session_id]
            await page.close()
            if not self.config.use_managed_browser:
                await context.close()
            del self.sessions[session_id]

    def _cleanup_expired_sessions(self):
        """Clean up expired sessions based on TTL."""
        current_time = time.time()
        expired_sessions = [
            sid
            for sid, (_, _, last_used) in self.sessions.items()
            if current_time - last_used > self.session_ttl
        ]
        for sid in expired_sessions:
            asyncio.create_task(self.kill_session(sid))

    async def close(self):
        """Close all browser resources and clean up."""
        if self.config.sleep_on_close:
            await asyncio.sleep(0.5)

        session_ids = list(self.sessions.keys())
        for session_id in session_ids:
            await self.kill_session(session_id)

        if self.browser:
            await self.browser.close()
            self.browser = None

        if self.managed_browser:
            await asyncio.sleep(0.5)
            await self.managed_browser.cleanup()
            self.managed_browser = None

        if self.playwright:
            await self.playwright.stop()
            self.playwright = None

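# --- Usage sketch (illustrative editor's addition, not part of this module) ---
# A hedged example of driving BrowserManager above directly: start() launches
# (or attaches to) the browser, get_page() returns a (page, context) pair keyed
# by the optional session_id, and close() tears everything down. The URL and
# session id are placeholders; inside crawl4ai this is normally invoked by
# AsyncPlaywrightCrawlerStrategy rather than by user code.
#
#     import asyncio
#
#     async def demo():
#         manager = BrowserManager(browser_config=BrowserConfig(headless=True))
#         await manager.start()
#         run_cfg = CrawlerRunConfig(url="https://example.com", session_id="demo")
#         page, context = await manager.get_page(run_cfg)
#         await page.goto(run_cfg.url)
#         print(await page.title())
#         await manager.close()
#
#     asyncio.run(demo())
# --- end usage sketch ---
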
class AsyncCrawlerStrategy(ABC):
    """
    Abstract base class for crawler strategies.
    Subclasses must implement the crawl method.
    """

    @abstractmethod
    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
        pass

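# --- Usage sketch (illustrative editor's addition, not part of this module) ---
# AsyncCrawlerStrategy above is the extension point for alternative fetchers.
# A hypothetical subclass could satisfy the interface with plain HTTP instead
# of a browser; the aiohttp dependency and the AsyncCrawlResponse fields used
# here (html, response_headers, status_code) are assumptions for the sketch and
# should be checked against crawl4ai/models.py.
#
#     import aiohttp
#
#     class SimpleHttpCrawlerStrategy(AsyncCrawlerStrategy):
#         async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
#             async with aiohttp.ClientSession() as session:
#                 async with session.get(url) as resp:
#                     html = await resp.text()
#                     return AsyncCrawlResponse(
#                         html=html,
#                         response_headers=dict(resp.headers),
#                         status_code=resp.status,
#                     )
# --- end usage sketch ---
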
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
686 |
+
"""
|
687 |
+
Crawler strategy using Playwright.
|
688 |
+
|
689 |
+
Attributes:
|
690 |
+
browser_config (BrowserConfig): Configuration object containing browser settings.
|
691 |
+
logger (AsyncLogger): Logger instance for recording events and errors.
|
692 |
+
_downloaded_files (List[str]): List of downloaded file paths.
|
693 |
+
hooks (Dict[str, Callable]): Dictionary of hooks for custom behavior.
|
694 |
+
browser_manager (BrowserManager): Manager for browser creation and management.
|
695 |
+
|
696 |
+
Methods:
|
697 |
+
__init__(self, browser_config=None, logger=None, **kwargs):
|
698 |
+
Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration.
|
699 |
+
__aenter__(self):
|
700 |
+
Start the browser and initialize the browser manager.
|
701 |
+
__aexit__(self, exc_type, exc_val, exc_tb):
|
702 |
+
Close the browser and clean up resources.
|
703 |
+
start(self):
|
704 |
+
Start the browser and initialize the browser manager.
|
705 |
+
close(self):
|
706 |
+
Close the browser and clean up resources.
|
707 |
+
kill_session(self, session_id):
|
708 |
+
Kill a browser session and clean up resources.
|
709 |
+
crawl(self, url, **kwargs):
|
710 |
+
Run the crawler for a single URL.
|
711 |
+
|
712 |
+
"""
|
713 |
+
def __init__(
|
714 |
+
self, browser_config: BrowserConfig = None, logger: AsyncLogger = None, **kwargs
|
715 |
+
):
|
716 |
+
"""
|
717 |
+
Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration.
|
718 |
+
|
719 |
+
Args:
|
720 |
+
browser_config (BrowserConfig): Configuration object containing browser settings.
|
721 |
+
If None, will be created from kwargs for backwards compatibility.
|
722 |
+
logger: Logger instance for recording events and errors.
|
723 |
+
**kwargs: Additional arguments for backwards compatibility and extending functionality.
|
724 |
+
"""
|
725 |
+
# Initialize browser config, either from provided object or kwargs
|
726 |
+
self.browser_config = browser_config or BrowserConfig.from_kwargs(kwargs)
|
727 |
+
self.logger = logger
|
728 |
+
|
729 |
+
# Initialize session management
|
730 |
+
self._downloaded_files = []
|
731 |
+
|
732 |
+
# Initialize hooks system
|
733 |
+
self.hooks = {
|
734 |
+
"on_browser_created": None,
|
735 |
+
"on_page_context_created": None,
|
736 |
+
"on_user_agent_updated": None,
|
737 |
+
"on_execution_started": None,
|
738 |
+
"before_goto": None,
|
739 |
+
"after_goto": None,
|
740 |
+
"before_return_html": None,
|
741 |
+
"before_retrieve_html": None,
|
742 |
+
}
|
743 |
+
|
744 |
+
# Initialize browser manager with config
|
745 |
+
self.browser_manager = BrowserManager(
|
746 |
+
browser_config=self.browser_config, logger=self.logger
|
747 |
+
)
|
748 |
+
|
749 |
+
async def __aenter__(self):
|
750 |
+
await self.start()
|
751 |
+
return self
|
752 |
+
|
753 |
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
754 |
+
await self.close()
|
755 |
+
|
756 |
+
async def start(self):
|
757 |
+
"""
|
758 |
+
Start the browser and initialize the browser manager.
|
759 |
+
"""
|
760 |
+
await self.browser_manager.start()
|
761 |
+
await self.execute_hook(
|
762 |
+
"on_browser_created",
|
763 |
+
self.browser_manager.browser,
|
764 |
+
context=self.browser_manager.default_context,
|
765 |
+
)
|
766 |
+
|
767 |
+
async def close(self):
|
768 |
+
"""
|
769 |
+
Close the browser and clean up resources.
|
770 |
+
"""
|
771 |
+
await self.browser_manager.close()
|
772 |
+
|
773 |
+
async def kill_session(self, session_id: str):
|
774 |
+
"""
|
775 |
+
Kill a browser session and clean up resources.
|
776 |
+
|
777 |
+
Args:
|
778 |
+
session_id (str): The ID of the session to kill.
|
779 |
+
|
780 |
+
Returns:
|
781 |
+
None
|
782 |
+
"""
|
783 |
+
# Log a warning; manually killing sessions is no longer needed, since the new version auto-kills them
|
784 |
+
self.logger.warning(
|
785 |
+
message="Session auto-kill is enabled in the new version. No need to manually kill sessions.",
|
786 |
+
tag="WARNING",
|
787 |
+
)
|
788 |
+
await self.browser_manager.kill_session(session_id)
|
789 |
+
|
790 |
+
def set_hook(self, hook_type: str, hook: Callable):
|
791 |
+
"""
|
792 |
+
Set a hook function for a specific hook type. Following are list of hook types:
|
793 |
+
- on_browser_created: Called when a new browser instance is created.
|
794 |
+
- on_page_context_created: Called when a new page context is created.
|
795 |
+
- on_user_agent_updated: Called when the user agent is updated.
|
796 |
+
- on_execution_started: Called when the execution starts.
|
797 |
+
- before_goto: Called before a goto operation.
|
798 |
+
- after_goto: Called after a goto operation.
|
799 |
+
- before_return_html: Called before returning HTML content.
|
800 |
+
- before_retrieve_html: Called before retrieving HTML content.
|
801 |
+
|
802 |
+
All hooks except on_browser_created accept a context and a page as arguments plus **kwargs; on_browser_created instead accepts a browser and a context as arguments plus **kwargs.
|
803 |
+
|
804 |
+
Args:
|
805 |
+
hook_type (str): The type of the hook.
|
806 |
+
hook (Callable): The hook function to set.
|
807 |
+
|
808 |
+
Returns:
|
809 |
+
None
|
810 |
+
"""
|
811 |
+
if hook_type in self.hooks:
|
812 |
+
self.hooks[hook_type] = hook
|
813 |
+
else:
|
814 |
+
raise ValueError(f"Invalid hook type: {hook_type}")
|
815 |
+
|
816 |
+
async def execute_hook(self, hook_type: str, *args, **kwargs):
|
817 |
+
"""
|
818 |
+
Execute a hook function for a specific hook type.
|
819 |
+
|
820 |
+
Args:
|
821 |
+
hook_type (str): The type of the hook.
|
822 |
+
*args: Variable length positional arguments.
|
823 |
+
**kwargs: Keyword arguments.
|
824 |
+
|
825 |
+
Returns:
|
826 |
+
The return value of the hook function, if any.
|
827 |
+
"""
|
828 |
+
hook = self.hooks.get(hook_type)
|
829 |
+
if hook:
|
830 |
+
if asyncio.iscoroutinefunction(hook):
|
831 |
+
return await hook(*args, **kwargs)
|
832 |
+
else:
|
833 |
+
return hook(*args, **kwargs)
|
834 |
+
return args[0] if args else None
|
835 |
+
|
836 |
+
def update_user_agent(self, user_agent: str):
|
837 |
+
"""
|
838 |
+
Update the user agent for the browser.
|
839 |
+
|
840 |
+
Args:
|
841 |
+
user_agent (str): The new user agent string.
|
842 |
+
|
843 |
+
Returns:
|
844 |
+
None
|
845 |
+
"""
|
846 |
+
self.user_agent = user_agent
|
847 |
+
|
848 |
+
def set_custom_headers(self, headers: Dict[str, str]):
|
849 |
+
"""
|
850 |
+
Set custom headers for the browser.
|
851 |
+
|
852 |
+
Args:
|
853 |
+
headers (Dict[str, str]): A dictionary of headers to set.
|
854 |
+
|
855 |
+
Returns:
|
856 |
+
None
|
857 |
+
"""
|
858 |
+
self.headers = headers
|
859 |
+
|
860 |
+
async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
|
861 |
+
"""
|
862 |
+
Wait for a condition in a smart way. This function works as follows:
|
863 |
+
|
864 |
+
1. If wait_for starts with 'js:', it assumes it's a JavaScript function and waits for it to return true.
|
865 |
+
2. If wait_for starts with 'css:', it assumes it's a CSS selector and waits for it to be present.
|
866 |
+
3. Otherwise, it tries to evaluate wait_for as a JavaScript function and waits for it to return true.
|
867 |
+
4. If it's not a JavaScript function, it assumes it's a CSS selector and waits for it to be present.
|
868 |
+
|
869 |
+
This is a more advanced version of the wait_for parameter in CrawlerStrategy.crawl().
|
870 |
+
Args:
|
871 |
+
page: Playwright page object
|
872 |
+
wait_for (str): The condition to wait for. Can be a CSS selector, a JavaScript function, or explicitly prefixed with 'js:' or 'css:'.
|
873 |
+
timeout (float): Maximum time to wait in milliseconds
|
874 |
+
|
875 |
+
Returns:
|
876 |
+
None
|
877 |
+
"""
|
878 |
+
wait_for = wait_for.strip()
|
879 |
+
|
880 |
+
if wait_for.startswith("js:"):
|
881 |
+
# Explicitly specified JavaScript
|
882 |
+
js_code = wait_for[3:].strip()
|
883 |
+
return await self.csp_compliant_wait(page, js_code, timeout)
|
884 |
+
elif wait_for.startswith("css:"):
|
885 |
+
# Explicitly specified CSS selector
|
886 |
+
css_selector = wait_for[4:].strip()
|
887 |
+
try:
|
888 |
+
await page.wait_for_selector(css_selector, timeout=timeout)
|
889 |
+
except Error as e:
|
890 |
+
if "Timeout" in str(e):
|
891 |
+
raise TimeoutError(
|
892 |
+
f"Timeout after {timeout}ms waiting for selector '{css_selector}'"
|
893 |
+
)
|
894 |
+
else:
|
895 |
+
raise ValueError(f"Invalid CSS selector: '{css_selector}'")
|
896 |
+
else:
|
897 |
+
# Auto-detect based on content
|
898 |
+
if wait_for.startswith("()") or wait_for.startswith("function"):
|
899 |
+
# It's likely a JavaScript function
|
900 |
+
return await self.csp_compliant_wait(page, wait_for, timeout)
|
901 |
+
else:
|
902 |
+
# Assume it's a CSS selector first
|
903 |
+
try:
|
904 |
+
await page.wait_for_selector(wait_for, timeout=timeout)
|
905 |
+
except Error as e:
|
906 |
+
if "Timeout" in str(e):
|
907 |
+
raise TimeoutError(
|
908 |
+
f"Timeout after {timeout}ms waiting for selector '{wait_for}'"
|
909 |
+
)
|
910 |
+
else:
|
911 |
+
# If it's not a timeout error, it might be an invalid selector
|
912 |
+
# Let's try to evaluate it as a JavaScript function as a fallback
|
913 |
+
try:
|
914 |
+
return await self.csp_compliant_wait(
|
915 |
+
page, f"() => {{{wait_for}}}", timeout
|
916 |
+
)
|
917 |
+
except Error:
|
918 |
+
raise ValueError(
|
919 |
+
f"Invalid wait_for parameter: '{wait_for}'. "
|
920 |
+
"It should be either a valid CSS selector, a JavaScript function, "
|
921 |
+
"or explicitly prefixed with 'js:' or 'css:'."
|
922 |
+
)
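For reference, the three accepted wait_for forms written as plain strings; the selectors and the JavaScript predicate are illustrative.

wait_css = "css:div.article-body"                                        # explicit CSS selector
wait_js = "js:() => document.querySelectorAll('article').length > 10"    # explicit JS predicate
wait_auto = "#comments"   # bare value: auto-detected (JS if it looks like a function, else CSS)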
|
923 |
+
|
924 |
+
async def csp_compliant_wait( self, page: Page, user_wait_function: str, timeout: float = 30000 ):
|
925 |
+
"""
|
926 |
+
Wait for a condition in a CSP-compliant way.
|
927 |
+
|
928 |
+
Args:
|
929 |
+
page: Playwright page object
|
930 |
+
user_wait_function: JavaScript function as string that returns boolean
|
931 |
+
timeout: Maximum time to wait in milliseconds
|
932 |
+
|
933 |
+
Returns:
|
934 |
+
bool: True if condition was met, False if timed out
|
935 |
+
|
936 |
+
Raises:
|
937 |
+
RuntimeError: If there's an error evaluating the condition
|
938 |
+
"""
|
939 |
+
wrapper_js = f"""
|
940 |
+
async () => {{
|
941 |
+
const userFunction = {user_wait_function};
|
942 |
+
const startTime = Date.now();
|
943 |
+
try {{
|
944 |
+
while (true) {{
|
945 |
+
if (await userFunction()) {{
|
946 |
+
return true;
|
947 |
+
}}
|
948 |
+
if (Date.now() - startTime > {timeout}) {{
|
949 |
+
return false; // Return false instead of throwing
|
950 |
+
}}
|
951 |
+
await new Promise(resolve => setTimeout(resolve, 100));
|
952 |
+
}}
|
953 |
+
}} catch (error) {{
|
954 |
+
throw new Error(`Error evaluating condition: ${{error.message}}`);
|
955 |
+
}}
|
956 |
+
}}
|
957 |
+
"""
|
958 |
+
|
959 |
+
try:
|
960 |
+
result = await page.evaluate(wrapper_js)
|
961 |
+
return result
|
962 |
+
except Exception as e:
|
963 |
+
if "Error evaluating condition" in str(e):
|
964 |
+
raise RuntimeError(f"Failed to evaluate wait condition: {str(e)}")
|
965 |
+
# For timeout or other cases, just return False
|
966 |
+
return False
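A small sketch of calling this helper directly; the predicate and the function name are illustrative.

async def wait_for_headline(strategy, page):
    # Poll (CSP-safe) until the page has at least one <h1>, or give up after 10 seconds.
    return await strategy.csp_compliant_wait(
        page,
        "() => document.querySelectorAll('h1').length > 0",
        timeout=10000,
    )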
|
967 |
+
|
968 |
+
async def process_iframes(self, page):
|
969 |
+
"""
|
970 |
+
Process iframes on a page. This function will extract the content of each iframe and replace it with a div containing the extracted content.
|
971 |
+
|
972 |
+
Args:
|
973 |
+
page: Playwright page object
|
974 |
+
|
975 |
+
Returns:
|
976 |
+
Playwright page object
|
977 |
+
"""
|
978 |
+
# Find all iframes
|
979 |
+
iframes = await page.query_selector_all("iframe")
|
980 |
+
|
981 |
+
for i, iframe in enumerate(iframes):
|
982 |
+
try:
|
983 |
+
# Add a unique identifier to the iframe
|
984 |
+
await iframe.evaluate(f'(element) => element.id = "iframe-{i}"')
|
985 |
+
|
986 |
+
# Get the frame associated with this iframe
|
987 |
+
frame = await iframe.content_frame()
|
988 |
+
|
989 |
+
if frame:
|
990 |
+
# Wait for the frame to load
|
991 |
+
await frame.wait_for_load_state(
|
992 |
+
"load", timeout=30000
|
993 |
+
) # 30 seconds timeout
|
994 |
+
|
995 |
+
# Extract the content of the iframe's body
|
996 |
+
iframe_content = await frame.evaluate(
|
997 |
+
"() => document.body.innerHTML"
|
998 |
+
)
|
999 |
+
|
1000 |
+
# Generate a unique class name for this iframe
|
1001 |
+
class_name = f"extracted-iframe-content-{i}"
|
1002 |
+
|
1003 |
+
# Replace the iframe with a div containing the extracted content
|
1004 |
+
_iframe = iframe_content.replace("`", "\\`")
|
1005 |
+
await page.evaluate(
|
1006 |
+
f"""
|
1007 |
+
() => {{
|
1008 |
+
const iframe = document.getElementById('iframe-{i}');
|
1009 |
+
const div = document.createElement('div');
|
1010 |
+
div.innerHTML = `{_iframe}`;
|
1011 |
+
div.className = '{class_name}';
|
1012 |
+
iframe.replaceWith(div);
|
1013 |
+
}}
|
1014 |
+
"""
|
1015 |
+
)
|
1016 |
+
else:
|
1017 |
+
self.logger.warning(
|
1018 |
+
message="Could not access content frame for iframe {index}",
|
1019 |
+
tag="SCRAPE",
|
1020 |
+
params={"index": i},
|
1021 |
+
)
|
1022 |
+
except Exception as e:
|
1023 |
+
self.logger.error(
|
1024 |
+
message="Error processing iframe {index}: {error}",
|
1025 |
+
tag="ERROR",
|
1026 |
+
params={"index": i, "error": str(e)},
|
1027 |
+
)
|
1028 |
+
|
1029 |
+
# Return the page object
|
1030 |
+
return page
|
1031 |
+
|
1032 |
+
async def create_session(self, **kwargs) -> str:
|
1033 |
+
"""
|
1034 |
+
Creates a new browser session and returns its ID. A browser session is a uniquely opened page that can be reused across multiple crawls.
|
1035 |
+
This function is asynchronous and returns a string representing the session ID.
|
1036 |
+
|
1037 |
+
Args:
|
1038 |
+
**kwargs: Optional keyword arguments to configure the session.
|
1039 |
+
|
1040 |
+
Returns:
|
1041 |
+
str: The session ID.
|
1042 |
+
"""
|
1043 |
+
await self.start()
|
1044 |
+
|
1045 |
+
session_id = kwargs.get("session_id") or str(uuid.uuid4())
|
1046 |
+
|
1047 |
+
user_agent = kwargs.get("user_agent", self.user_agent)
|
1048 |
+
# Use browser_manager to get a fresh page & context assigned to this session_id
|
1049 |
+
page, context = await self.browser_manager.get_page(session_id, user_agent)
|
1050 |
+
return session_id
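A hypothetical sketch of reusing one session across several crawls; it assumes CrawlerRunConfig accepts session_id as a constructor keyword (mirroring the attribute read during crawling) and that AsyncCrawlResponse exposes status_code as an attribute.

async def crawl_in_one_session(strategy, urls):
    session_id = await strategy.create_session()
    try:
        for url in urls:
            config = CrawlerRunConfig(session_id=session_id)  # kwarg assumed
            result = await strategy.crawl(url, config)
            print(url, result.status_code)
    finally:
        await strategy.kill_session(session_id)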
|
1051 |
+
|
1052 |
+
async def crawl( self, url: str, config: CrawlerRunConfig, **kwargs ) -> AsyncCrawlResponse:
|
1053 |
+
"""
|
1054 |
+
Crawls a given URL or processes raw HTML/local file content based on the URL prefix.
|
1055 |
+
|
1056 |
+
Args:
|
1057 |
+
url (str): The URL to crawl. Supported prefixes:
|
1058 |
+
- 'http://' or 'https://': Web URL to crawl.
|
1059 |
+
- 'file://': Local file path to process.
|
1060 |
+
- 'raw://': Raw HTML content to process.
|
1061 |
+
**kwargs: Additional parameters:
|
1062 |
+
- 'screenshot' (bool): Whether to take a screenshot.
|
1063 |
+
- ... [other existing parameters]
|
1064 |
+
|
1065 |
+
Returns:
|
1066 |
+
AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot.
|
1067 |
+
"""
|
1068 |
+
config = config or CrawlerRunConfig.from_kwargs(kwargs)
|
1069 |
+
response_headers = {}
|
1070 |
+
status_code = 200 # Default for local/raw HTML
|
1071 |
+
screenshot_data = None
|
1072 |
+
|
1073 |
+
if url.startswith(("http://", "https://")):
|
1074 |
+
return await self._crawl_web(url, config)
|
1075 |
+
|
1076 |
+
elif url.startswith("file://"):
|
1077 |
+
# Process local file
|
1078 |
+
local_file_path = url[7:] # Remove 'file://' prefix
|
1079 |
+
if not os.path.exists(local_file_path):
|
1080 |
+
raise FileNotFoundError(f"Local file not found: {local_file_path}")
|
1081 |
+
with open(local_file_path, "r", encoding="utf-8") as f:
|
1082 |
+
html = f.read()
|
1083 |
+
if config.screenshot:
|
1084 |
+
screenshot_data = await self._generate_screenshot_from_html(html)
|
1085 |
+
return AsyncCrawlResponse(
|
1086 |
+
html=html,
|
1087 |
+
response_headers=response_headers,
|
1088 |
+
status_code=status_code,
|
1089 |
+
screenshot=screenshot_data,
|
1090 |
+
get_delayed_content=None,
|
1091 |
+
)
|
1092 |
+
|
1093 |
+
elif url.startswith("raw:") or url.startswith("raw://"):
|
1094 |
+
# Process raw HTML content
|
1095 |
+
raw_html = url[4:] if url[:4] == "raw:" else url[7:]
|
1096 |
+
html = raw_html
|
1097 |
+
if config.screenshot:
|
1098 |
+
screenshot_data = await self._generate_screenshot_from_html(html)
|
1099 |
+
return AsyncCrawlResponse(
|
1100 |
+
html=html,
|
1101 |
+
response_headers=response_headers,
|
1102 |
+
status_code=status_code,
|
1103 |
+
screenshot=screenshot_data,
|
1104 |
+
get_delayed_content=None,
|
1105 |
+
)
|
1106 |
+
else:
|
1107 |
+
raise ValueError(
|
1108 |
+
"URL must start with 'http://', 'https://', 'file://', or 'raw:'"
|
1109 |
+
)
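A sketch of the three input styles this method accepts; the path and inline HTML are illustrative, and CrawlerRunConfig is assumed to be default-constructible.

async def demo_prefixes(strategy):
    cfg = CrawlerRunConfig()
    web = await strategy.crawl("https://example.com", cfg)
    local = await strategy.crawl("file:///tmp/saved_page.html", cfg)            # illustrative path
    raw = await strategy.crawl("raw:<html><body><h1>Hi</h1></body></html>", cfg)
    return web, local, raw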
|
1110 |
+
|
1111 |
+
async def _crawl_web( self, url: str, config: CrawlerRunConfig ) -> AsyncCrawlResponse:
|
1112 |
+
"""
|
1113 |
+
Internal method to crawl web URLs with the specified configuration.
|
1114 |
+
|
1115 |
+
Args:
|
1116 |
+
url (str): The web URL to crawl
|
1117 |
+
config (CrawlerRunConfig): Configuration object controlling the crawl behavior
|
1118 |
+
|
1119 |
+
Returns:
|
1120 |
+
AsyncCrawlResponse: The response containing HTML, headers, status code, and optional data
|
1121 |
+
"""
|
1122 |
+
config.url = url
|
1123 |
+
response_headers = {}
|
1124 |
+
status_code = None
|
1125 |
+
|
1126 |
+
# Reset downloaded files list for new crawl
|
1127 |
+
self._downloaded_files = []
|
1128 |
+
|
1129 |
+
# Handle user agent with magic mode
|
1130 |
+
user_agent = self.browser_config.user_agent
|
1131 |
+
if config.magic and self.browser_config.user_agent_mode != "random":
|
1132 |
+
self.browser_config.user_agent = UserAgentGenerator().generate(
|
1133 |
+
**(self.browser_config.user_agent_generator_config or {})
|
1134 |
+
)
|
1135 |
+
|
1136 |
+
# Get page for session
|
1137 |
+
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
|
1138 |
+
|
1139 |
+
# Add default cookie
|
1140 |
+
await context.add_cookies(
|
1141 |
+
[{"name": "cookiesEnabled", "value": "true", "url": url}]
|
1142 |
+
)
|
1143 |
+
|
1144 |
+
# Handle navigator overrides
|
1145 |
+
if config.override_navigator or config.simulate_user or config.magic:
|
1146 |
+
await context.add_init_script(load_js_script("navigator_overrider"))
|
1147 |
+
|
1148 |
+
# Call hook after page creation
|
1149 |
+
await self.execute_hook("on_page_context_created", page, context=context)
|
1150 |
+
|
1151 |
+
# Set up console logging if requested
|
1152 |
+
if config.log_console:
|
1153 |
+
|
1154 |
+
def log_consol(
|
1155 |
+
msg, console_log_type="debug"
|
1156 |
+
): # Corrected the parameter syntax
|
1157 |
+
if console_log_type == "error":
|
1158 |
+
self.logger.error(
|
1159 |
+
message=f"Console error: {msg}", # Use f-string for variable interpolation
|
1160 |
+
tag="CONSOLE",
|
1161 |
+
params={"msg": msg.text},
|
1162 |
+
)
|
1163 |
+
elif console_log_type == "debug":
|
1164 |
+
self.logger.debug(
|
1165 |
+
message=f"Console: {msg}", # Use f-string for variable interpolation
|
1166 |
+
tag="CONSOLE",
|
1167 |
+
params={"msg": msg.text},
|
1168 |
+
)
|
1169 |
+
|
1170 |
+
page.on("console", log_consol)
|
1171 |
+
page.on("pageerror", lambda e: log_consol(e, "error"))
|
1172 |
+
|
1173 |
+
try:
|
1174 |
+
# Get SSL certificate information if requested and URL is HTTPS
|
1175 |
+
ssl_cert = None
|
1176 |
+
if config.fetch_ssl_certificate:
|
1177 |
+
ssl_cert = SSLCertificate.from_url(url)
|
1178 |
+
|
1179 |
+
# Set up download handling
|
1180 |
+
if self.browser_config.accept_downloads:
|
1181 |
+
page.on(
|
1182 |
+
"download",
|
1183 |
+
lambda download: asyncio.create_task(
|
1184 |
+
self._handle_download(download)
|
1185 |
+
),
|
1186 |
+
)
|
1187 |
+
|
1188 |
+
# Handle page navigation and content loading
|
1189 |
+
if not config.js_only:
|
1190 |
+
await self.execute_hook("before_goto", page, context=context, url=url)
|
1191 |
+
|
1192 |
+
try:
|
1193 |
+
# Generate a unique nonce for this request
|
1194 |
+
nonce = hashlib.sha256(os.urandom(32)).hexdigest()
|
1195 |
+
|
1196 |
+
# Add CSP headers to the request
|
1197 |
+
await page.set_extra_http_headers({
|
1198 |
+
'Content-Security-Policy': f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'"
|
1199 |
+
})
|
1200 |
+
|
1201 |
+
response = await page.goto(
|
1202 |
+
url, wait_until=config.wait_until, timeout=config.page_timeout
|
1203 |
+
)
|
1204 |
+
except Error as e:
|
1205 |
+
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
|
1206 |
+
|
1207 |
+
await self.execute_hook("after_goto", page, context=context, url=url, response=response)
|
1208 |
+
|
1209 |
+
if response is None:
|
1210 |
+
status_code = 200
|
1211 |
+
response_headers = {}
|
1212 |
+
else:
|
1213 |
+
status_code = response.status
|
1214 |
+
response_headers = response.headers
|
1215 |
+
|
1216 |
+
else:
|
1217 |
+
status_code = 200
|
1218 |
+
response_headers = {}
|
1219 |
+
|
1220 |
+
# Wait for body element and visibility
|
1221 |
+
try:
|
1222 |
+
await page.wait_for_selector("body", state="attached", timeout=30000)
|
1223 |
+
|
1224 |
+
# Use the new check_visibility function with csp_compliant_wait
|
1225 |
+
is_visible = await self.csp_compliant_wait(
|
1226 |
+
page,
|
1227 |
+
"""() => {
|
1228 |
+
const element = document.body;
|
1229 |
+
if (!element) return false;
|
1230 |
+
const style = window.getComputedStyle(element);
|
1231 |
+
const isVisible = style.display !== 'none' &&
|
1232 |
+
style.visibility !== 'hidden' &&
|
1233 |
+
style.opacity !== '0';
|
1234 |
+
return isVisible;
|
1235 |
+
}""",
|
1236 |
+
timeout=30000
|
1237 |
+
)
|
1238 |
+
|
1239 |
+
if not is_visible and not config.ignore_body_visibility:
|
1240 |
+
visibility_info = await self.check_visibility(page)
|
1241 |
+
raise Error(f"Body element is hidden: {visibility_info}")
|
1242 |
+
|
1243 |
+
except Error as e:
|
1244 |
+
visibility_info = await self.check_visibility(page)
|
1245 |
+
|
1246 |
+
if self.config.verbose:
|
1247 |
+
self.logger.debug(
|
1248 |
+
message="Body visibility info: {info}",
|
1249 |
+
tag="DEBUG",
|
1250 |
+
params={"info": visibility_info},
|
1251 |
+
)
|
1252 |
+
|
1253 |
+
if not config.ignore_body_visibility:
|
1254 |
+
raise Error(f"Body element is hidden: {visibility_info}")
|
1255 |
+
|
1256 |
+
|
1257 |
+
# try:
|
1258 |
+
# await page.wait_for_selector("body", state="attached", timeout=30000)
|
1259 |
+
|
1260 |
+
# await page.wait_for_function(
|
1261 |
+
# """
|
1262 |
+
# () => {
|
1263 |
+
# const body = document.body;
|
1264 |
+
# const style = window.getComputedStyle(body);
|
1265 |
+
# return style.display !== 'none' &&
|
1266 |
+
# style.visibility !== 'hidden' &&
|
1267 |
+
# style.opacity !== '0';
|
1268 |
+
# }
|
1269 |
+
# """,
|
1270 |
+
# timeout=30000,
|
1271 |
+
# )
|
1272 |
+
# except Error as e:
|
1273 |
+
# visibility_info = await page.evaluate(
|
1274 |
+
# """
|
1275 |
+
# () => {
|
1276 |
+
# const body = document.body;
|
1277 |
+
# const style = window.getComputedStyle(body);
|
1278 |
+
# return {
|
1279 |
+
# display: style.display,
|
1280 |
+
# visibility: style.visibility,
|
1281 |
+
# opacity: style.opacity,
|
1282 |
+
# hasContent: body.innerHTML.length,
|
1283 |
+
# classList: Array.from(body.classList)
|
1284 |
+
# }
|
1285 |
+
# }
|
1286 |
+
# """
|
1287 |
+
# )
|
1288 |
+
|
1289 |
+
# if self.config.verbose:
|
1290 |
+
# self.logger.debug(
|
1291 |
+
# message="Body visibility info: {info}",
|
1292 |
+
# tag="DEBUG",
|
1293 |
+
# params={"info": visibility_info},
|
1294 |
+
# )
|
1295 |
+
|
1296 |
+
# if not config.ignore_body_visibility:
|
1297 |
+
# raise Error(f"Body element is hidden: {visibility_info}")
|
1298 |
+
|
1299 |
+
# Handle content loading and viewport adjustment
|
1300 |
+
if not self.browser_config.text_mode and (
|
1301 |
+
config.wait_for_images or config.adjust_viewport_to_content
|
1302 |
+
):
|
1303 |
+
await page.wait_for_load_state("domcontentloaded")
|
1304 |
+
await asyncio.sleep(0.1)
|
1305 |
+
|
1306 |
+
# Check for image loading with improved error handling
|
1307 |
+
images_loaded = await self.csp_compliant_wait(
|
1308 |
+
page,
|
1309 |
+
"() => Array.from(document.getElementsByTagName('img')).every(img => img.complete)",
|
1310 |
+
timeout=1000
|
1311 |
+
)
|
1312 |
+
|
1313 |
+
if not images_loaded and self.logger:
|
1314 |
+
self.logger.warning(
|
1315 |
+
message="Some images failed to load within timeout",
|
1316 |
+
tag="SCRAPE",
|
1317 |
+
)
|
1318 |
+
|
1319 |
+
# Adjust viewport if needed
|
1320 |
+
if not self.browser_config.text_mode and config.adjust_viewport_to_content:
|
1321 |
+
try:
|
1322 |
+
dimensions = await self.get_page_dimensions(page)
|
1323 |
+
page_height = dimensions['height']
|
1324 |
+
page_width = dimensions['width']
|
1325 |
+
# page_width = await page.evaluate(
|
1326 |
+
# "document.documentElement.scrollWidth"
|
1327 |
+
# )
|
1328 |
+
# page_height = await page.evaluate(
|
1329 |
+
# "document.documentElement.scrollHeight"
|
1330 |
+
# )
|
1331 |
+
|
1332 |
+
target_width = self.browser_config.viewport_width
|
1333 |
+
target_height = int(target_width * page_width / page_height * 0.95)
|
1334 |
+
await page.set_viewport_size(
|
1335 |
+
{"width": target_width, "height": target_height}
|
1336 |
+
)
|
1337 |
+
|
1338 |
+
scale = min(target_width / page_width, target_height / page_height)
|
1339 |
+
cdp = await page.context.new_cdp_session(page)
|
1340 |
+
await cdp.send(
|
1341 |
+
"Emulation.setDeviceMetricsOverride",
|
1342 |
+
{
|
1343 |
+
"width": page_width,
|
1344 |
+
"height": page_height,
|
1345 |
+
"deviceScaleFactor": 1,
|
1346 |
+
"mobile": False,
|
1347 |
+
"scale": scale,
|
1348 |
+
},
|
1349 |
+
)
|
1350 |
+
except Exception as e:
|
1351 |
+
self.logger.warning(
|
1352 |
+
message="Failed to adjust viewport to content: {error}",
|
1353 |
+
tag="VIEWPORT",
|
1354 |
+
params={"error": str(e)},
|
1355 |
+
)
|
1356 |
+
|
1357 |
+
# Handle full page scanning
|
1358 |
+
if config.scan_full_page:
|
1359 |
+
await self._handle_full_page_scan(page, config.scroll_delay)
|
1360 |
+
|
1361 |
+
# Execute JavaScript if provided
|
1362 |
+
# if config.js_code:
|
1363 |
+
# if isinstance(config.js_code, str):
|
1364 |
+
# await page.evaluate(config.js_code)
|
1365 |
+
# elif isinstance(config.js_code, list):
|
1366 |
+
# for js in config.js_code:
|
1367 |
+
# await page.evaluate(js)
|
1368 |
+
|
1369 |
+
if config.js_code:
|
1370 |
+
# execution_result = await self.execute_user_script(page, config.js_code)
|
1371 |
+
execution_result = await self.robust_execute_user_script(page, config.js_code)
|
1372 |
+
if not execution_result["success"]:
|
1373 |
+
self.logger.warning(
|
1374 |
+
message="User script execution had issues: {error}",
|
1375 |
+
tag="JS_EXEC",
|
1376 |
+
params={"error": execution_result.get("error")}
|
1377 |
+
)
|
1378 |
+
|
1379 |
+
await self.execute_hook("on_execution_started", page, context=context)
|
1380 |
+
|
1381 |
+
# Handle user simulation
|
1382 |
+
if config.simulate_user or config.magic:
|
1383 |
+
await page.mouse.move(100, 100)
|
1384 |
+
await page.mouse.down()
|
1385 |
+
await page.mouse.up()
|
1386 |
+
await page.keyboard.press("ArrowDown")
|
1387 |
+
|
1388 |
+
# Handle wait_for condition
|
1389 |
+
if config.wait_for:
|
1390 |
+
try:
|
1391 |
+
await self.smart_wait(
|
1392 |
+
page, config.wait_for, timeout=config.page_timeout
|
1393 |
+
)
|
1394 |
+
except Exception as e:
|
1395 |
+
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
1396 |
+
|
1397 |
+
# Update image dimensions if needed
|
1398 |
+
if not self.browser_config.text_mode:
|
1399 |
+
update_image_dimensions_js = load_js_script("update_image_dimensions")
|
1400 |
+
try:
|
1401 |
+
try:
|
1402 |
+
await page.wait_for_load_state("domcontentloaded", timeout=5)
|
1403 |
+
except PlaywrightTimeoutError:
|
1404 |
+
pass
|
1405 |
+
await page.evaluate(update_image_dimensions_js)
|
1406 |
+
except Exception as e:
|
1407 |
+
self.logger.error(
|
1408 |
+
message="Error updating image dimensions: {error}",
|
1409 |
+
tag="ERROR",
|
1410 |
+
params={"error": str(e)},
|
1411 |
+
)
|
1412 |
+
|
1413 |
+
# Process iframes if needed
|
1414 |
+
if config.process_iframes:
|
1415 |
+
page = await self.process_iframes(page)
|
1416 |
+
|
1417 |
+
# Pre-content retrieval hooks and delay
|
1418 |
+
await self.execute_hook("before_retrieve_html", page, context=context)
|
1419 |
+
if config.delay_before_return_html:
|
1420 |
+
await asyncio.sleep(config.delay_before_return_html)
|
1421 |
+
|
1422 |
+
# Handle overlay removal
|
1423 |
+
if config.remove_overlay_elements:
|
1424 |
+
await self.remove_overlay_elements(page)
|
1425 |
+
|
1426 |
+
# Get final HTML content
|
1427 |
+
html = await page.content()
|
1428 |
+
await self.execute_hook("before_return_html", page = page, html = html, context=context)
|
1429 |
+
|
1430 |
+
# Handle PDF and screenshot generation
|
1431 |
+
start_export_time = time.perf_counter()
|
1432 |
+
pdf_data = None
|
1433 |
+
screenshot_data = None
|
1434 |
+
|
1435 |
+
if config.pdf:
|
1436 |
+
pdf_data = await self.export_pdf(page)
|
1437 |
+
|
1438 |
+
if config.screenshot:
|
1439 |
+
if config.screenshot_wait_for:
|
1440 |
+
await asyncio.sleep(config.screenshot_wait_for)
|
1441 |
+
screenshot_data = await self.take_screenshot(
|
1442 |
+
page, screenshot_height_threshold=config.screenshot_height_threshold
|
1443 |
+
)
|
1444 |
+
|
1445 |
+
if screenshot_data or pdf_data:
|
1446 |
+
self.logger.info(
|
1447 |
+
message="Exporting PDF and taking screenshot took {duration:.2f}s",
|
1448 |
+
tag="EXPORT",
|
1449 |
+
params={"duration": time.perf_counter() - start_export_time},
|
1450 |
+
)
|
1451 |
+
|
1452 |
+
# Define delayed content getter
|
1453 |
+
async def get_delayed_content(delay: float = 5.0) -> str:
|
1454 |
+
self.logger.info(
|
1455 |
+
message="Waiting for {delay} seconds before retrieving content for {url}",
|
1456 |
+
tag="INFO",
|
1457 |
+
params={"delay": delay, "url": url},
|
1458 |
+
)
|
1459 |
+
await asyncio.sleep(delay)
|
1460 |
+
return await page.content()
|
1461 |
+
|
1462 |
+
# Return complete response
|
1463 |
+
return AsyncCrawlResponse(
|
1464 |
+
html=html,
|
1465 |
+
response_headers=response_headers,
|
1466 |
+
status_code=status_code,
|
1467 |
+
screenshot=screenshot_data,
|
1468 |
+
pdf_data=pdf_data,
|
1469 |
+
get_delayed_content=get_delayed_content,
|
1470 |
+
ssl_certificate=ssl_cert,
|
1471 |
+
downloaded_files=(
|
1472 |
+
self._downloaded_files if self._downloaded_files else None
|
1473 |
+
),
|
1474 |
+
)
|
1475 |
+
|
1476 |
+
except Exception as e:
|
1477 |
+
raise e
|
1478 |
+
|
1479 |
+
finally:
|
1480 |
+
# If no session_id is given we should close the page
|
1481 |
+
if not config.session_id:
|
1482 |
+
await page.close()
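An end-to-end sketch tying the steps above together; the config keywords are assumed to mirror the attributes read in this method, the default-constructed strategy is an assumption, and the html/screenshot attributes are assumed from the response constructor.

async def fetch_with_screenshot(url):
    config = CrawlerRunConfig(
        screenshot=True,
        wait_for="css:body",
        remove_overlay_elements=True,
    )
    async with AsyncPlaywrightCrawlerStrategy() as strategy:
        response = await strategy.crawl(url, config)
        return response.html, response.screenshot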
|
1483 |
+
|
1484 |
+
async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):
|
1485 |
+
"""
|
1486 |
+
Helper method to handle full page scanning.
|
1487 |
+
|
1488 |
+
How it works:
|
1489 |
+
1. Get the viewport height.
|
1490 |
+
2. Scroll to the bottom of the page.
|
1491 |
+
3. Get the total height of the page.
|
1492 |
+
4. Scroll back to the top of the page.
|
1493 |
+
5. Scroll to the bottom of the page again.
|
1494 |
+
6. Continue scrolling until the bottom of the page is reached.
|
1495 |
+
|
1496 |
+
Args:
|
1497 |
+
page (Page): The Playwright page object
|
1498 |
+
scroll_delay (float): The delay between page scrolls
|
1499 |
+
|
1500 |
+
"""
|
1501 |
+
try:
|
1502 |
+
viewport_height = page.viewport_size.get(
|
1503 |
+
"height", self.browser_config.viewport_height
|
1504 |
+
)
|
1505 |
+
current_position = viewport_height
|
1506 |
+
|
1507 |
+
# await page.evaluate(f"window.scrollTo(0, {current_position})")
|
1508 |
+
await self.safe_scroll(page, 0, current_position, delay=scroll_delay)
|
1509 |
+
# await self.csp_scroll_to(page, 0, current_position)
|
1510 |
+
# await asyncio.sleep(scroll_delay)
|
1511 |
+
|
1512 |
+
# total_height = await page.evaluate("document.documentElement.scrollHeight")
|
1513 |
+
dimensions = await self.get_page_dimensions(page)
|
1514 |
+
total_height = dimensions['height']
|
1515 |
+
|
1516 |
+
while current_position < total_height:
|
1517 |
+
current_position = min(current_position + viewport_height, total_height)
|
1518 |
+
await self.safe_scroll(page, 0, current_position, delay=scroll_delay)
|
1519 |
+
# await page.evaluate(f"window.scrollTo(0, {current_position})")
|
1520 |
+
# await asyncio.sleep(scroll_delay)
|
1521 |
+
|
1522 |
+
# new_height = await page.evaluate("document.documentElement.scrollHeight")
|
1523 |
+
dimensions = await self.get_page_dimensions(page)
|
1524 |
+
new_height = dimensions['height']
|
1525 |
+
|
1526 |
+
if new_height > total_height:
|
1527 |
+
total_height = new_height
|
1528 |
+
|
1529 |
+
# await page.evaluate("window.scrollTo(0, 0)")
|
1530 |
+
await self.safe_scroll(page, 0, 0)
|
1531 |
+
|
1532 |
+
except Exception as e:
|
1533 |
+
self.logger.warning(
|
1534 |
+
message="Failed to perform full page scan: {error}",
|
1535 |
+
tag="PAGE_SCAN",
|
1536 |
+
params={"error": str(e)},
|
1537 |
+
)
|
1538 |
+
else:
|
1539 |
+
# await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
1540 |
+
await self.safe_scroll(page, 0, total_height)
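A hypothetical way to trigger this full-page scan from a run config, so lazy-loaded content below the fold is rendered before capture; the keyword names are assumed to mirror the attributes read above.

config = CrawlerRunConfig(scan_full_page=True, scroll_delay=0.2)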
|
1541 |
+
|
1542 |
+
async def _handle_download(self, download):
|
1543 |
+
"""
|
1544 |
+
Handle file downloads.
|
1545 |
+
|
1546 |
+
How it works:
|
1547 |
+
1. Get the suggested filename.
|
1548 |
+
2. Get the download path.
|
1549 |
+
3. Log the download.
|
1550 |
+
4. Start the download.
|
1551 |
+
5. Save the downloaded file.
|
1552 |
+
6. Log the completion.
|
1553 |
+
|
1554 |
+
Args:
|
1555 |
+
download (Download): The Playwright download object
|
1556 |
+
|
1557 |
+
Returns:
|
1558 |
+
None
|
1559 |
+
"""
|
1560 |
+
try:
|
1561 |
+
suggested_filename = download.suggested_filename
|
1562 |
+
download_path = os.path.join(self.downloads_path, suggested_filename)
|
1563 |
+
|
1564 |
+
self.logger.info(
|
1565 |
+
message="Downloading {filename} to {path}",
|
1566 |
+
tag="FETCH",
|
1567 |
+
params={"filename": suggested_filename, "path": download_path},
|
1568 |
+
)
|
1569 |
+
|
1570 |
+
start_time = time.perf_counter()
|
1571 |
+
await download.save_as(download_path)
|
1572 |
+
end_time = time.perf_counter()
|
1573 |
+
self._downloaded_files.append(download_path)
|
1574 |
+
|
1575 |
+
self.logger.success(
|
1576 |
+
message="Downloaded {filename} successfully",
|
1577 |
+
tag="COMPLETE",
|
1578 |
+
params={
|
1579 |
+
"filename": suggested_filename,
|
1580 |
+
"path": download_path,
|
1581 |
+
"duration": f"{end_time - start_time:.2f}s",
|
1582 |
+
},
|
1583 |
+
)
|
1584 |
+
except Exception as e:
|
1585 |
+
self.logger.error(
|
1586 |
+
message="Failed to handle download: {error}",
|
1587 |
+
tag="ERROR",
|
1588 |
+
params={"error": str(e)},
|
1589 |
+
)
|
1590 |
+
|
1591 |
+
async def remove_overlay_elements(self, page: Page) -> None:
|
1592 |
+
"""
|
1593 |
+
Removes popup overlays, modals, cookie notices, and other intrusive elements from the page.
|
1594 |
+
|
1595 |
+
Args:
|
1596 |
+
page (Page): The Playwright page instance
|
1597 |
+
"""
|
1598 |
+
remove_overlays_js = load_js_script("remove_overlay_elements")
|
1599 |
+
|
1600 |
+
try:
|
1601 |
+
await page.evaluate(f"""
|
1602 |
+
(() => {{
|
1603 |
+
try {{
|
1604 |
+
{remove_overlays_js}
|
1605 |
+
return {{ success: true }};
|
1606 |
+
}} catch (error) {{
|
1607 |
+
return {{
|
1608 |
+
success: false,
|
1609 |
+
error: error.toString(),
|
1610 |
+
stack: error.stack
|
1611 |
+
}};
|
1612 |
+
}}
|
1613 |
+
}})()
|
1614 |
+
""")
|
1615 |
+
await page.wait_for_timeout(500) # Wait for any animations to complete
|
1616 |
+
except Exception as e:
|
1617 |
+
self.logger.warning(
|
1618 |
+
message="Failed to remove overlay elements: {error}",
|
1619 |
+
tag="SCRAPE",
|
1620 |
+
params={"error": str(e)},
|
1621 |
+
)
|
1622 |
+
|
1623 |
+
async def export_pdf(self, page: Page) -> bytes:
|
1624 |
+
"""
|
1625 |
+
Exports the current page as a PDF.
|
1626 |
+
|
1627 |
+
Args:
|
1628 |
+
page (Page): The Playwright page object
|
1629 |
+
|
1630 |
+
Returns:
|
1631 |
+
bytes: The PDF data
|
1632 |
+
"""
|
1633 |
+
pdf_data = await page.pdf(print_background=True)
|
1634 |
+
return pdf_data
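A hypothetical sketch of requesting a PDF and writing it to disk; the pdf keyword and the pdf_data response attribute are assumed from the constructor call elsewhere in this file.

async def save_pdf(url, out_path="page.pdf"):
    async with AsyncPlaywrightCrawlerStrategy() as strategy:
        response = await strategy.crawl(url, CrawlerRunConfig(pdf=True))
        if response.pdf_data:
            with open(out_path, "wb") as f:
                f.write(response.pdf_data)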
|
1635 |
+
|
1636 |
+
async def take_screenshot(self, page, **kwargs) -> str:
|
1637 |
+
"""
|
1638 |
+
Take a screenshot of the current page.
|
1639 |
+
|
1640 |
+
Args:
|
1641 |
+
page (Page): The Playwright page object
|
1642 |
+
kwargs: Additional keyword arguments
|
1643 |
+
|
1644 |
+
Returns:
|
1645 |
+
str: The base64-encoded screenshot data
|
1646 |
+
"""
|
1647 |
+
need_scroll = await self.page_need_scroll(page)
|
1648 |
+
|
1649 |
+
if not need_scroll:
|
1650 |
+
# Page is short enough, just take a screenshot
|
1651 |
+
return await self.take_screenshot_naive(page)
|
1652 |
+
else:
|
1653 |
+
# Page is too long, try to take a full-page screenshot
|
1654 |
+
return await self.take_screenshot_scroller(page, **kwargs)
|
1655 |
+
# return await self.take_screenshot_from_pdf(await self.export_pdf(page))
|
1656 |
+
|
1657 |
+
async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str:
|
1658 |
+
"""
|
1659 |
+
Convert the first page of the PDF to a screenshot.
|
1660 |
+
|
1661 |
+
Requires pdf2image and poppler.
|
1662 |
+
|
1663 |
+
Args:
|
1664 |
+
pdf_data (bytes): The PDF data
|
1665 |
+
|
1666 |
+
Returns:
|
1667 |
+
str: The base64-encoded screenshot data
|
1668 |
+
"""
|
1669 |
+
try:
|
1670 |
+
from pdf2image import convert_from_bytes
|
1671 |
+
|
1672 |
+
images = convert_from_bytes(pdf_data)
|
1673 |
+
final_img = images[0].convert("RGB")
|
1674 |
+
buffered = BytesIO()
|
1675 |
+
final_img.save(buffered, format="JPEG")
|
1676 |
+
return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
1677 |
+
except Exception as e:
|
1678 |
+
error_message = f"Failed to take PDF-based screenshot: {str(e)}"
|
1679 |
+
self.logger.error(
|
1680 |
+
message="PDF Screenshot failed: {error}",
|
1681 |
+
tag="ERROR",
|
1682 |
+
params={"error": error_message},
|
1683 |
+
)
|
1684 |
+
# Return error image as fallback
|
1685 |
+
img = Image.new("RGB", (800, 600), color="black")
|
1686 |
+
draw = ImageDraw.Draw(img)
|
1687 |
+
font = ImageFont.load_default()
|
1688 |
+
draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
|
1689 |
+
buffered = BytesIO()
|
1690 |
+
img.save(buffered, format="JPEG")
|
1691 |
+
return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
1692 |
+
|
1693 |
+
async def take_screenshot_scroller(self, page: Page, **kwargs) -> str:
|
1694 |
+
"""
|
1695 |
+
Attempt to set a large viewport and take a full-page screenshot.
|
1696 |
+
If still too large, segment the page as before.
|
1697 |
+
|
1698 |
+
Requires Pillow for stitching the screenshot segments.
|
1699 |
+
|
1700 |
+
Args:
|
1701 |
+
page (Page): The Playwright page object
|
1702 |
+
kwargs: Additional keyword arguments
|
1703 |
+
|
1704 |
+
Returns:
|
1705 |
+
str: The base64-encoded screenshot data
|
1706 |
+
"""
|
1707 |
+
try:
|
1708 |
+
# Get page height
|
1709 |
+
dimensions = await self.get_page_dimensions(page)
|
1710 |
+
page_width = dimensions['width']
|
1711 |
+
page_height = dimensions['height']
|
1712 |
+
# page_height = await page.evaluate("document.documentElement.scrollHeight")
|
1713 |
+
# page_width = await page.evaluate("document.documentElement.scrollWidth")
|
1714 |
+
|
1715 |
+
# Set a large viewport
|
1716 |
+
large_viewport_height = min(
|
1717 |
+
page_height,
|
1718 |
+
kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD),
|
1719 |
+
)
|
1720 |
+
await page.set_viewport_size(
|
1721 |
+
{"width": page_width, "height": large_viewport_height}
|
1722 |
+
)
|
1723 |
+
|
1724 |
+
# Page still too long, segment approach
|
1725 |
+
segments = []
|
1726 |
+
viewport_size = page.viewport_size
|
1727 |
+
viewport_height = viewport_size["height"]
|
1728 |
+
|
1729 |
+
num_segments = (page_height // viewport_height) + 1
|
1730 |
+
for i in range(num_segments):
|
1731 |
+
y_offset = i * viewport_height
|
1732 |
+
await page.evaluate(f"window.scrollTo(0, {y_offset})")
|
1733 |
+
await asyncio.sleep(0.01) # wait for render
|
1734 |
+
seg_shot = await page.screenshot(full_page=False)
|
1735 |
+
img = Image.open(BytesIO(seg_shot)).convert("RGB")
|
1736 |
+
segments.append(img)
|
1737 |
+
|
1738 |
+
total_height = sum(img.height for img in segments)
|
1739 |
+
stitched = Image.new("RGB", (segments[0].width, total_height))
|
1740 |
+
offset = 0
|
1741 |
+
for img in segments:
|
1742 |
+
# stitched.paste(img, (0, offset))
|
1743 |
+
stitched.paste(img.convert("RGB"), (0, offset))
|
1744 |
+
offset += img.height
|
1745 |
+
|
1746 |
+
buffered = BytesIO()
|
1747 |
+
stitched = stitched.convert("RGB")
|
1748 |
+
stitched.save(buffered, format="BMP", quality=85)
|
1749 |
+
encoded = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
1750 |
+
|
1751 |
+
return encoded
|
1752 |
+
except Exception as e:
|
1753 |
+
error_message = f"Failed to take large viewport screenshot: {str(e)}"
|
1754 |
+
self.logger.error(
|
1755 |
+
message="Large viewport screenshot failed: {error}",
|
1756 |
+
tag="ERROR",
|
1757 |
+
params={"error": error_message},
|
1758 |
+
)
|
1759 |
+
# return error image
|
1760 |
+
img = Image.new("RGB", (800, 600), color="black")
|
1761 |
+
draw = ImageDraw.Draw(img)
|
1762 |
+
font = ImageFont.load_default()
|
1763 |
+
draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
|
1764 |
+
buffered = BytesIO()
|
1765 |
+
img.save(buffered, format="JPEG")
|
1766 |
+
return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
1767 |
+
finally:
|
1768 |
+
await page.close()
|
1769 |
+
|
1770 |
+
async def take_screenshot_naive(self, page: Page) -> str:
|
1771 |
+
"""
|
1772 |
+
Takes a screenshot of the current page.
|
1773 |
+
|
1774 |
+
Args:
|
1775 |
+
page (Page): The Playwright page instance
|
1776 |
+
|
1777 |
+
Returns:
|
1778 |
+
str: Base64-encoded screenshot image
|
1779 |
+
"""
|
1780 |
+
try:
|
1781 |
+
# The page is already loaded, just take the screenshot
|
1782 |
+
screenshot = await page.screenshot(full_page=False)
|
1783 |
+
return base64.b64encode(screenshot).decode("utf-8")
|
1784 |
+
except Exception as e:
|
1785 |
+
error_message = f"Failed to take screenshot: {str(e)}"
|
1786 |
+
self.logger.error(
|
1787 |
+
message="Screenshot failed: {error}",
|
1788 |
+
tag="ERROR",
|
1789 |
+
params={"error": error_message},
|
1790 |
+
)
|
1791 |
+
|
1792 |
+
# Generate an error image
|
1793 |
+
img = Image.new("RGB", (800, 600), color="black")
|
1794 |
+
draw = ImageDraw.Draw(img)
|
1795 |
+
font = ImageFont.load_default()
|
1796 |
+
draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
|
1797 |
+
|
1798 |
+
buffered = BytesIO()
|
1799 |
+
img.save(buffered, format="JPEG")
|
1800 |
+
return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
1801 |
+
finally:
|
1802 |
+
await page.close()
|
1803 |
+
|
1804 |
+
async def export_storage_state(self, path: str = None) -> dict:
|
1805 |
+
"""
|
1806 |
+
Exports the current storage state (cookies, localStorage, sessionStorage)
|
1807 |
+
to a JSON file at the specified path.
|
1808 |
+
|
1809 |
+
Args:
|
1810 |
+
path (str): The path to save the storage state JSON file
|
1811 |
+
|
1812 |
+
Returns:
|
1813 |
+
dict: The exported storage state
|
1814 |
+
"""
|
1815 |
+
if self.default_context:
|
1816 |
+
state = await self.default_context.storage_state(path=path)
|
1817 |
+
self.logger.info(
|
1818 |
+
message="Exported storage state to {path}",
|
1819 |
+
tag="INFO",
|
1820 |
+
params={"path": path},
|
1821 |
+
)
|
1822 |
+
return state
|
1823 |
+
else:
|
1824 |
+
self.logger.warning(
|
1825 |
+
message="No default_context available to export storage state.",
|
1826 |
+
tag="WARNING",
|
1827 |
+
)
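A hypothetical call site, for example after a scripted login, so a later run can restore cookies and local storage; the path is illustrative.

async def save_login_state(strategy, path="storage_state.json"):
    # Writes the Playwright storage state JSON to `path` and also returns it.
    return await strategy.export_storage_state(path=path)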
|
1828 |
+
|
1829 |
+
async def robust_execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]:
|
1830 |
+
"""
|
1831 |
+
Executes user-provided JavaScript code with proper error handling and context,
|
1832 |
+
supporting both synchronous and async user code, plus navigations.
|
1833 |
+
|
1834 |
+
How it works:
|
1835 |
+
1. Wait for load state 'domcontentloaded'
|
1836 |
+
2. If js_code is a string, execute it directly
|
1837 |
+
3. If js_code is a list, execute each element in sequence
|
1838 |
+
4. Wait for load state 'networkidle'
|
1839 |
+
5. Return results
|
1840 |
+
|
1841 |
+
Args:
|
1842 |
+
page (Page): The Playwright page instance
|
1843 |
+
js_code (Union[str, List[str]]): The JavaScript code to execute
|
1844 |
+
|
1845 |
+
Returns:
|
1846 |
+
Dict[str, Any]: The results of the execution
|
1847 |
+
"""
|
1848 |
+
try:
|
1849 |
+
await page.wait_for_load_state('domcontentloaded')
|
1850 |
+
|
1851 |
+
if isinstance(js_code, str):
|
1852 |
+
scripts = [js_code]
|
1853 |
+
else:
|
1854 |
+
scripts = js_code
|
1855 |
+
|
1856 |
+
results = []
|
1857 |
+
for script in scripts:
|
1858 |
+
try:
|
1859 |
+
# Attempt the evaluate
|
1860 |
+
# If the user code triggers navigation, we catch the "context destroyed" error
|
1861 |
+
# then wait for the new page to load before continuing
|
1862 |
+
result = None
|
1863 |
+
try:
|
1864 |
+
result = await page.evaluate(f"""
|
1865 |
+
(async () => {{
|
1866 |
+
try {{
|
1867 |
+
{script}
|
1868 |
+
return {{ success: true }};
|
1869 |
+
}} catch (err) {{
|
1870 |
+
return {{ success: false, error: err.toString(), stack: err.stack }};
|
1871 |
+
}}
|
1872 |
+
}})();
|
1873 |
+
""")
|
1874 |
+
except Error as e:
|
1875 |
+
# If it's due to navigation destroying the context, handle gracefully
|
1876 |
+
if "Execution context was destroyed" in str(e):
|
1877 |
+
self.logger.info("Navigation triggered by script, waiting for load state", tag="JS_EXEC")
|
1878 |
+
try:
|
1879 |
+
await page.wait_for_load_state('load', timeout=30000)
|
1880 |
+
except Error as nav_err:
|
1881 |
+
self.logger.warning(
|
1882 |
+
message="Navigation wait failed: {error}",
|
1883 |
+
tag="JS_EXEC",
|
1884 |
+
params={"error": str(nav_err)}
|
1885 |
+
)
|
1886 |
+
try:
|
1887 |
+
await page.wait_for_load_state('networkidle', timeout=30000)
|
1888 |
+
except Error as nav_err:
|
1889 |
+
self.logger.warning(
|
1890 |
+
message="Network idle wait failed: {error}",
|
1891 |
+
tag="JS_EXEC",
|
1892 |
+
params={"error": str(nav_err)}
|
1893 |
+
)
|
1894 |
+
# Return partial success, or adapt as you see fit
|
1895 |
+
result = {
|
1896 |
+
"success": True,
|
1897 |
+
"info": "Navigation triggered, ignoring context destroyed error"
|
1898 |
+
}
|
1899 |
+
else:
|
1900 |
+
# It's some other error, log and continue
|
1901 |
+
self.logger.error(
|
1902 |
+
message="Playwright execution error: {error}",
|
1903 |
+
tag="JS_EXEC",
|
1904 |
+
params={"error": str(e)}
|
1905 |
+
)
|
1906 |
+
result = {"success": False, "error": str(e)}
|
1907 |
+
|
1908 |
+
# If we made it this far with no repeated error, do post-load waits
|
1909 |
+
t1 = time.time()
|
1910 |
+
try:
|
1911 |
+
await page.wait_for_load_state('domcontentloaded', timeout=5000)
|
1912 |
+
print("DOM content loaded after script execution in", time.time() - t1)
|
1913 |
+
except Error as e:
|
1914 |
+
self.logger.warning(
|
1915 |
+
message="DOM content load timeout: {error}",
|
1916 |
+
tag="JS_EXEC",
|
1917 |
+
params={"error": str(e)}
|
1918 |
+
)
|
1919 |
+
|
1920 |
+
# t1 = time.time()
|
1921 |
+
# try:
|
1922 |
+
# await page.wait_for_load_state('networkidle', timeout=5000)
|
1923 |
+
# print("Network idle after script execution in", time.time() - t1)
|
1924 |
+
# except Error as e:
|
1925 |
+
# self.logger.warning(
|
1926 |
+
# message="Network idle timeout: {error}",
|
1927 |
+
# tag="JS_EXEC",
|
1928 |
+
# params={"error": str(e)}
|
1929 |
+
# )
|
1930 |
+
|
1931 |
+
results.append(result if result else {"success": True})
|
1932 |
+
|
1933 |
+
except Exception as e:
|
1934 |
+
# Catch anything else
|
1935 |
+
self.logger.error(
|
1936 |
+
message="Script chunk failed: {error}",
|
1937 |
+
tag="JS_EXEC",
|
1938 |
+
params={"error": str(e)}
|
1939 |
+
)
|
1940 |
+
results.append({"success": False, "error": str(e)})
|
1941 |
+
|
1942 |
+
return {"success": True, "results": results}
|
1943 |
+
|
1944 |
+
except Exception as e:
|
1945 |
+
self.logger.error(
|
1946 |
+
message="Script execution failed: {error}",
|
1947 |
+
tag="JS_EXEC",
|
1948 |
+
params={"error": str(e)}
|
1949 |
+
)
|
1950 |
+
return {"success": False, "error": str(e)}
|
1951 |
+
|
1952 |
+
async def execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]:
|
1953 |
+
"""
|
1954 |
+
Executes user-provided JavaScript code with proper error handling and context.
|
1955 |
+
|
1956 |
+
Args:
|
1957 |
+
page: Playwright page object
|
1958 |
+
js_code: Single JavaScript string or list of JavaScript code strings
|
1959 |
+
|
1960 |
+
Returns:
|
1961 |
+
Dict containing execution status and results/errors
|
1962 |
+
"""
|
1963 |
+
try:
|
1964 |
+
# Ensure the page is ready for script execution
|
1965 |
+
await page.wait_for_load_state('domcontentloaded')
|
1966 |
+
|
1967 |
+
# Handle single script or multiple scripts
|
1968 |
+
if isinstance(js_code, str):
|
1969 |
+
scripts = [js_code]
|
1970 |
+
else:
|
1971 |
+
scripts = js_code
|
1972 |
+
|
1973 |
+
results = []
|
1974 |
+
for script in scripts:
|
1975 |
+
try:
|
1976 |
+
# Execute the script and wait for network idle
|
1977 |
+
result = await page.evaluate(f"""
|
1978 |
+
(() => {{
|
1979 |
+
return new Promise((resolve) => {{
|
1980 |
+
try {{
|
1981 |
+
const result = (function() {{
|
1982 |
+
{script}
|
1983 |
+
}})();
|
1984 |
+
|
1985 |
+
// If result is a promise, wait for it
|
1986 |
+
if (result instanceof Promise) {{
|
1987 |
+
result.then(() => {{
|
1988 |
+
// Wait a bit for any triggered effects
|
1989 |
+
setTimeout(() => resolve({{ success: true }}), 100);
|
1990 |
+
}}).catch(error => {{
|
1991 |
+
resolve({{
|
1992 |
+
success: false,
|
1993 |
+
error: error.toString(),
|
1994 |
+
stack: error.stack
|
1995 |
+
}});
|
1996 |
+
}});
|
1997 |
+
}} else {{
|
1998 |
+
// For non-promise results, still wait a bit for effects
|
1999 |
+
setTimeout(() => resolve({{ success: true }}), 100);
|
2000 |
+
}}
|
2001 |
+
}} catch (error) {{
|
2002 |
+
resolve({{
|
2003 |
+
success: false,
|
2004 |
+
error: error.toString(),
|
2005 |
+
stack: error.stack
|
2006 |
+
}});
|
2007 |
+
}}
|
2008 |
+
}});
|
2009 |
+
}})()
|
2010 |
+
""")
|
2011 |
+
|
2012 |
+
# Wait for network idle after script execution
|
2013 |
+
t1 = time.time()
|
2014 |
+
await page.wait_for_load_state('domcontentloaded', timeout=5000)
|
2015 |
+
print("DOM content loaded after script execution in", time.time() - t1)
|
2016 |
+
|
2017 |
+
t1 = time.time()
|
2018 |
+
await page.wait_for_load_state('networkidle', timeout=5000)
|
2019 |
+
print("Network idle after script execution in", time.time() - t1)
|
2020 |
+
|
2021 |
+
results.append(result if result else {"success": True})
|
2022 |
+
|
2023 |
+
except Error as e:
|
2024 |
+
# Handle Playwright-specific errors
|
2025 |
+
self.logger.error(
|
2026 |
+
message="Playwright execution error: {error}",
|
2027 |
+
tag="JS_EXEC",
|
2028 |
+
params={"error": str(e)}
|
2029 |
+
)
|
2030 |
+
results.append({"success": False, "error": str(e)})
|
2031 |
+
|
2032 |
+
return {"success": True, "results": results}
|
2033 |
+
|
2034 |
+
except Exception as e:
|
2035 |
+
self.logger.error(
|
2036 |
+
message="Script execution failed: {error}",
|
2037 |
+
tag="JS_EXEC",
|
2038 |
+
params={"error": str(e)}
|
2039 |
+
)
|
2040 |
+
return {"success": False, "error": str(e)}
|
2041 |
+
|
2042 |
+
except Exception as e:
|
2043 |
+
self.logger.error(
|
2044 |
+
message="Script execution failed: {error}",
|
2045 |
+
tag="JS_EXEC",
|
2046 |
+
params={"error": str(e)}
|
2047 |
+
)
|
2048 |
+
return {"success": False, "error": str(e)}
|
2049 |
+
|
2050 |
+
async def check_visibility(self, page):
|
2051 |
+
"""
|
2052 |
+
Checks if an element is visible on the page.
|
2053 |
+
|
2054 |
+
Args:
|
2055 |
+
page: Playwright page object
|
2056 |
+
|
2057 |
+
Returns:
|
2058 |
+
Boolean indicating visibility
|
2059 |
+
"""
|
2060 |
+
return await page.evaluate("""
|
2061 |
+
() => {
|
2062 |
+
const element = document.body;
|
2063 |
+
if (!element) return false;
|
2064 |
+
const style = window.getComputedStyle(element);
|
2065 |
+
const isVisible = style.display !== 'none' &&
|
2066 |
+
style.visibility !== 'hidden' &&
|
2067 |
+
style.opacity !== '0';
|
2068 |
+
return isVisible;
|
2069 |
+
}
|
2070 |
+
""")
|
2071 |
+
|
2072 |
+
async def safe_scroll(self, page: Page, x: int, y: int, delay: float = 0.1):
|
2073 |
+
"""
|
2074 |
+
Safely scroll the page with rendering time.
|
2075 |
+
|
2076 |
+
Args:
|
2077 |
+
page: Playwright page object
|
2078 |
+
x: Horizontal scroll position
|
2079 |
+
y: Vertical scroll position
|
2080 |
+
"""
|
2081 |
+
result = await self.csp_scroll_to(page, x, y)
|
2082 |
+
if result['success']:
|
2083 |
+
await page.wait_for_timeout(delay * 1000)
|
2084 |
+
return result
|
2085 |
+
|
2086 |
+
async def csp_scroll_to(self, page: Page, x: int, y: int) -> Dict[str, Any]:
|
2087 |
+
"""
|
2088 |
+
Performs a CSP-compliant scroll operation and returns the result status.
|
2089 |
+
|
2090 |
+
Args:
|
2091 |
+
page: Playwright page object
|
2092 |
+
x: Horizontal scroll position
|
2093 |
+
y: Vertical scroll position
|
2094 |
+
|
2095 |
+
Returns:
|
2096 |
+
Dict containing scroll status and position information
|
2097 |
+
"""
|
2098 |
+
try:
|
2099 |
+
result = await page.evaluate(
|
2100 |
+
f"""() => {{
|
2101 |
+
try {{
|
2102 |
+
const startX = window.scrollX;
|
2103 |
+
const startY = window.scrollY;
|
2104 |
+
window.scrollTo({x}, {y});
|
2105 |
+
|
2106 |
+
// Get final position after scroll
|
2107 |
+
const endX = window.scrollX;
|
2108 |
+
const endY = window.scrollY;
|
2109 |
+
|
2110 |
+
return {{
|
2111 |
+
success: true,
|
2112 |
+
startPosition: {{ x: startX, y: startY }},
|
2113 |
+
endPosition: {{ x: endX, y: endY }},
|
2114 |
+
targetPosition: {{ x: {x}, y: {y} }},
|
2115 |
+
delta: {{
|
2116 |
+
x: Math.abs(endX - {x}),
|
2117 |
+
y: Math.abs(endY - {y})
|
2118 |
+
}}
|
2119 |
+
}};
|
2120 |
+
}} catch (e) {{
|
2121 |
+
return {{
|
2122 |
+
success: false,
|
2123 |
+
error: e.toString()
|
2124 |
+
}};
|
2125 |
+
}}
|
2126 |
+
}}"""
|
2127 |
+
)
|
2128 |
+
|
2129 |
+
if not result['success']:
|
2130 |
+
self.logger.warning(
|
2131 |
+
message="Scroll operation failed: {error}",
|
2132 |
+
tag="SCROLL",
|
2133 |
+
params={"error": result.get('error')}
|
2134 |
+
)
|
2135 |
+
|
2136 |
+
return result
|
2137 |
+
|
2138 |
+
except Exception as e:
|
2139 |
+
self.logger.error(
|
2140 |
+
message="Failed to execute scroll: {error}",
|
2141 |
+
tag="SCROLL",
|
2142 |
+
params={"error": str(e)}
|
2143 |
+
)
|
2144 |
+
return {
|
2145 |
+
"success": False,
|
2146 |
+
"error": str(e)
|
2147 |
+
}
|
2148 |
+
|
2149 |
+
async def get_page_dimensions(self, page: Page):
|
2150 |
+
"""
|
2151 |
+
Get the dimensions of the page.
|
2152 |
+
|
2153 |
+
Args:
|
2154 |
+
page: Playwright page object
|
2155 |
+
|
2156 |
+
Returns:
|
2157 |
+
Dict containing width and height of the page
|
2158 |
+
"""
|
2159 |
+
return await page.evaluate("""
|
2160 |
+
() => {
|
2161 |
+
const {scrollWidth, scrollHeight} = document.documentElement;
|
2162 |
+
return {width: scrollWidth, height: scrollHeight};
|
2163 |
+
}
|
2164 |
+
""")
|
2165 |
+
|
2166 |
+
async def page_need_scroll(self, page: Page) -> bool:
|
2167 |
+
"""
|
2168 |
+
Determine whether the page needs to be scrolled
|
2169 |
+
|
2170 |
+
Args:
|
2171 |
+
page: Playwright page object
|
2172 |
+
|
2173 |
+
Returns:
|
2174 |
+
bool: True if page needs scrolling
|
2175 |
+
"""
|
2176 |
+
try:
|
2177 |
+
need_scroll = await page.evaluate("""
|
2178 |
+
() => {
|
2179 |
+
const scrollHeight = document.documentElement.scrollHeight;
|
2180 |
+
const viewportHeight = window.innerHeight;
|
2181 |
+
return scrollHeight > viewportHeight;
|
2182 |
+
}
|
2183 |
+
""")
|
2184 |
+
return need_scroll
|
2185 |
+
except Exception as e:
|
2186 |
+
self.logger.warning(
|
2187 |
+
message="Failed to check scroll need: {error}. Defaulting to True for safety.",
|
2188 |
+
tag="SCROLL",
|
2189 |
+
params={"error": str(e)}
|
2190 |
+
)
|
2191 |
+
return True # Default to scrolling if check fails
|
crawl4ai/async_database.py
ADDED
@@ -0,0 +1,495 @@
1 |
+
import os, sys
|
2 |
+
from pathlib import Path
|
3 |
+
import aiosqlite
|
4 |
+
import asyncio
|
5 |
+
from typing import Optional, Tuple, Dict
|
6 |
+
from contextlib import asynccontextmanager
|
7 |
+
import logging
|
8 |
+
import json # Added for serialization/deserialization
|
9 |
+
from .utils import ensure_content_dirs, generate_content_hash
|
10 |
+
from .models import CrawlResult, MarkdownGenerationResult
|
11 |
+
import xxhash
|
12 |
+
import aiofiles
|
13 |
+
from .config import NEED_MIGRATION
|
14 |
+
from .version_manager import VersionManager
|
15 |
+
from .async_logger import AsyncLogger
|
16 |
+
from .utils import get_error_context, create_box_message
|
17 |
+
# Set up logging
|
18 |
+
logging.basicConfig(level=logging.INFO)
|
19 |
+
logger = logging.getLogger(__name__)
|
20 |
+
|
21 |
+
base_directory = DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
|
22 |
+
os.makedirs(DB_PATH, exist_ok=True)
|
23 |
+
DB_PATH = os.path.join(base_directory, "crawl4ai.db")
|
24 |
+
|
25 |
+
class AsyncDatabaseManager:
|
26 |
+
def __init__(self, pool_size: int = 10, max_retries: int = 3):
|
27 |
+
self.db_path = DB_PATH
|
28 |
+
self.content_paths = ensure_content_dirs(os.path.dirname(DB_PATH))
|
29 |
+
self.pool_size = pool_size
|
30 |
+
self.max_retries = max_retries
|
31 |
+
self.connection_pool: Dict[int, aiosqlite.Connection] = {}
|
32 |
+
self.pool_lock = asyncio.Lock()
|
33 |
+
self.init_lock = asyncio.Lock()
|
34 |
+
self.connection_semaphore = asyncio.Semaphore(pool_size)
|
35 |
+
self._initialized = False
|
36 |
+
self.version_manager = VersionManager()
|
37 |
+
self.logger = AsyncLogger(
|
38 |
+
log_file=os.path.join(base_directory, ".crawl4ai", "crawler_db.log"),
|
39 |
+
verbose=False,
|
40 |
+
tag_width=10
|
41 |
+
)
|
42 |
+
|
43 |
+
|
44 |
+
async def initialize(self):
|
45 |
+
"""Initialize the database and connection pool"""
|
46 |
+
try:
|
47 |
+
self.logger.info("Initializing database", tag="INIT")
|
48 |
+
# Ensure the database file exists
|
49 |
+
os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
|
50 |
+
|
51 |
+
# Check if version update is needed
|
52 |
+
needs_update = self.version_manager.needs_update()
|
53 |
+
|
54 |
+
# Always ensure base table exists
|
55 |
+
await self.ainit_db()
|
56 |
+
|
57 |
+
# Verify the table exists
|
58 |
+
async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
|
59 |
+
async with db.execute(
|
60 |
+
"SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'"
|
61 |
+
) as cursor:
|
62 |
+
result = await cursor.fetchone()
|
63 |
+
if not result:
|
64 |
+
raise Exception("crawled_data table was not created")
|
65 |
+
|
66 |
+
# If version changed or fresh install, run updates
|
67 |
+
if needs_update:
|
68 |
+
self.logger.info("New version detected, running updates", tag="INIT")
|
69 |
+
await self.update_db_schema()
|
70 |
+
from .migrations import run_migration # Import here to avoid circular imports
|
71 |
+
await run_migration()
|
72 |
+
self.version_manager.update_version() # Update stored version after successful migration
|
73 |
+
self.logger.success("Version update completed successfully", tag="COMPLETE")
|
74 |
+
else:
|
75 |
+
self.logger.success("Database initialization completed successfully", tag="COMPLETE")
|
76 |
+
|
77 |
+
|
78 |
+
except Exception as e:
|
79 |
+
self.logger.error(
|
80 |
+
message="Database initialization error: {error}",
|
81 |
+
tag="ERROR",
|
82 |
+
params={"error": str(e)}
|
83 |
+
)
|
84 |
+
self.logger.info(
|
85 |
+
message="Database will be initialized on first use",
|
86 |
+
tag="INIT"
|
87 |
+
)
|
88 |
+
|
89 |
+
raise
|
90 |
+
|
91 |
+
|
92 |
+
async def cleanup(self):
|
93 |
+
"""Cleanup connections when shutting down"""
|
94 |
+
async with self.pool_lock:
|
95 |
+
for conn in self.connection_pool.values():
|
96 |
+
await conn.close()
|
97 |
+
self.connection_pool.clear()
|
98 |
+
|
99 |
+
@asynccontextmanager
|
100 |
+
async def get_connection(self):
|
101 |
+
"""Connection pool manager with enhanced error handling"""
|
102 |
+
if not self._initialized:
|
103 |
+
async with self.init_lock:
|
104 |
+
if not self._initialized:
|
105 |
+
try:
|
106 |
+
await self.initialize()
|
107 |
+
self._initialized = True
|
108 |
+
except Exception as e:
|
109 |
+
import sys
|
110 |
+
error_context = get_error_context(sys.exc_info())
|
111 |
+
self.logger.error(
|
112 |
+
message="Database initialization failed:\n{error}\n\nContext:\n{context}\n\nTraceback:\n{traceback}",
|
113 |
+
tag="ERROR",
|
114 |
+
force_verbose=True,
|
115 |
+
params={
|
116 |
+
"error": str(e),
|
117 |
+
"context": error_context["code_context"],
|
118 |
+
"traceback": error_context["full_traceback"]
|
119 |
+
}
|
120 |
+
)
|
121 |
+
raise
|
122 |
+
|
123 |
+
await self.connection_semaphore.acquire()
|
124 |
+
task_id = id(asyncio.current_task())
|
125 |
+
|
126 |
+
try:
|
127 |
+
async with self.pool_lock:
|
128 |
+
if task_id not in self.connection_pool:
|
129 |
+
try:
|
130 |
+
conn = await aiosqlite.connect(
|
131 |
+
self.db_path,
|
132 |
+
timeout=30.0
|
133 |
+
)
|
134 |
+
await conn.execute('PRAGMA journal_mode = WAL')
|
135 |
+
await conn.execute('PRAGMA busy_timeout = 5000')
|
136 |
+
|
137 |
+
# Verify database structure
|
138 |
+
async with conn.execute("PRAGMA table_info(crawled_data)") as cursor:
|
139 |
+
columns = await cursor.fetchall()
|
140 |
+
column_names = [col[1] for col in columns]
|
141 |
+
expected_columns = {
|
142 |
+
'url', 'html', 'cleaned_html', 'markdown', 'extracted_content',
|
143 |
+
'success', 'media', 'links', 'metadata', 'screenshot',
|
144 |
+
'response_headers', 'downloaded_files'
|
145 |
+
}
|
146 |
+
missing_columns = expected_columns - set(column_names)
|
147 |
+
if missing_columns:
|
148 |
+
raise ValueError(f"Database missing columns: {missing_columns}")
|
149 |
+
|
150 |
+
self.connection_pool[task_id] = conn
|
151 |
+
except Exception as e:
|
152 |
+
import sys
|
153 |
+
error_context = get_error_context(sys.exc_info())
|
154 |
+
error_message = (
|
155 |
+
f"Unexpected error in db get_connection at line {error_context['line_no']} "
|
156 |
+
f"in {error_context['function']} ({error_context['filename']}):\n"
|
157 |
+
f"Error: {str(e)}\n\n"
|
158 |
+
f"Code context:\n{error_context['code_context']}"
|
159 |
+
)
|
160 |
+
self.logger.error(
|
161 |
+
message=create_box_message(error_message, type="error"),
|
162 |
+
)
|
163 |
+
|
164 |
+
raise
|
165 |
+
|
166 |
+
yield self.connection_pool[task_id]
|
167 |
+
|
168 |
+
except Exception as e:
|
169 |
+
import sys
|
170 |
+
error_context = get_error_context(sys.exc_info())
|
171 |
+
error_message = (
|
172 |
+
f"Unexpected error in db get_connection at line {error_context['line_no']} "
|
173 |
+
f"in {error_context['function']} ({error_context['filename']}):\n"
|
174 |
+
f"Error: {str(e)}\n\n"
|
175 |
+
f"Code context:\n{error_context['code_context']}"
|
176 |
+
)
|
177 |
+
self.logger.error(
|
178 |
+
message=create_box_message(error_message, type="error"),
|
179 |
+
)
|
180 |
+
raise
|
181 |
+
finally:
|
182 |
+
async with self.pool_lock:
|
183 |
+
if task_id in self.connection_pool:
|
184 |
+
await self.connection_pool[task_id].close()
|
185 |
+
del self.connection_pool[task_id]
|
186 |
+
self.connection_semaphore.release()
|
187 |
+
|
188 |
+
|
189 |
+
async def execute_with_retry(self, operation, *args):
|
190 |
+
"""Execute database operations with retry logic"""
|
191 |
+
for attempt in range(self.max_retries):
|
192 |
+
try:
|
193 |
+
async with self.get_connection() as db:
|
194 |
+
result = await operation(db, *args)
|
195 |
+
await db.commit()
|
196 |
+
return result
|
197 |
+
except Exception as e:
|
198 |
+
if attempt == self.max_retries - 1:
|
199 |
+
self.logger.error(
|
200 |
+
message="Operation failed after {retries} attempts: {error}",
|
201 |
+
tag="ERROR",
|
202 |
+
force_verbose=True,
|
203 |
+
params={
|
204 |
+
"retries": self.max_retries,
|
205 |
+
"error": str(e)
|
206 |
+
}
|
207 |
+
)
|
208 |
+
raise
|
209 |
+
await asyncio.sleep(1 * (attempt + 1))  # Linear backoff between retries
|
210 |
+
|
211 |
+
async def ainit_db(self):
|
212 |
+
"""Initialize database schema"""
|
213 |
+
async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
|
214 |
+
await db.execute('''
|
215 |
+
CREATE TABLE IF NOT EXISTS crawled_data (
|
216 |
+
url TEXT PRIMARY KEY,
|
217 |
+
html TEXT,
|
218 |
+
cleaned_html TEXT,
|
219 |
+
markdown TEXT,
|
220 |
+
extracted_content TEXT,
|
221 |
+
success BOOLEAN,
|
222 |
+
media TEXT DEFAULT "{}",
|
223 |
+
links TEXT DEFAULT "{}",
|
224 |
+
metadata TEXT DEFAULT "{}",
|
225 |
+
screenshot TEXT DEFAULT "",
|
226 |
+
response_headers TEXT DEFAULT "{}",
|
227 |
+
downloaded_files TEXT DEFAULT "{}" -- New column added
|
228 |
+
)
|
229 |
+
''')
|
230 |
+
await db.commit()
|
231 |
+
|
232 |
+
|
233 |
+
|
234 |
+
async def update_db_schema(self):
|
235 |
+
"""Update database schema if needed"""
|
236 |
+
async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
|
237 |
+
cursor = await db.execute("PRAGMA table_info(crawled_data)")
|
238 |
+
columns = await cursor.fetchall()
|
239 |
+
column_names = [column[1] for column in columns]
|
240 |
+
|
241 |
+
# List of new columns to add
|
242 |
+
new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files']
|
243 |
+
|
244 |
+
for column in new_columns:
|
245 |
+
if column not in column_names:
|
246 |
+
await self.aalter_db_add_column(column, db)
|
247 |
+
await db.commit()
|
248 |
+
|
249 |
+
async def aalter_db_add_column(self, new_column: str, db):
|
250 |
+
"""Add new column to the database"""
|
251 |
+
if new_column == 'response_headers':
|
252 |
+
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"')
|
253 |
+
else:
|
254 |
+
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
|
255 |
+
self.logger.info(
|
256 |
+
message="Added column '{column}' to the database",
|
257 |
+
tag="INIT",
|
258 |
+
params={"column": new_column}
|
259 |
+
)
|
260 |
+
|
261 |
+
|
262 |
+
async def aget_cached_url(self, url: str) -> Optional[CrawlResult]:
|
263 |
+
"""Retrieve cached URL data as CrawlResult"""
|
264 |
+
async def _get(db):
|
265 |
+
async with db.execute(
|
266 |
+
'SELECT * FROM crawled_data WHERE url = ?', (url,)
|
267 |
+
) as cursor:
|
268 |
+
row = await cursor.fetchone()
|
269 |
+
if not row:
|
270 |
+
return None
|
271 |
+
|
272 |
+
# Get column names
|
273 |
+
columns = [description[0] for description in cursor.description]
|
274 |
+
# Create dict from row data
|
275 |
+
row_dict = dict(zip(columns, row))
|
276 |
+
|
277 |
+
# Load content from files using stored hashes
|
278 |
+
content_fields = {
|
279 |
+
'html': row_dict['html'],
|
280 |
+
'cleaned_html': row_dict['cleaned_html'],
|
281 |
+
'markdown': row_dict['markdown'],
|
282 |
+
'extracted_content': row_dict['extracted_content'],
|
283 |
+
'screenshot': row_dict['screenshot'],
|
284 |
+
'screenshots': row_dict['screenshot'],
|
285 |
+
}
|
286 |
+
|
287 |
+
for field, hash_value in content_fields.items():
|
288 |
+
if hash_value:
|
289 |
+
content = await self._load_content(
|
290 |
+
hash_value,
|
291 |
+
field.split('_')[0] # Get content type from field name
|
292 |
+
)
|
293 |
+
row_dict[field] = content or ""
|
294 |
+
else:
|
295 |
+
row_dict[field] = ""
|
296 |
+
|
297 |
+
# Parse JSON fields
|
298 |
+
json_fields = ['media', 'links', 'metadata', 'response_headers', 'markdown']
|
299 |
+
for field in json_fields:
|
300 |
+
try:
|
301 |
+
row_dict[field] = json.loads(row_dict[field]) if row_dict[field] else {}
|
302 |
+
except json.JSONDecodeError:
|
303 |
+
row_dict[field] = {}
|
304 |
+
|
305 |
+
if isinstance(row_dict['markdown'], Dict):
|
306 |
+
row_dict['markdown_v2'] = row_dict['markdown']
|
307 |
+
if row_dict['markdown'].get('raw_markdown'):
|
308 |
+
row_dict['markdown'] = row_dict['markdown']['raw_markdown']
|
309 |
+
|
310 |
+
# Parse downloaded_files
|
311 |
+
try:
|
312 |
+
row_dict['downloaded_files'] = json.loads(row_dict['downloaded_files']) if row_dict['downloaded_files'] else []
|
313 |
+
except json.JSONDecodeError:
|
314 |
+
row_dict['downloaded_files'] = []
|
315 |
+
|
316 |
+
# Remove any fields not in CrawlResult model
|
317 |
+
valid_fields = CrawlResult.__annotations__.keys()
|
318 |
+
filtered_dict = {k: v for k, v in row_dict.items() if k in valid_fields}
|
319 |
+
|
320 |
+
return CrawlResult(**filtered_dict)
|
321 |
+
|
322 |
+
try:
|
323 |
+
return await self.execute_with_retry(_get)
|
324 |
+
except Exception as e:
|
325 |
+
self.logger.error(
|
326 |
+
message="Error retrieving cached URL: {error}",
|
327 |
+
tag="ERROR",
|
328 |
+
force_verbose=True,
|
329 |
+
params={"error": str(e)}
|
330 |
+
)
|
331 |
+
return None
|
332 |
+
|
333 |
+
async def acache_url(self, result: CrawlResult):
|
334 |
+
"""Cache CrawlResult data"""
|
335 |
+
# Store content files and get hashes
|
336 |
+
content_map = {
|
337 |
+
'html': (result.html, 'html'),
|
338 |
+
'cleaned_html': (result.cleaned_html or "", 'cleaned'),
|
339 |
+
'markdown': None,
|
340 |
+
'extracted_content': (result.extracted_content or "", 'extracted'),
|
341 |
+
'screenshot': (result.screenshot or "", 'screenshots')
|
342 |
+
}
|
343 |
+
|
344 |
+
try:
|
345 |
+
if isinstance(result.markdown, MarkdownGenerationResult):
|
346 |
+
content_map['markdown'] = (result.markdown.model_dump_json(), 'markdown')
|
347 |
+
elif hasattr(result, 'markdown_v2'):
|
348 |
+
content_map['markdown'] = (result.markdown_v2.model_dump_json(), 'markdown')
|
349 |
+
elif isinstance(result.markdown, str):
|
350 |
+
markdown_result = MarkdownGenerationResult(raw_markdown=result.markdown)
|
351 |
+
content_map['markdown'] = (markdown_result.model_dump_json(), 'markdown')
|
352 |
+
else:
|
353 |
+
content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')
|
354 |
+
except Exception as e:
|
355 |
+
self.logger.warning(
|
356 |
+
message=f"Error processing markdown content: {str(e)}",
|
357 |
+
tag="WARNING"
|
358 |
+
)
|
359 |
+
# Fallback to empty markdown result
|
360 |
+
content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')
|
361 |
+
|
362 |
+
content_hashes = {}
|
363 |
+
for field, (content, content_type) in content_map.items():
|
364 |
+
content_hashes[field] = await self._store_content(content, content_type)
|
365 |
+
|
366 |
+
async def _cache(db):
|
367 |
+
await db.execute('''
|
368 |
+
INSERT INTO crawled_data (
|
369 |
+
url, html, cleaned_html, markdown,
|
370 |
+
extracted_content, success, media, links, metadata,
|
371 |
+
screenshot, response_headers, downloaded_files
|
372 |
+
)
|
373 |
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
374 |
+
ON CONFLICT(url) DO UPDATE SET
|
375 |
+
html = excluded.html,
|
376 |
+
cleaned_html = excluded.cleaned_html,
|
377 |
+
markdown = excluded.markdown,
|
378 |
+
extracted_content = excluded.extracted_content,
|
379 |
+
success = excluded.success,
|
380 |
+
media = excluded.media,
|
381 |
+
links = excluded.links,
|
382 |
+
metadata = excluded.metadata,
|
383 |
+
screenshot = excluded.screenshot,
|
384 |
+
response_headers = excluded.response_headers,
|
385 |
+
downloaded_files = excluded.downloaded_files
|
386 |
+
''', (
|
387 |
+
result.url,
|
388 |
+
content_hashes['html'],
|
389 |
+
content_hashes['cleaned_html'],
|
390 |
+
content_hashes['markdown'],
|
391 |
+
content_hashes['extracted_content'],
|
392 |
+
result.success,
|
393 |
+
json.dumps(result.media),
|
394 |
+
json.dumps(result.links),
|
395 |
+
json.dumps(result.metadata or {}),
|
396 |
+
content_hashes['screenshot'],
|
397 |
+
json.dumps(result.response_headers or {}),
|
398 |
+
json.dumps(result.downloaded_files or [])
|
399 |
+
))
|
400 |
+
|
401 |
+
try:
|
402 |
+
await self.execute_with_retry(_cache)
|
403 |
+
except Exception as e:
|
404 |
+
self.logger.error(
|
405 |
+
message="Error caching URL: {error}",
|
406 |
+
tag="ERROR",
|
407 |
+
force_verbose=True,
|
408 |
+
params={"error": str(e)}
|
409 |
+
)
|
410 |
+
|
411 |
+
|
412 |
+
async def aget_total_count(self) -> int:
|
413 |
+
"""Get total number of cached URLs"""
|
414 |
+
async def _count(db):
|
415 |
+
async with db.execute('SELECT COUNT(*) FROM crawled_data') as cursor:
|
416 |
+
result = await cursor.fetchone()
|
417 |
+
return result[0] if result else 0
|
418 |
+
|
419 |
+
try:
|
420 |
+
return await self.execute_with_retry(_count)
|
421 |
+
except Exception as e:
|
422 |
+
self.logger.error(
|
423 |
+
message="Error getting total count: {error}",
|
424 |
+
tag="ERROR",
|
425 |
+
force_verbose=True,
|
426 |
+
params={"error": str(e)}
|
427 |
+
)
|
428 |
+
return 0
|
429 |
+
|
430 |
+
async def aclear_db(self):
|
431 |
+
"""Clear all data from the database"""
|
432 |
+
async def _clear(db):
|
433 |
+
await db.execute('DELETE FROM crawled_data')
|
434 |
+
|
435 |
+
try:
|
436 |
+
await self.execute_with_retry(_clear)
|
437 |
+
except Exception as e:
|
438 |
+
self.logger.error(
|
439 |
+
message="Error clearing database: {error}",
|
440 |
+
tag="ERROR",
|
441 |
+
force_verbose=True,
|
442 |
+
params={"error": str(e)}
|
443 |
+
)
|
444 |
+
|
445 |
+
async def aflush_db(self):
|
446 |
+
"""Drop the entire table"""
|
447 |
+
async def _flush(db):
|
448 |
+
await db.execute('DROP TABLE IF EXISTS crawled_data')
|
449 |
+
|
450 |
+
try:
|
451 |
+
await self.execute_with_retry(_flush)
|
452 |
+
except Exception as e:
|
453 |
+
self.logger.error(
|
454 |
+
message="Error flushing database: {error}",
|
455 |
+
tag="ERROR",
|
456 |
+
force_verbose=True,
|
457 |
+
params={"error": str(e)}
|
458 |
+
)
|
459 |
+
|
460 |
+
|
461 |
+
async def _store_content(self, content: str, content_type: str) -> str:
|
462 |
+
"""Store content in filesystem and return hash"""
|
463 |
+
if not content:
|
464 |
+
return ""
|
465 |
+
|
466 |
+
content_hash = generate_content_hash(content)
|
467 |
+
file_path = os.path.join(self.content_paths[content_type], content_hash)
|
468 |
+
|
469 |
+
# Only write if file doesn't exist
|
470 |
+
if not os.path.exists(file_path):
|
471 |
+
async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
|
472 |
+
await f.write(content)
|
473 |
+
|
474 |
+
return content_hash
|
475 |
+
|
476 |
+
async def _load_content(self, content_hash: str, content_type: str) -> Optional[str]:
|
477 |
+
"""Load content from filesystem by hash"""
|
478 |
+
if not content_hash:
|
479 |
+
return None
|
480 |
+
|
481 |
+
file_path = os.path.join(self.content_paths[content_type], content_hash)
|
482 |
+
try:
|
483 |
+
async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
|
484 |
+
return await f.read()
|
485 |
+
except Exception:
|
486 |
+
self.logger.error(
|
487 |
+
message="Failed to load content: {file_path}",
|
488 |
+
tag="ERROR",
|
489 |
+
force_verbose=True,
|
490 |
+
params={"file_path": file_path}
|
491 |
+
)
|
492 |
+
return None
|
493 |
+
|
494 |
+
# Create a singleton instance
|
495 |
+
async_db_manager = AsyncDatabaseManager()
|
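The file closes by exporting a module-level singleton, async_db_manager, which lazily initializes the SQLite schema on first use via get_connection. A minimal usage sketch is shown below; the example URL and the asyncio.run wiring are illustrative assumptions, not part of this diff.

import asyncio
from crawl4ai.async_database import async_db_manager

async def main():
    # Returns a CrawlResult when the URL was cached earlier, otherwise None.
    cached = await async_db_manager.aget_cached_url("https://example.com")
    if cached is not None:
        print(f"cached HTML length: {len(cached.html)}")
    print(f"total cached URLs: {await async_db_manager.aget_total_count()}")
    # Close any pooled connections when shutting down.
    await async_db_manager.cleanup()

asyncio.run(main())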
crawl4ai/async_logger.py
ADDED
@@ -0,0 +1,231 @@
1 |
+
from enum import Enum
|
2 |
+
from typing import Optional, Dict, Any, Union
|
3 |
+
from colorama import Fore, Back, Style, init
|
4 |
+
import time
|
5 |
+
import os
|
6 |
+
from datetime import datetime
|
7 |
+
|
8 |
+
class LogLevel(Enum):
|
9 |
+
DEBUG = 1
|
10 |
+
INFO = 2
|
11 |
+
SUCCESS = 3
|
12 |
+
WARNING = 4
|
13 |
+
ERROR = 5
|
14 |
+
|
15 |
+
class AsyncLogger:
|
16 |
+
"""
|
17 |
+
Asynchronous logger with support for colored console output and file logging.
|
18 |
+
Supports templated messages with colored components.
|
19 |
+
"""
|
20 |
+
|
21 |
+
DEFAULT_ICONS = {
|
22 |
+
'INIT': '→',
|
23 |
+
'READY': '✓',
|
24 |
+
'FETCH': '↓',
|
25 |
+
'SCRAPE': '◆',
|
26 |
+
'EXTRACT': '■',
|
27 |
+
'COMPLETE': '●',
|
28 |
+
'ERROR': '×',
|
29 |
+
'DEBUG': '⋯',
|
30 |
+
'INFO': 'ℹ',
|
31 |
+
'WARNING': '⚠',
|
32 |
+
}
|
33 |
+
|
34 |
+
DEFAULT_COLORS = {
|
35 |
+
LogLevel.DEBUG: Fore.LIGHTBLACK_EX,
|
36 |
+
LogLevel.INFO: Fore.CYAN,
|
37 |
+
LogLevel.SUCCESS: Fore.GREEN,
|
38 |
+
LogLevel.WARNING: Fore.YELLOW,
|
39 |
+
LogLevel.ERROR: Fore.RED,
|
40 |
+
}
|
41 |
+
|
42 |
+
def __init__(
|
43 |
+
self,
|
44 |
+
log_file: Optional[str] = None,
|
45 |
+
log_level: LogLevel = LogLevel.DEBUG,
|
46 |
+
tag_width: int = 10,
|
47 |
+
icons: Optional[Dict[str, str]] = None,
|
48 |
+
colors: Optional[Dict[LogLevel, str]] = None,
|
49 |
+
verbose: bool = True
|
50 |
+
):
|
51 |
+
"""
|
52 |
+
Initialize the logger.
|
53 |
+
|
54 |
+
Args:
|
55 |
+
log_file: Optional file path for logging
|
56 |
+
log_level: Minimum log level to display
|
57 |
+
tag_width: Width for tag formatting
|
58 |
+
icons: Custom icons for different tags
|
59 |
+
colors: Custom colors for different log levels
|
60 |
+
verbose: Whether to output to console
|
61 |
+
"""
|
62 |
+
init() # Initialize colorama
|
63 |
+
self.log_file = log_file
|
64 |
+
self.log_level = log_level
|
65 |
+
self.tag_width = tag_width
|
66 |
+
self.icons = icons or self.DEFAULT_ICONS
|
67 |
+
self.colors = colors or self.DEFAULT_COLORS
|
68 |
+
self.verbose = verbose
|
69 |
+
|
70 |
+
# Create log file directory if needed
|
71 |
+
if log_file:
|
72 |
+
os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)
|
73 |
+
|
74 |
+
def _format_tag(self, tag: str) -> str:
|
75 |
+
"""Format a tag with consistent width."""
|
76 |
+
return f"[{tag}]".ljust(self.tag_width, ".")
|
77 |
+
|
78 |
+
def _get_icon(self, tag: str) -> str:
|
79 |
+
"""Get the icon for a tag, defaulting to info icon if not found."""
|
80 |
+
return self.icons.get(tag, self.icons['INFO'])
|
81 |
+
|
82 |
+
def _write_to_file(self, message: str):
|
83 |
+
"""Write a message to the log file if configured."""
|
84 |
+
if self.log_file:
|
85 |
+
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
|
86 |
+
with open(self.log_file, 'a', encoding='utf-8') as f:
|
87 |
+
# Strip ANSI color codes for file output
|
88 |
+
clean_message = message.replace(Fore.RESET, '').replace(Style.RESET_ALL, '')
|
89 |
+
for color in vars(Fore).values():
|
90 |
+
if isinstance(color, str):
|
91 |
+
clean_message = clean_message.replace(color, '')
|
92 |
+
f.write(f"[{timestamp}] {clean_message}\n")
|
93 |
+
|
94 |
+
def _log(
|
95 |
+
self,
|
96 |
+
level: LogLevel,
|
97 |
+
message: str,
|
98 |
+
tag: str,
|
99 |
+
params: Optional[Dict[str, Any]] = None,
|
100 |
+
colors: Optional[Dict[str, str]] = None,
|
101 |
+
base_color: Optional[str] = None,
|
102 |
+
**kwargs
|
103 |
+
):
|
104 |
+
"""
|
105 |
+
Core logging method that handles message formatting and output.
|
106 |
+
|
107 |
+
Args:
|
108 |
+
level: Log level for this message
|
109 |
+
message: Message template string
|
110 |
+
tag: Tag for the message
|
111 |
+
params: Parameters to format into the message
|
112 |
+
colors: Color overrides for specific parameters
|
113 |
+
base_color: Base color for the entire message
|
114 |
+
"""
|
115 |
+
if level.value < self.log_level.value:
|
116 |
+
return
|
117 |
+
|
118 |
+
# Format the message with parameters if provided
|
119 |
+
if params:
|
120 |
+
try:
|
121 |
+
# First format the message with raw parameters
|
122 |
+
formatted_message = message.format(**params)
|
123 |
+
|
124 |
+
# Then apply colors if specified
|
125 |
+
if colors:
|
126 |
+
for key, color in colors.items():
|
127 |
+
# Find the formatted value in the message and wrap it with color
|
128 |
+
if key in params:
|
129 |
+
value_str = str(params[key])
|
130 |
+
formatted_message = formatted_message.replace(
|
131 |
+
value_str,
|
132 |
+
f"{color}{value_str}{Style.RESET_ALL}"
|
133 |
+
)
|
134 |
+
|
135 |
+
except KeyError as e:
|
136 |
+
formatted_message = f"LOGGING ERROR: Missing parameter {e} in message template"
|
137 |
+
level = LogLevel.ERROR
|
138 |
+
else:
|
139 |
+
formatted_message = message
|
140 |
+
|
141 |
+
# Construct the full log line
|
142 |
+
color = base_color or self.colors[level]
|
143 |
+
log_line = f"{color}{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message}{Style.RESET_ALL}"
|
144 |
+
|
145 |
+
# Output to console if verbose
|
146 |
+
if self.verbose or kwargs.get("force_verbose", False):
|
147 |
+
print(log_line)
|
148 |
+
|
149 |
+
# Write to file if configured
|
150 |
+
self._write_to_file(log_line)
|
151 |
+
|
152 |
+
def debug(self, message: str, tag: str = "DEBUG", **kwargs):
|
153 |
+
"""Log a debug message."""
|
154 |
+
self._log(LogLevel.DEBUG, message, tag, **kwargs)
|
155 |
+
|
156 |
+
def info(self, message: str, tag: str = "INFO", **kwargs):
|
157 |
+
"""Log an info message."""
|
158 |
+
self._log(LogLevel.INFO, message, tag, **kwargs)
|
159 |
+
|
160 |
+
def success(self, message: str, tag: str = "SUCCESS", **kwargs):
|
161 |
+
"""Log a success message."""
|
162 |
+
self._log(LogLevel.SUCCESS, message, tag, **kwargs)
|
163 |
+
|
164 |
+
def warning(self, message: str, tag: str = "WARNING", **kwargs):
|
165 |
+
"""Log a warning message."""
|
166 |
+
self._log(LogLevel.WARNING, message, tag, **kwargs)
|
167 |
+
|
168 |
+
def error(self, message: str, tag: str = "ERROR", **kwargs):
|
169 |
+
"""Log an error message."""
|
170 |
+
self._log(LogLevel.ERROR, message, tag, **kwargs)
|
171 |
+
|
172 |
+
def url_status(
|
173 |
+
self,
|
174 |
+
url: str,
|
175 |
+
success: bool,
|
176 |
+
timing: float,
|
177 |
+
tag: str = "FETCH",
|
178 |
+
url_length: int = 50
|
179 |
+
):
|
180 |
+
"""
|
181 |
+
Convenience method for logging URL fetch status.
|
182 |
+
|
183 |
+
Args:
|
184 |
+
url: The URL being processed
|
185 |
+
success: Whether the operation was successful
|
186 |
+
timing: Time taken for the operation
|
187 |
+
tag: Tag for the message
|
188 |
+
url_length: Maximum length for URL in log
|
189 |
+
"""
|
190 |
+
self._log(
|
191 |
+
level=LogLevel.SUCCESS if success else LogLevel.ERROR,
|
192 |
+
message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s",
|
193 |
+
tag=tag,
|
194 |
+
params={
|
195 |
+
"url": url,
|
196 |
+
"url_length": url_length,
|
197 |
+
"status": success,
|
198 |
+
"timing": timing
|
199 |
+
},
|
200 |
+
colors={
|
201 |
+
"status": Fore.GREEN if success else Fore.RED,
|
202 |
+
"timing": Fore.YELLOW
|
203 |
+
}
|
204 |
+
)
|
205 |
+
|
206 |
+
def error_status(
|
207 |
+
self,
|
208 |
+
url: str,
|
209 |
+
error: str,
|
210 |
+
tag: str = "ERROR",
|
211 |
+
url_length: int = 50
|
212 |
+
):
|
213 |
+
"""
|
214 |
+
Convenience method for logging error status.
|
215 |
+
|
216 |
+
Args:
|
217 |
+
url: The URL being processed
|
218 |
+
error: Error message
|
219 |
+
tag: Tag for the message
|
220 |
+
url_length: Maximum length for URL in log
|
221 |
+
"""
|
222 |
+
self._log(
|
223 |
+
level=LogLevel.ERROR,
|
224 |
+
message="{url:.{url_length}}... | Error: {error}",
|
225 |
+
tag=tag,
|
226 |
+
params={
|
227 |
+
"url": url,
|
228 |
+
"url_length": url_length,
|
229 |
+
"error": error
|
230 |
+
}
|
231 |
+
)
|
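AsyncLogger formats each record as a fixed-width tag, an icon, and a message template expanded from params, with optional per-parameter colors; file output strips the ANSI codes. Despite the name, the logging methods themselves are synchronous, so they can be called from async code without awaiting. A short console-only usage sketch follows (log_file left as None); the URL and timing values are illustrative assumptions, not part of this diff.

from colorama import Fore
from crawl4ai.async_logger import AsyncLogger, LogLevel

logger = AsyncLogger(log_file=None, log_level=LogLevel.DEBUG, verbose=True)

# Template parameters are substituted first, then selected values are colored.
logger.info(
    message="Fetched {url} in {timing:.2f}s",
    tag="FETCH",
    params={"url": "https://example.com", "timing": 1.23},
    colors={"timing": Fore.YELLOW},
)

# Convenience helper that picks SUCCESS or ERROR based on the boolean flag.
logger.url_status(url="https://example.com", success=True, timing=1.23)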
crawl4ai/async_webcrawler.py
ADDED
@@ -0,0 +1,833 @@
1 |
+
import os, sys
|
2 |
+
import time
|
3 |
+
import warnings
|
4 |
+
from enum import Enum
|
5 |
+
from colorama import init, Fore, Back, Style
|
6 |
+
from pathlib import Path
|
7 |
+
from typing import Optional, List, Union
|
8 |
+
import json
|
9 |
+
import asyncio
|
10 |
+
# from contextlib import nullcontext, asynccontextmanager
|
11 |
+
from contextlib import asynccontextmanager
|
12 |
+
from .models import CrawlResult, MarkdownGenerationResult
|
13 |
+
from .async_database import async_db_manager
|
14 |
+
from .chunking_strategy import *
|
15 |
+
from .content_filter_strategy import *
|
16 |
+
from .extraction_strategy import *
|
17 |
+
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
|
18 |
+
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
|
19 |
+
from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
|
20 |
+
from .content_scraping_strategy import WebScrapingStrategy
|
21 |
+
from .async_logger import AsyncLogger
|
22 |
+
from .async_configs import BrowserConfig, CrawlerRunConfig
|
23 |
+
from .config import (
|
24 |
+
MIN_WORD_THRESHOLD,
|
25 |
+
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
26 |
+
URL_LOG_SHORTEN_LENGTH
|
27 |
+
)
|
28 |
+
from .utils import (
|
29 |
+
sanitize_input_encode,
|
30 |
+
InvalidCSSSelectorError,
|
31 |
+
format_html,
|
32 |
+
fast_format_html,
|
33 |
+
create_box_message
|
34 |
+
)
|
35 |
+
|
36 |
+
from urllib.parse import urlparse
|
37 |
+
import random
|
38 |
+
from .__version__ import __version__ as crawl4ai_version
|
39 |
+
|
40 |
+
|
41 |
+
class AsyncWebCrawler:
|
42 |
+
"""
|
43 |
+
Asynchronous web crawler with flexible caching capabilities.
|
44 |
+
|
45 |
+
There are two ways to use the crawler:
|
46 |
+
|
47 |
+
1. Using context manager (recommended for simple cases):
|
48 |
+
```python
|
49 |
+
async with AsyncWebCrawler() as crawler:
|
50 |
+
result = await crawler.arun(url="https://example.com")
|
51 |
+
```
|
52 |
+
|
53 |
+
2. Using explicit lifecycle management (recommended for long-running applications):
|
54 |
+
```python
|
55 |
+
crawler = AsyncWebCrawler()
|
56 |
+
await crawler.start()
|
57 |
+
|
58 |
+
# Use the crawler multiple times
|
59 |
+
result1 = await crawler.arun(url="https://example.com")
|
60 |
+
result2 = await crawler.arun(url="https://another.com")
|
61 |
+
|
62 |
+
await crawler.close()
|
63 |
+
```
|
64 |
+
|
65 |
+
Migration Guide:
|
66 |
+
Old way (deprecated):
|
67 |
+
crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True)
|
68 |
+
|
69 |
+
New way (recommended):
|
70 |
+
browser_config = BrowserConfig(browser_type="chromium", headless=True)
|
71 |
+
crawler = AsyncWebCrawler(config=browser_config)
|
72 |
+
|
73 |
+
|
74 |
+
Attributes:
|
75 |
+
browser_config (BrowserConfig): Configuration object for browser settings.
|
76 |
+
crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
|
77 |
+
logger (AsyncLogger): Logger instance for recording events and errors.
|
78 |
+
always_bypass_cache (bool): Whether to always bypass cache.
|
79 |
+
crawl4ai_folder (str): Directory for storing cache.
|
80 |
+
base_directory (str): Base directory for storing cache.
|
81 |
+
ready (bool): Whether the crawler is ready for use.
|
82 |
+
|
83 |
+
Methods:
|
84 |
+
start(): Start the crawler explicitly without using context manager.
|
85 |
+
close(): Close the crawler explicitly without using context manager.
|
86 |
+
arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
|
87 |
+
awarmup(): Perform warmup sequence.
|
88 |
+
arun_many(): Run the crawler for multiple sources.
|
89 |
+
aprocess_html(): Process HTML content.
|
90 |
+
|
91 |
+
Typical Usage:
|
92 |
+
async with AsyncWebCrawler() as crawler:
|
93 |
+
result = await crawler.arun(url="https://example.com")
|
94 |
+
print(result.markdown)
|
95 |
+
|
96 |
+
Using configuration:
|
97 |
+
browser_config = BrowserConfig(browser_type="chromium", headless=True)
|
98 |
+
async with AsyncWebCrawler(config=browser_config) as crawler:
|
99 |
+
crawler_config = CrawlerRunConfig(
|
100 |
+
cache_mode=CacheMode.BYPASS
|
101 |
+
)
|
102 |
+
result = await crawler.arun(url="https://example.com", config=crawler_config)
|
103 |
+
print(result.markdown)
|
104 |
+
"""
|
105 |
+
_domain_last_hit = {}
|
106 |
+
|
107 |
+
def __init__(
|
108 |
+
self,
|
109 |
+
crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
|
110 |
+
config: Optional[BrowserConfig] = None,
|
111 |
+
always_bypass_cache: bool = False,
|
112 |
+
always_by_pass_cache: Optional[bool] = None, # Deprecated parameter
|
113 |
+
base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
|
114 |
+
thread_safe: bool = False,
|
115 |
+
**kwargs,
|
116 |
+
):
|
117 |
+
"""
|
118 |
+
Initialize the AsyncWebCrawler.
|
119 |
+
|
120 |
+
Args:
|
121 |
+
crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy
|
122 |
+
config: Configuration object for browser settings. If None, will be created from kwargs
|
123 |
+
always_bypass_cache: Whether to always bypass cache (new parameter)
|
124 |
+
always_by_pass_cache: Deprecated, use always_bypass_cache instead
|
125 |
+
base_directory: Base directory for storing cache
|
126 |
+
thread_safe: Whether to use thread-safe operations
|
127 |
+
**kwargs: Additional arguments for backwards compatibility
|
128 |
+
"""
|
129 |
+
# Handle browser configuration
|
130 |
+
browser_config = config
|
131 |
+
if browser_config is not None:
|
132 |
+
if any(k in kwargs for k in ["browser_type", "headless", "viewport_width", "viewport_height"]):
|
133 |
+
self.logger.warning(
|
134 |
+
message="Both browser_config and legacy browser parameters provided. browser_config will take precedence.",
|
135 |
+
tag="WARNING"
|
136 |
+
)
|
137 |
+
else:
|
138 |
+
# Create browser config from kwargs for backwards compatibility
|
139 |
+
browser_config = BrowserConfig.from_kwargs(kwargs)
|
140 |
+
|
141 |
+
self.browser_config = browser_config
|
142 |
+
|
143 |
+
# Initialize logger first since other components may need it
|
144 |
+
self.logger = AsyncLogger(
|
145 |
+
log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
|
146 |
+
verbose=self.browser_config.verbose,
|
147 |
+
tag_width=10
|
148 |
+
)
|
149 |
+
|
150 |
+
|
151 |
+
# Initialize crawler strategy
|
152 |
+
params = {
|
153 |
+
k: v for k, v in kwargs.items() if k in ['browser_config', 'logger']
|
154 |
+
}
|
155 |
+
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
|
156 |
+
browser_config=browser_config,
|
157 |
+
logger=self.logger,
|
158 |
+
**params # Pass remaining kwargs for backwards compatibility
|
159 |
+
)
|
160 |
+
|
161 |
+
# If the crawler strategy doesn't have a logger, use the crawler's logger
|
162 |
+
if not self.crawler_strategy.logger:
|
163 |
+
self.crawler_strategy.logger = self.logger
|
164 |
+
|
165 |
+
# Handle deprecated cache parameter
|
166 |
+
if always_by_pass_cache is not None:
|
167 |
+
if kwargs.get("warning", True):
|
168 |
+
warnings.warn(
|
169 |
+
"'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. "
|
170 |
+
"Use 'always_bypass_cache' instead. "
|
171 |
+
"Pass warning=False to suppress this warning.",
|
172 |
+
DeprecationWarning,
|
173 |
+
stacklevel=2
|
174 |
+
)
|
175 |
+
self.always_bypass_cache = always_by_pass_cache
|
176 |
+
else:
|
177 |
+
self.always_bypass_cache = always_bypass_cache
|
178 |
+
|
179 |
+
# Thread safety setup
|
180 |
+
self._lock = asyncio.Lock() if thread_safe else None
|
181 |
+
|
182 |
+
# Initialize directories
|
183 |
+
self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
|
184 |
+
os.makedirs(self.crawl4ai_folder, exist_ok=True)
|
185 |
+
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
|
186 |
+
|
187 |
+
self.ready = False
|
188 |
+
|
189 |
+
async def start(self):
|
190 |
+
"""
|
191 |
+
Start the crawler explicitly without using context manager.
|
192 |
+
This is equivalent to using 'async with' but gives more control over the lifecycle.
|
193 |
+
|
194 |
+
This method will:
|
195 |
+
1. Initialize the browser and context
|
196 |
+
2. Perform warmup sequence
|
197 |
+
3. Return the crawler instance for method chaining
|
198 |
+
|
199 |
+
Returns:
|
200 |
+
AsyncWebCrawler: The initialized crawler instance
|
201 |
+
"""
|
202 |
+
await self.crawler_strategy.__aenter__()
|
203 |
+
await self.awarmup()
|
204 |
+
return self
|
205 |
+
|
206 |
+
async def close(self):
|
207 |
+
"""
|
208 |
+
Close the crawler explicitly without using context manager.
|
209 |
+
This should be called when you're done with the crawler if you used start().
|
210 |
+
|
211 |
+
This method will:
|
212 |
+
1. Clean up browser resources
|
213 |
+
2. Close any open pages and contexts
|
214 |
+
"""
|
215 |
+
await self.crawler_strategy.__aexit__(None, None, None)
|
216 |
+
|
217 |
+
async def __aenter__(self):
|
218 |
+
return await self.start()
|
219 |
+
|
220 |
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
221 |
+
await self.close()
|
222 |
+
|
223 |
+
async def awarmup(self):
|
224 |
+
"""
|
225 |
+
Initialize the crawler with warm-up sequence.
|
226 |
+
|
227 |
+
This method:
|
228 |
+
1. Logs initialization info
|
229 |
+
2. Sets up browser configuration
|
230 |
+
3. Marks the crawler as ready
|
231 |
+
"""
|
232 |
+
self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
|
233 |
+
self.ready = True
|
234 |
+
|
235 |
+
@asynccontextmanager
|
236 |
+
async def nullcontext(self):
|
237 |
+
"""异步空上下文管理器"""
|
238 |
+
yield
|
239 |
+
|
240 |
+
async def arun(
|
241 |
+
self,
|
242 |
+
url: str,
|
243 |
+
config: Optional[CrawlerRunConfig] = None,
|
244 |
+
# Legacy parameters maintained for backwards compatibility
|
245 |
+
word_count_threshold=MIN_WORD_THRESHOLD,
|
246 |
+
extraction_strategy: ExtractionStrategy = None,
|
247 |
+
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
248 |
+
content_filter: RelevantContentFilter = None,
|
249 |
+
cache_mode: Optional[CacheMode] = None,
|
250 |
+
# Deprecated cache parameters
|
251 |
+
bypass_cache: bool = False,
|
252 |
+
disable_cache: bool = False,
|
253 |
+
no_cache_read: bool = False,
|
254 |
+
no_cache_write: bool = False,
|
255 |
+
# Other legacy parameters
|
256 |
+
css_selector: str = None,
|
257 |
+
screenshot: bool = False,
|
258 |
+
pdf: bool = False,
|
259 |
+
user_agent: str = None,
|
260 |
+
verbose=True,
|
261 |
+
**kwargs,
|
262 |
+
) -> CrawlResult:
|
263 |
+
"""
|
264 |
+
Runs the crawler for a single source: URL (web, local file, or raw HTML).
|
265 |
+
|
266 |
+
Migration Guide:
|
267 |
+
Old way (deprecated):
|
268 |
+
result = await crawler.arun(
|
269 |
+
url="https://example.com",
|
270 |
+
word_count_threshold=200,
|
271 |
+
screenshot=True,
|
272 |
+
...
|
273 |
+
)
|
274 |
+
|
275 |
+
New way (recommended):
|
276 |
+
config = CrawlerRunConfig(
|
277 |
+
word_count_threshold=200,
|
278 |
+
screenshot=True,
|
279 |
+
...
|
280 |
+
)
|
281 |
+
result = await crawler.arun(url="https://example.com", crawler_config=config)
|
282 |
+
|
283 |
+
Args:
|
284 |
+
url: The URL to crawl (http://, https://, file://, or raw:)
|
285 |
+
config: Configuration object controlling crawl behavior
|
286 |
+
[other parameters maintained for backwards compatibility]
|
287 |
+
|
288 |
+
Returns:
|
289 |
+
CrawlResult: The result of crawling and processing
|
290 |
+
"""
|
291 |
+
crawler_config = config
|
292 |
+
if not isinstance(url, str) or not url:
|
293 |
+
raise ValueError("Invalid URL, make sure the URL is a non-empty string")
|
294 |
+
|
295 |
+
async with self._lock or self.nullcontext():
|
296 |
+
try:
|
297 |
+
# Handle configuration
|
298 |
+
if crawler_config is not None:
|
299 |
+
# if any(param is not None for param in [
|
300 |
+
# word_count_threshold, extraction_strategy, chunking_strategy,
|
301 |
+
# content_filter, cache_mode, css_selector, screenshot, pdf
|
302 |
+
# ]):
|
303 |
+
# self.logger.warning(
|
304 |
+
# message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
|
305 |
+
# tag="WARNING"
|
306 |
+
# )
|
307 |
+
config = crawler_config
|
308 |
+
else:
|
309 |
+
# Merge all parameters into a single kwargs dict for config creation
|
310 |
+
config_kwargs = {
|
311 |
+
"word_count_threshold": word_count_threshold,
|
312 |
+
"extraction_strategy": extraction_strategy,
|
313 |
+
"chunking_strategy": chunking_strategy,
|
314 |
+
"content_filter": content_filter,
|
315 |
+
"cache_mode": cache_mode,
|
316 |
+
"bypass_cache": bypass_cache,
|
317 |
+
"disable_cache": disable_cache,
|
318 |
+
"no_cache_read": no_cache_read,
|
319 |
+
"no_cache_write": no_cache_write,
|
320 |
+
"css_selector": css_selector,
|
321 |
+
"screenshot": screenshot,
|
322 |
+
"pdf": pdf,
|
323 |
+
"verbose": verbose,
|
324 |
+
**kwargs
|
325 |
+
}
|
326 |
+
config = CrawlerRunConfig.from_kwargs(config_kwargs)
|
327 |
+
|
328 |
+
# Handle deprecated cache parameters
|
329 |
+
if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
|
330 |
+
if kwargs.get("warning", True):
|
331 |
+
warnings.warn(
|
332 |
+
"Cache control boolean flags are deprecated and will be removed in version 0.5.0. "
|
333 |
+
"Use 'cache_mode' parameter instead.",
|
334 |
+
DeprecationWarning,
|
335 |
+
stacklevel=2
|
336 |
+
)
|
337 |
+
|
338 |
+
# Convert legacy parameters if cache_mode not provided
|
339 |
+
if config.cache_mode is None:
|
340 |
+
config.cache_mode = _legacy_to_cache_mode(
|
341 |
+
disable_cache=disable_cache,
|
342 |
+
bypass_cache=bypass_cache,
|
343 |
+
no_cache_read=no_cache_read,
|
344 |
+
no_cache_write=no_cache_write
|
345 |
+
)
|
346 |
+
|
347 |
+
# Default to ENABLED if no cache mode specified
|
348 |
+
if config.cache_mode is None:
|
349 |
+
config.cache_mode = CacheMode.ENABLED
|
350 |
+
|
351 |
+
# Create cache context
|
352 |
+
cache_context = CacheContext(url, config.cache_mode, self.always_bypass_cache)
|
353 |
+
|
354 |
+
# Initialize processing variables
|
355 |
+
async_response: AsyncCrawlResponse = None
|
356 |
+
cached_result: CrawlResult = None
|
357 |
+
screenshot_data = None
|
358 |
+
pdf_data = None
|
359 |
+
extracted_content = None
|
360 |
+
start_time = time.perf_counter()
|
361 |
+
|
362 |
+
# Try to get cached result if appropriate
|
363 |
+
if cache_context.should_read():
|
364 |
+
cached_result = await async_db_manager.aget_cached_url(url)
|
365 |
+
|
366 |
+
if cached_result:
|
367 |
+
html = sanitize_input_encode(cached_result.html)
|
368 |
+
extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
|
369 |
+
extracted_content = None if not extracted_content or extracted_content == "[]" else extracted_content
|
370 |
+
# If a screenshot or PDF is requested but missing from the cache, invalidate the cached result
|
371 |
+
screenshot_data = cached_result.screenshot
|
372 |
+
pdf_data = cached_result.pdf
|
373 |
+
if (config.screenshot and not screenshot_data) or (config.pdf and not pdf_data):
|
374 |
+
cached_result = None
|
375 |
+
|
376 |
+
self.logger.url_status(
|
377 |
+
url=cache_context.display_url,
|
378 |
+
success=bool(html),
|
379 |
+
timing=time.perf_counter() - start_time,
|
380 |
+
tag="FETCH"
|
381 |
+
)
|
382 |
+
|
383 |
+
# Fetch fresh content if needed
|
384 |
+
if not cached_result or not html:
|
385 |
+
t1 = time.perf_counter()
|
386 |
+
|
387 |
+
if user_agent:
|
388 |
+
self.crawler_strategy.update_user_agent(user_agent)
|
389 |
+
|
390 |
+
# Pass config to crawl method
|
391 |
+
async_response = await self.crawler_strategy.crawl(
|
392 |
+
url,
|
393 |
+
config=config # Pass the entire config object
|
394 |
+
)
|
395 |
+
|
396 |
+
html = sanitize_input_encode(async_response.html)
|
397 |
+
screenshot_data = async_response.screenshot
|
398 |
+
pdf_data = async_response.pdf_data
|
399 |
+
|
400 |
+
t2 = time.perf_counter()
|
401 |
+
self.logger.url_status(
|
402 |
+
url=cache_context.display_url,
|
403 |
+
success=bool(html),
|
404 |
+
timing=t2 - t1,
|
405 |
+
tag="FETCH"
|
406 |
+
)
|
407 |
+
|
408 |
+
# Process the HTML content
|
409 |
+
crawl_result = await self.aprocess_html(
|
410 |
+
url=url,
|
411 |
+
html=html,
|
412 |
+
extracted_content=extracted_content,
|
413 |
+
config=config, # Pass the config object instead of individual parameters
|
414 |
+
screenshot=screenshot_data,
|
415 |
+
pdf_data=pdf_data,
|
416 |
+
verbose=config.verbose,
|
417 |
+
is_raw_html = True if url.startswith("raw:") else False,
|
418 |
+
**kwargs
|
419 |
+
)
|
420 |
+
|
421 |
+
crawl_result.status_code = async_response.status_code
|
422 |
+
crawl_result.response_headers = async_response.response_headers
|
423 |
+
crawl_result.downloaded_files = async_response.downloaded_files
|
424 |
+
crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate
|
425 |
+
|
426 |
+
# # Check and set values from async_response to crawl_result
|
427 |
+
# try:
|
428 |
+
# for key in vars(async_response):
|
429 |
+
# if hasattr(crawl_result, key):
|
430 |
+
# value = getattr(async_response, key, None)
|
431 |
+
# current_value = getattr(crawl_result, key, None)
|
432 |
+
# if value is not None and not current_value:
|
433 |
+
# try:
|
434 |
+
# setattr(crawl_result, key, value)
|
435 |
+
# except Exception as e:
|
436 |
+
# self.logger.warning(
|
437 |
+
# message=f"Failed to set attribute {key}: {str(e)}",
|
438 |
+
# tag="WARNING"
|
439 |
+
# )
|
440 |
+
# except Exception as e:
|
441 |
+
# self.logger.warning(
|
442 |
+
# message=f"Error copying response attributes: {str(e)}",
|
443 |
+
# tag="WARNING"
|
444 |
+
# )
|
445 |
+
|
446 |
+
crawl_result.success = bool(html)
|
447 |
+
crawl_result.session_id = getattr(config, 'session_id', None)
|
448 |
+
|
449 |
+
self.logger.success(
|
450 |
+
message="{url:.50}... | Status: {status} | Total: {timing}",
|
451 |
+
tag="COMPLETE",
|
452 |
+
params={
|
453 |
+
"url": cache_context.display_url,
|
454 |
+
"status": crawl_result.success,
|
455 |
+
"timing": f"{time.perf_counter() - start_time:.2f}s"
|
456 |
+
},
|
457 |
+
colors={
|
458 |
+
"status": Fore.GREEN if crawl_result.success else Fore.RED,
|
459 |
+
"timing": Fore.YELLOW
|
460 |
+
}
|
461 |
+
)
|
462 |
+
|
463 |
+
# Update cache if appropriate
|
464 |
+
if cache_context.should_write() and not bool(cached_result):
|
465 |
+
await async_db_manager.acache_url(crawl_result)
|
466 |
+
|
467 |
+
return crawl_result
|
468 |
+
|
469 |
+
else:
|
470 |
+
self.logger.success(
|
471 |
+
message="{url:.50}... | Status: {status} | Total: {timing}",
|
472 |
+
tag="COMPLETE",
|
473 |
+
params={
|
474 |
+
"url": cache_context.display_url,
|
475 |
+
"status": True,
|
476 |
+
"timing": f"{time.perf_counter() - start_time:.2f}s"
|
477 |
+
},
|
478 |
+
colors={
|
479 |
+
"status": Fore.GREEN,
|
480 |
+
"timing": Fore.YELLOW
|
481 |
+
}
|
482 |
+
)
|
483 |
+
|
484 |
+
cached_result.success = bool(html)
|
485 |
+
cached_result.session_id = getattr(config, 'session_id', None)
|
486 |
+
return cached_result
|
487 |
+
|
488 |
+
except Exception as e:
|
489 |
+
error_context = get_error_context(sys.exc_info())
|
490 |
+
|
491 |
+
error_message = (
|
492 |
+
f"Unexpected error in _crawl_web at line {error_context['line_no']} "
|
493 |
+
f"in {error_context['function']} ({error_context['filename']}):\n"
|
494 |
+
f"Error: {str(e)}\n\n"
|
495 |
+
f"Code context:\n{error_context['code_context']}"
|
496 |
+
)
|
497 |
+
# if not hasattr(e, "msg"):
|
498 |
+
# e.msg = str(e)
|
499 |
+
|
500 |
+
self.logger.error_status(
|
501 |
+
url=url,
|
502 |
+
error=create_box_message(error_message, type="error"),
|
503 |
+
tag="ERROR"
|
504 |
+
)
|
505 |
+
|
506 |
+
return CrawlResult(
|
507 |
+
url=url,
|
508 |
+
html="",
|
509 |
+
success=False,
|
510 |
+
error_message=error_message
|
511 |
+
)
|
512 |
+
|
513 |
+
async def aprocess_html(
|
514 |
+
self,
|
515 |
+
url: str,
|
516 |
+
html: str,
|
517 |
+
extracted_content: str,
|
518 |
+
config: CrawlerRunConfig,
|
519 |
+
screenshot: str,
|
520 |
+
pdf_data: str,
|
521 |
+
verbose: bool,
|
522 |
+
**kwargs,
|
523 |
+
) -> CrawlResult:
|
524 |
+
"""
|
525 |
+
Process HTML content using the provided configuration.
|
526 |
+
|
527 |
+
Args:
|
528 |
+
url: The URL being processed
|
529 |
+
html: Raw HTML content
|
530 |
+
extracted_content: Previously extracted content (if any)
|
531 |
+
config: Configuration object controlling processing behavior
|
532 |
+
screenshot: Screenshot data (if any)
|
533 |
+
pdf_data: PDF data (if any)
|
534 |
+
verbose: Whether to enable verbose logging
|
535 |
+
**kwargs: Additional parameters for backwards compatibility
|
536 |
+
|
537 |
+
Returns:
|
538 |
+
CrawlResult: Processed result containing extracted and formatted content
|
539 |
+
"""
|
540 |
+
try:
|
541 |
+
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
|
542 |
+
t1 = time.perf_counter()
|
543 |
+
|
544 |
+
# Initialize scraping strategy
|
545 |
+
scraping_strategy = WebScrapingStrategy(logger=self.logger)
|
546 |
+
|
547 |
+
# Process HTML content
|
548 |
+
params = {k:v for k, v in config.to_dict().items() if k not in ["url"]}
|
549 |
+
# add keys from kwargs to params that doesn't exist in params
|
550 |
+
params.update({k:v for k, v in kwargs.items() if k not in params.keys()})
|
551 |
+
|
552 |
+
result = scraping_strategy.scrap(
|
553 |
+
url,
|
554 |
+
html,
|
555 |
+
**params,
|
556 |
+
# word_count_threshold=config.word_count_threshold,
|
557 |
+
# css_selector=config.css_selector,
|
558 |
+
# only_text=config.only_text,
|
559 |
+
# image_description_min_word_threshold=config.image_description_min_word_threshold,
|
560 |
+
# content_filter=config.content_filter,
|
561 |
+
# **kwargs
|
562 |
+
)
|
563 |
+
|
564 |
+
if result is None:
|
565 |
+
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")
|
566 |
+
|
567 |
+
except InvalidCSSSelectorError as e:
|
568 |
+
raise ValueError(str(e))
|
569 |
+
except Exception as e:
|
570 |
+
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
|
571 |
+
|
572 |
+
|
573 |
+
|
574 |
+
# Extract results
|
575 |
+
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
576 |
+
fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
|
577 |
+
fit_html = sanitize_input_encode(result.get("fit_html", ""))
|
578 |
+
media = result.get("media", [])
|
579 |
+
links = result.get("links", [])
|
580 |
+
metadata = result.get("metadata", {})
|
581 |
+
|
582 |
+
# Markdown Generation
|
583 |
+
markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
|
584 |
+
|
585 |
+
# Uncomment if by default we want to use PruningContentFilter
|
586 |
+
# if not config.content_filter and not markdown_generator.content_filter:
|
587 |
+
# markdown_generator.content_filter = PruningContentFilter()
|
588 |
+
|
589 |
+
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
|
590 |
+
cleaned_html=cleaned_html,
|
591 |
+
base_url=url,
|
592 |
+
# html2text_options=kwargs.get('html2text', {})
|
593 |
+
)
|
594 |
+
markdown_v2 = markdown_result
|
595 |
+
markdown = sanitize_input_encode(markdown_result.raw_markdown)
|
596 |
+
|
597 |
+
# Log processing completion
|
598 |
+
self.logger.info(
|
599 |
+
message="Processed {url:.50}... | Time: {timing}ms",
|
600 |
+
tag="SCRAPE",
|
601 |
+
params={
|
602 |
+
"url": _url,
|
603 |
+
"timing": int((time.perf_counter() - t1) * 1000)
|
604 |
+
}
|
605 |
+
)
|
606 |
+
|
607 |
+
# Handle content extraction if needed
|
608 |
+
if (extracted_content is None and
|
609 |
+
config.extraction_strategy and
|
610 |
+
config.chunking_strategy and
|
611 |
+
not isinstance(config.extraction_strategy, NoExtractionStrategy)):
|
612 |
+
|
613 |
+
t1 = time.perf_counter()
|
614 |
+
|
615 |
+
# Choose content based on input_format
|
616 |
+
content_format = config.extraction_strategy.input_format
|
617 |
+
if content_format == "fit_markdown" and not markdown_result.fit_markdown:
|
618 |
+
self.logger.warning(
|
619 |
+
message="Fit markdown requested but not available. Falling back to raw markdown.",
|
620 |
+
tag="EXTRACT",
|
621 |
+
params={"url": _url}
|
622 |
+
)
|
623 |
+
content_format = "markdown"
|
624 |
+
|
625 |
+
content = {
|
626 |
+
"markdown": markdown,
|
627 |
+
"html": html,
|
628 |
+
"fit_markdown": markdown_result.raw_markdown
|
629 |
+
}.get(content_format, markdown)
|
630 |
+
|
631 |
+
# Use IdentityChunking for HTML input, otherwise use provided chunking strategy
|
632 |
+
chunking = IdentityChunking() if content_format == "html" else config.chunking_strategy
|
633 |
+
sections = chunking.chunk(content)
|
634 |
+
extracted_content = config.extraction_strategy.run(url, sections)
|
635 |
+
extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
|
636 |
+
|
637 |
+
# Log extraction completion
|
638 |
+
self.logger.info(
|
639 |
+
message="Completed for {url:.50}... | Time: {timing}s",
|
640 |
+
tag="EXTRACT",
|
641 |
+
params={
|
642 |
+
"url": _url,
|
643 |
+
"timing": time.perf_counter() - t1
|
644 |
+
}
|
645 |
+
)
|
646 |
+
|
647 |
+
# Handle screenshot and PDF data
|
648 |
+
screenshot_data = None if not screenshot else screenshot
|
649 |
+
pdf_data = None if not pdf_data else pdf_data
|
650 |
+
|
651 |
+
# Apply HTML formatting if requested
|
652 |
+
if config.prettiify:
|
653 |
+
cleaned_html = fast_format_html(cleaned_html)
|
654 |
+
|
655 |
+
# Return complete crawl result
|
656 |
+
return CrawlResult(
|
657 |
+
url=url,
|
658 |
+
html=html,
|
659 |
+
cleaned_html=cleaned_html,
|
660 |
+
markdown_v2=markdown_v2,
|
661 |
+
markdown=markdown,
|
662 |
+
fit_markdown=fit_markdown,
|
663 |
+
fit_html=fit_html,
|
664 |
+
media=media,
|
665 |
+
links=links,
|
666 |
+
metadata=metadata,
|
667 |
+
screenshot=screenshot_data,
|
668 |
+
pdf=pdf_data,
|
669 |
+
extracted_content=extracted_content,
|
670 |
+
success=True,
|
671 |
+
error_message="",
|
672 |
+
)
|
673 |
+
|
674 |
+
async def arun_many(
|
675 |
+
self,
|
676 |
+
urls: List[str],
|
677 |
+
config: Optional[CrawlerRunConfig] = None,
|
678 |
+
# Legacy parameters maintained for backwards compatibility
|
679 |
+
word_count_threshold=MIN_WORD_THRESHOLD,
|
680 |
+
extraction_strategy: ExtractionStrategy = None,
|
681 |
+
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
682 |
+
content_filter: RelevantContentFilter = None,
|
683 |
+
cache_mode: Optional[CacheMode] = None,
|
684 |
+
bypass_cache: bool = False,
|
685 |
+
css_selector: str = None,
|
686 |
+
screenshot: bool = False,
|
687 |
+
pdf: bool = False,
|
688 |
+
user_agent: str = None,
|
689 |
+
verbose=True,
|
690 |
+
**kwargs,
|
691 |
+
) -> List[CrawlResult]:
|
692 |
+
"""
|
693 |
+
Runs the crawler for multiple URLs concurrently.
|
694 |
+
|
695 |
+
Migration Guide:
|
696 |
+
Old way (deprecated):
|
697 |
+
results = await crawler.arun_many(
|
698 |
+
urls,
|
699 |
+
word_count_threshold=200,
|
700 |
+
screenshot=True,
|
701 |
+
...
|
702 |
+
)
|
703 |
+
|
704 |
+
New way (recommended):
|
705 |
+
config = CrawlerRunConfig(
|
706 |
+
word_count_threshold=200,
|
707 |
+
screenshot=True,
|
708 |
+
...
|
709 |
+
)
|
710 |
+
results = await crawler.arun_many(urls, crawler_config=config)
|
711 |
+
|
712 |
+
Args:
|
713 |
+
urls: List of URLs to crawl
|
714 |
+
crawler_config: Configuration object controlling crawl behavior for all URLs
|
715 |
+
[other parameters maintained for backwards compatibility]
|
716 |
+
|
717 |
+
Returns:
|
718 |
+
List[CrawlResult]: Results for each URL
|
719 |
+
"""
|
720 |
+
crawler_config = config
|
721 |
+
# Handle configuration
|
722 |
+
if crawler_config is not None:
|
723 |
+
if any(param is not None for param in [
|
724 |
+
word_count_threshold, extraction_strategy, chunking_strategy,
|
725 |
+
content_filter, cache_mode, css_selector, screenshot, pdf
|
726 |
+
]):
|
727 |
+
self.logger.warning(
|
728 |
+
message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
|
729 |
+
tag="WARNING"
|
730 |
+
)
|
731 |
+
config = crawler_config
|
732 |
+
else:
|
733 |
+
# Merge all parameters into a single kwargs dict for config creation
|
734 |
+
config_kwargs = {
|
735 |
+
"word_count_threshold": word_count_threshold,
|
736 |
+
"extraction_strategy": extraction_strategy,
|
737 |
+
"chunking_strategy": chunking_strategy,
|
738 |
+
"content_filter": content_filter,
|
739 |
+
"cache_mode": cache_mode,
|
740 |
+
"bypass_cache": bypass_cache,
|
741 |
+
"css_selector": css_selector,
|
742 |
+
"screenshot": screenshot,
|
743 |
+
"pdf": pdf,
|
744 |
+
"verbose": verbose,
|
745 |
+
**kwargs
|
746 |
+
}
|
747 |
+
config = CrawlerRunConfig.from_kwargs(config_kwargs)
|
748 |
+
|
749 |
+
if bypass_cache:
|
750 |
+
if kwargs.get("warning", True):
|
751 |
+
warnings.warn(
|
752 |
+
"'bypass_cache' is deprecated and will be removed in version 0.5.0. "
|
753 |
+
"Use 'cache_mode=CacheMode.BYPASS' instead. "
|
754 |
+
"Pass warning=False to suppress this warning.",
|
755 |
+
DeprecationWarning,
|
756 |
+
stacklevel=2
|
757 |
+
)
|
758 |
+
if config.cache_mode is None:
|
759 |
+
config.cache_mode = CacheMode.BYPASS
|
760 |
+
|
761 |
+
semaphore_count = config.semaphore_count or 5
|
762 |
+
semaphore = asyncio.Semaphore(semaphore_count)
|
763 |
+
|
764 |
+
async def crawl_with_semaphore(url):
|
765 |
+
# Handle rate limiting per domain
|
766 |
+
domain = urlparse(url).netloc
|
767 |
+
current_time = time.time()
|
768 |
+
|
769 |
+
self.logger.debug(
|
770 |
+
message="Started task for {url:.50}...",
|
771 |
+
tag="PARALLEL",
|
772 |
+
params={"url": url}
|
773 |
+
)
|
774 |
+
|
775 |
+
# Get delay settings from config
|
776 |
+
mean_delay = config.mean_delay
|
777 |
+
max_range = config.max_range
|
778 |
+
|
779 |
+
# Apply rate limiting
|
780 |
+
if domain in self._domain_last_hit:
|
781 |
+
time_since_last = current_time - self._domain_last_hit[domain]
|
782 |
+
if time_since_last < mean_delay:
|
783 |
+
delay = mean_delay + random.uniform(0, max_range)
|
784 |
+
await asyncio.sleep(delay)
|
785 |
+
|
786 |
+
self._domain_last_hit[domain] = current_time
|
787 |
+
|
788 |
+
async with semaphore:
|
789 |
+
return await self.arun(
|
790 |
+
url,
|
791 |
+
crawler_config=config, # Pass the entire config object
|
792 |
+
user_agent=user_agent # Maintain user_agent override capability
|
793 |
+
)
|
794 |
+
|
795 |
+
# Log start of concurrent crawling
|
796 |
+
self.logger.info(
|
797 |
+
message="Starting concurrent crawling for {count} URLs...",
|
798 |
+
tag="INIT",
|
799 |
+
params={"count": len(urls)}
|
800 |
+
)
|
801 |
+
|
802 |
+
# Execute concurrent crawls
|
803 |
+
start_time = time.perf_counter()
|
804 |
+
tasks = [crawl_with_semaphore(url) for url in urls]
|
805 |
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
806 |
+
end_time = time.perf_counter()
|
807 |
+
|
808 |
+
# Log completion
|
809 |
+
self.logger.success(
|
810 |
+
message="Concurrent crawling completed for {count} URLs | Total time: {timing}",
|
811 |
+
tag="COMPLETE",
|
812 |
+
params={
|
813 |
+
"count": len(urls),
|
814 |
+
"timing": f"{end_time - start_time:.2f}s"
|
815 |
+
},
|
816 |
+
colors={
|
817 |
+
"timing": Fore.YELLOW
|
818 |
+
}
|
819 |
+
)
|
820 |
+
|
821 |
+
return [result if not isinstance(result, Exception) else str(result) for result in results]
|
822 |
+
|
823 |
+
async def aclear_cache(self):
|
824 |
+
"""Clear the cache database."""
|
825 |
+
await async_db_manager.cleanup()
|
826 |
+
|
827 |
+
async def aflush_cache(self):
|
828 |
+
"""Flush the cache database."""
|
829 |
+
await async_db_manager.aflush_db()
|
830 |
+
|
831 |
+
async def aget_cache_size(self):
|
832 |
+
"""Get the total number of cached items."""
|
833 |
+
return await async_db_manager.aget_total_count()
|
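Usage note (not part of this commit): a minimal sketch of the `arun_many` migration path described in the docstring above. It assumes the package-level exports `AsyncWebCrawler`, `CrawlerRunConfig`, and `CacheMode` from `crawl4ai/__init__.py`, and that `AsyncWebCrawler` works as an async context manager; both are assumptions about code outside this excerpt.

# Minimal sketch (assumption-laden, not from the diff): several URLs, one shared config.
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

async def main():
    config = CrawlerRunConfig(
        word_count_threshold=200,
        screenshot=True,
        cache_mode=CacheMode.BYPASS,  # replaces the deprecated bypass_cache=True
    )
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            ["https://example.com", "https://example.org"],
            config=config,
        )
    for result in results:
        # Failed tasks come back as strings (see the return_exceptions handling above)
        print(result if isinstance(result, str) else result.url)

asyncio.run(main())
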
crawl4ai/cache_context.py
ADDED
@@ -0,0 +1,115 @@
from enum import Enum


class CacheMode(Enum):
    """
    Defines the caching behavior for web crawling operations.

    Modes:
    - ENABLED: Normal caching behavior (read and write)
    - DISABLED: No caching at all
    - READ_ONLY: Only read from cache, don't write
    - WRITE_ONLY: Only write to cache, don't read
    - BYPASS: Bypass cache for this operation
    """
    ENABLED = "enabled"
    DISABLED = "disabled"
    READ_ONLY = "read_only"
    WRITE_ONLY = "write_only"
    BYPASS = "bypass"


class CacheContext:
    """
    Encapsulates cache-related decisions and URL handling.

    This class centralizes all cache-related logic and URL type checking,
    making the caching behavior more predictable and maintainable.

    Attributes:
        url (str): The URL being processed.
        cache_mode (CacheMode): The cache mode for the current operation.
        always_bypass (bool): If True, bypasses caching for this operation.
        is_cacheable (bool): True if the URL is cacheable, False otherwise.
        is_web_url (bool): True if the URL is a web URL, False otherwise.
        is_local_file (bool): True if the URL is a local file, False otherwise.
        is_raw_html (bool): True if the URL is raw HTML, False otherwise.
        _url_display (str): The display name for the URL (web, local file, or raw HTML).
    """
    def __init__(self, url: str, cache_mode: CacheMode, always_bypass: bool = False):
        """
        Initializes the CacheContext with the provided URL and cache mode.

        Args:
            url (str): The URL being processed.
            cache_mode (CacheMode): The cache mode for the current operation.
            always_bypass (bool): If True, bypasses caching for this operation.
        """
        self.url = url
        self.cache_mode = cache_mode
        self.always_bypass = always_bypass
        self.is_cacheable = url.startswith(('http://', 'https://', 'file://'))
        self.is_web_url = url.startswith(('http://', 'https://'))
        self.is_local_file = url.startswith("file://")
        self.is_raw_html = url.startswith("raw:")
        self._url_display = url if not self.is_raw_html else "Raw HTML"

    def should_read(self) -> bool:
        """
        Determines if cache should be read based on context.

        How it works:
        1. If always_bypass is True or is_cacheable is False, return False.
        2. If cache_mode is ENABLED or READ_ONLY, return True.

        Returns:
            bool: True if cache should be read, False otherwise.
        """
        if self.always_bypass or not self.is_cacheable:
            return False
        return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY]

    def should_write(self) -> bool:
        """
        Determines if cache should be written based on context.

        How it works:
        1. If always_bypass is True or is_cacheable is False, return False.
        2. If cache_mode is ENABLED or WRITE_ONLY, return True.

        Returns:
            bool: True if cache should be written, False otherwise.
        """
        if self.always_bypass or not self.is_cacheable:
            return False
        return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY]

    @property
    def display_url(self) -> str:
        """Returns the URL in display format."""
        return self._url_display


def _legacy_to_cache_mode(
    disable_cache: bool = False,
    bypass_cache: bool = False,
    no_cache_read: bool = False,
    no_cache_write: bool = False
) -> CacheMode:
    """
    Converts legacy cache parameters to the new CacheMode enum.

    This is an internal function to help transition from the old boolean flags
    to the new CacheMode system.
    """
    if disable_cache:
        return CacheMode.DISABLED
    if bypass_cache:
        return CacheMode.BYPASS
    if no_cache_read and no_cache_write:
        return CacheMode.DISABLED
    if no_cache_read:
        return CacheMode.WRITE_ONLY
    if no_cache_write:
        return CacheMode.READ_ONLY
    return CacheMode.ENABLED

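Usage note (not part of this commit): a minimal sketch of how the `CacheContext` above resolves read/write decisions, assuming the module is importable as `crawl4ai.cache_context` exactly as added in this file.

# Minimal sketch: read/write decisions for a few URL and mode combinations.
from crawl4ai.cache_context import CacheContext, CacheMode, _legacy_to_cache_mode

ctx = CacheContext("https://example.com", CacheMode.READ_ONLY)
print(ctx.should_read())    # True  (READ_ONLY allows reads)
print(ctx.should_write())   # False (READ_ONLY never writes)

raw_ctx = CacheContext("raw:<html><body>hi</body></html>", CacheMode.ENABLED)
print(raw_ctx.is_cacheable, raw_ctx.display_url)   # False 'Raw HTML' -> never cached

# Legacy boolean flags map onto the enum:
print(_legacy_to_cache_mode(no_cache_read=True))   # CacheMode.WRITE_ONLY
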
crawl4ai/chunking_strategy.py
ADDED
@@ -0,0 +1,231 @@
from abc import ABC, abstractmethod
import re
from collections import Counter
import string
from .model_loader import load_nltk_punkt
from .utils import *

# Define the abstract base class for chunking strategies
class ChunkingStrategy(ABC):
    """
    Abstract base class for chunking strategies.
    """

    @abstractmethod
    def chunk(self, text: str) -> list:
        """
        Abstract method to chunk the given text.

        Args:
            text (str): The text to chunk.

        Returns:
            list: A list of chunks.
        """
        pass

# Create an identity chunking strategy f(x) = [x]
class IdentityChunking(ChunkingStrategy):
    """
    Chunking strategy that returns the input text as a single chunk.
    """
    def chunk(self, text: str) -> list:
        return [text]

# Regex-based chunking
class RegexChunking(ChunkingStrategy):
    """
    Chunking strategy that splits text based on regular expression patterns.
    """
    def __init__(self, patterns=None, **kwargs):
        """
        Initialize the RegexChunking object.

        Args:
            patterns (list): A list of regular expression patterns to split text.
        """
        if patterns is None:
            patterns = [r'\n\n']  # Default split pattern
        self.patterns = patterns

    def chunk(self, text: str) -> list:
        paragraphs = [text]
        for pattern in self.patterns:
            new_paragraphs = []
            for paragraph in paragraphs:
                new_paragraphs.extend(re.split(pattern, paragraph))
            paragraphs = new_paragraphs
        return paragraphs

# NLP-based sentence chunking
class NlpSentenceChunking(ChunkingStrategy):
    """
    Chunking strategy that splits text into sentences using NLTK's sentence tokenizer.
    """
    def __init__(self, **kwargs):
        """
        Initialize the NlpSentenceChunking object.
        """
        load_nltk_punkt()

    def chunk(self, text: str) -> list:
        # Improved regex for sentence splitting
        # sentence_endings = re.compile(
        #     r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z][A-Z]\.)(?<![A-Za-z]\.)(?<=\.|\?|\!|\n)\s'
        # )
        # sentences = sentence_endings.split(text)
        # sens = [sent.strip() for sent in sentences if sent]
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text)
        sens = [sent.strip() for sent in sentences]

        return list(set(sens))

# Topic-based segmentation using TextTiling
class TopicSegmentationChunking(ChunkingStrategy):
    """
    Chunking strategy that segments text into topics using NLTK's TextTilingTokenizer.

    How it works:
    1. Segment the text into topics using TextTilingTokenizer
    2. Extract keywords for each topic segment
    """

    def __init__(self, num_keywords=3, **kwargs):
        """
        Initialize the TopicSegmentationChunking object.

        Args:
            num_keywords (int): The number of keywords to extract for each topic segment.
        """
        import nltk as nl
        self.tokenizer = nl.tokenize.TextTilingTokenizer()
        self.num_keywords = num_keywords

    def chunk(self, text: str) -> list:
        # Use the TextTilingTokenizer to segment the text
        segmented_topics = self.tokenizer.tokenize(text)
        return segmented_topics

    def extract_keywords(self, text: str) -> list:
        # Tokenize and remove stopwords and punctuation
        import nltk as nl
        tokens = nl.tokenize.word_tokenize(text)
        tokens = [token.lower() for token in tokens if token not in nl.corpus.stopwords.words('english') and token not in string.punctuation]

        # Calculate frequency distribution
        freq_dist = Counter(tokens)
        keywords = [word for word, freq in freq_dist.most_common(self.num_keywords)]
        return keywords

    def chunk_with_topics(self, text: str) -> list:
        # Segment the text into topics
        segments = self.chunk(text)
        # Extract keywords for each topic segment
        segments_with_topics = [(segment, self.extract_keywords(segment)) for segment in segments]
        return segments_with_topics

# Fixed-length word chunks
class FixedLengthWordChunking(ChunkingStrategy):
    """
    Chunking strategy that splits text into fixed-length word chunks.

    How it works:
    1. Split the text into words
    2. Create chunks of fixed length
    3. Return the list of chunks
    """
    def __init__(self, chunk_size=100, **kwargs):
        """
        Initialize the fixed-length word chunking strategy with the given chunk size.

        Args:
            chunk_size (int): The size of each chunk in words.
        """
        self.chunk_size = chunk_size

    def chunk(self, text: str) -> list:
        words = text.split()
        return [' '.join(words[i:i + self.chunk_size]) for i in range(0, len(words), self.chunk_size)]

# Sliding window chunking
class SlidingWindowChunking(ChunkingStrategy):
    """
    Chunking strategy that splits text into overlapping word chunks.

    How it works:
    1. Split the text into words
    2. Create chunks of fixed length, sliding by the step size
    3. Return the list of chunks
    """
    def __init__(self, window_size=100, step=50, **kwargs):
        """
        Initialize the sliding window chunking strategy with the given window size and
        step size.

        Args:
            window_size (int): The size of the sliding window in words.
            step (int): The step size for sliding the window in words.
        """
        self.window_size = window_size
        self.step = step

    def chunk(self, text: str) -> list:
        words = text.split()
        chunks = []

        if len(words) <= self.window_size:
            return [text]

        for i in range(0, len(words) - self.window_size + 1, self.step):
            chunk = ' '.join(words[i:i + self.window_size])
            chunks.append(chunk)

        # Handle the last chunk if it doesn't align perfectly
        if i + self.window_size < len(words):
            chunks.append(' '.join(words[-self.window_size:]))

        return chunks

class OverlappingWindowChunking(ChunkingStrategy):
    """
    Chunking strategy that splits text into overlapping word chunks.

    How it works:
    1. Split the text into words using whitespace
    2. Create chunks of fixed length equal to the window size
    3. Slide the window by the window size minus the overlap
    4. Return the list of chunks
    """
    def __init__(self, window_size=1000, overlap=100, **kwargs):
        """
        Initialize the overlapping window chunking strategy with the given window size and
        overlap size.

        Args:
            window_size (int): The size of the window in words.
            overlap (int): The size of the overlap between consecutive chunks in words.
        """
        self.window_size = window_size
        self.overlap = overlap

    def chunk(self, text: str) -> list:
        words = text.split()
        chunks = []

        if len(words) <= self.window_size:
            return [text]

        start = 0
        while start < len(words):
            end = start + self.window_size
            chunk = ' '.join(words[start:end])
            chunks.append(chunk)

            if end >= len(words):
                break

            start = end - self.overlap

        return chunks

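Usage note (not part of this commit): a minimal sketch comparing two of the chunkers above on the same text, assuming the module is importable as `crawl4ai.chunking_strategy` exactly as added here.

# Minimal sketch: paragraph splitting vs. overlapping word windows.
from crawl4ai.chunking_strategy import RegexChunking, OverlappingWindowChunking

text = "para one\n\npara two\n\n" + " ".join(f"w{i}" for i in range(2500))

print(len(RegexChunking().chunk(text)))  # splits on blank lines -> 3 chunks

chunks = OverlappingWindowChunking(window_size=1000, overlap=100).chunk(text)
print([len(c.split()) for c in chunks])  # ~1000-word chunks, consecutive chunks share 100 words
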
crawl4ai/cli.py
ADDED
@@ -0,0 +1,105 @@
import click
import sys
import asyncio
from typing import List
from .docs_manager import DocsManager
from .async_logger import AsyncLogger

logger = AsyncLogger(verbose=True)
docs_manager = DocsManager(logger)

def print_table(headers: List[str], rows: List[List[str]], padding: int = 2):
    """Print formatted table with headers and rows"""
    widths = [max(len(str(cell)) for cell in col) for col in zip(headers, *rows)]
    border = '+' + '+'.join('-' * (w + 2 * padding) for w in widths) + '+'

    def format_row(row):
        return '|' + '|'.join(f"{' ' * padding}{str(cell):<{w}}{' ' * padding}"
                              for cell, w in zip(row, widths)) + '|'

    click.echo(border)
    click.echo(format_row(headers))
    click.echo(border)
    for row in rows:
        click.echo(format_row(row))
    click.echo(border)

@click.group()
def cli():
    """Crawl4AI Command Line Interface"""
    pass

@cli.group()
def docs():
    """Documentation operations"""
    pass

@docs.command()
@click.argument('sections', nargs=-1)
@click.option('--mode', type=click.Choice(['extended', 'condensed']), default='extended')
def combine(sections: tuple, mode: str):
    """Combine documentation sections"""
    try:
        asyncio.run(docs_manager.ensure_docs_exist())
        click.echo(docs_manager.generate(sections, mode))
    except Exception as e:
        logger.error(str(e), tag="ERROR")
        sys.exit(1)

@docs.command()
@click.argument('query')
@click.option('--top-k', '-k', default=5)
@click.option('--build-index', is_flag=True, help='Build index if missing')
def search(query: str, top_k: int, build_index: bool):
    """Search documentation"""
    try:
        result = docs_manager.search(query, top_k)
        if result == "No search index available. Call build_search_index() first.":
            if build_index or click.confirm('No search index found. Build it now?'):
                asyncio.run(docs_manager.llm_text.generate_index_files())
                result = docs_manager.search(query, top_k)
        click.echo(result)
    except Exception as e:
        click.echo(f"Error: {str(e)}", err=True)
        sys.exit(1)

@docs.command()
def update():
    """Update docs from GitHub"""
    try:
        asyncio.run(docs_manager.fetch_docs())
        click.echo("Documentation updated successfully")
    except Exception as e:
        click.echo(f"Error: {str(e)}", err=True)
        sys.exit(1)

@docs.command()
@click.option('--force-facts', is_flag=True, help='Force regenerate fact files')
@click.option('--clear-cache', is_flag=True, help='Clear BM25 cache')
def index(force_facts: bool, clear_cache: bool):
    """Build or rebuild search indexes"""
    try:
        asyncio.run(docs_manager.ensure_docs_exist())
        asyncio.run(docs_manager.llm_text.generate_index_files(
            force_generate_facts=force_facts,
            clear_bm25_cache=clear_cache
        ))
        click.echo("Search indexes built successfully")
    except Exception as e:
        click.echo(f"Error: {str(e)}", err=True)
        sys.exit(1)

# Add docs list command
@docs.command()
def list():
    """List available documentation sections"""
    try:
        sections = docs_manager.list()
        print_table(["Sections"], [[section] for section in sections])

    except Exception as e:
        click.echo(f"Error: {str(e)}", err=True)
        sys.exit(1)

if __name__ == '__main__':
    cli()

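Usage note (not part of this commit): a minimal sketch of driving the `docs` command group in-process with click's test runner instead of a shell, assuming the module is importable as `crawl4ai.cli` as added here.

# Minimal sketch: invoke the CLI without a subprocess.
from click.testing import CliRunner
from crawl4ai.cli import cli

runner = CliRunner()
result = runner.invoke(cli, ["docs", "list"])
print(result.exit_code)
print(result.output)  # the bordered table printed by print_table()
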
crawl4ai/config.py
ADDED
@@ -0,0 +1,64 @@
import os
from dotenv import load_dotenv

load_dotenv()  # Load environment variables from .env file

# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
DEFAULT_PROVIDER = "openai/gpt-4o-mini"
MODEL_REPO_BRANCH = "new-release-0.0.2"
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
PROVIDER_MODELS = {
    "ollama/llama3": "no-token-needed",  # Ollama models do not need an API token
    "groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"),
    "groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"),
    "openai/gpt-4o-mini": os.getenv("OPENAI_API_KEY"),
    "openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
    "openai/o1-mini": os.getenv("OPENAI_API_KEY"),
    "openai/o1-preview": os.getenv("OPENAI_API_KEY"),
    "anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
    "anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
    "anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
    "anthropic/claude-3-5-sonnet-20240620": os.getenv("ANTHROPIC_API_KEY"),
}

# Chunk token threshold
CHUNK_TOKEN_THRESHOLD = 2 ** 11  # 2048 tokens
OVERLAP_RATE = 0.1
WORD_TOKEN_RATE = 1.3

# Threshold for the minimum number of words in an HTML tag for it to be considered
MIN_WORD_THRESHOLD = 1
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1

IMPORTANT_ATTRS = ['src', 'href', 'alt', 'title', 'width', 'height']
ONLY_TEXT_ELIGIBLE_TAGS = ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']
SOCIAL_MEDIA_DOMAINS = [
    'facebook.com',
    'twitter.com',
    'x.com',
    'linkedin.com',
    'instagram.com',
    'pinterest.com',
    'tiktok.com',
    'snapchat.com',
    'reddit.com',
]

# Threshold for image extraction - range is 1 to 6.
# Images are scored on a point-based system to filter by usefulness. A point is
# awarded for each of the following aspects:
# - either height or width exceeds 150px
# - image size is greater than 10KB
# - the alt property is set
# - image format is jpg, png, or webp
# - the image is in the first half of the images extracted from the page
IMAGE_SCORE_THRESHOLD = 2

MAX_METRICS_HISTORY = 1000

NEED_MIGRATION = True
URL_LOG_SHORTEN_LENGTH = 30
SHOW_DEPRECATION_WARNINGS = True
SCREENSHOT_HEIGHT_TRESHOLD = 10000
PAGE_TIMEOUT = 60000
DOWNLOAD_PAGE_TIMEOUT = 60000

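Usage note (not part of this commit): a minimal sketch of how the constants above might be consulted elsewhere in the package. The two helper functions are hypothetical, written only to illustrate the lookups; they are not part of the diff.

# Minimal sketch with hypothetical helpers around crawl4ai.config constants.
from typing import Optional
from crawl4ai.config import PROVIDER_MODELS, DEFAULT_PROVIDER, CHUNK_TOKEN_THRESHOLD, WORD_TOKEN_RATE

def resolve_api_token(provider: str = DEFAULT_PROVIDER) -> Optional[str]:
    # Returns the env-backed token for a known provider, None otherwise (hypothetical helper).
    return PROVIDER_MODELS.get(provider)

def estimate_tokens(text: str) -> int:
    # Rough word-to-token estimate to stay under CHUNK_TOKEN_THRESHOLD (hypothetical helper).
    return int(len(text.split()) * WORD_TOKEN_RATE)

print(resolve_api_token("openai/gpt-4o-mini"))  # value of OPENAI_API_KEY, or None
print(estimate_tokens("some text") <= CHUNK_TOKEN_THRESHOLD)
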
crawl4ai/content_filter_strategy.py
ADDED
@@ -0,0 +1,627 @@
import re
from bs4 import BeautifulSoup, Tag
from typing import List, Tuple, Dict
from rank_bm25 import BM25Okapi
from time import perf_counter
from collections import deque
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
from .utils import clean_tokens
from abc import ABC, abstractmethod
import math
from snowballstemmer import stemmer

class RelevantContentFilter(ABC):
    """Abstract base class for content filtering strategies"""
    def __init__(self, user_query: str = None):
        self.user_query = user_query
        self.included_tags = {
            # Primary structure
            'article', 'main', 'section', 'div',
            # List structures
            'ul', 'ol', 'li', 'dl', 'dt', 'dd',
            # Text content
            'p', 'span', 'blockquote', 'pre', 'code',
            # Headers
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            # Tables
            'table', 'thead', 'tbody', 'tr', 'td', 'th',
            # Other semantic elements
            'figure', 'figcaption', 'details', 'summary',
            # Text formatting
            'em', 'strong', 'b', 'i', 'mark', 'small',
            # Rich content
            'time', 'address', 'cite', 'q'
        }
        self.excluded_tags = {
            'nav', 'footer', 'header', 'aside', 'script',
            'style', 'form', 'iframe', 'noscript'
        }
        self.header_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
        self.negative_patterns = re.compile(
            r'nav|footer|header|sidebar|ads|comment|promo|advert|social|share',
            re.I
        )
        self.min_word_count = 2

    @abstractmethod
    def filter_content(self, html: str) -> List[str]:
        """Abstract method to be implemented by specific filtering strategies"""
        pass

    def extract_page_query(self, soup: BeautifulSoup, body: Tag) -> str:
        """Common method to extract page metadata with fallbacks"""
        if self.user_query:
            return self.user_query

        query_parts = []

        # Title
        try:
            title = soup.title.string
            if title:
                query_parts.append(title)
        except Exception:
            pass

        if soup.find('h1'):
            query_parts.append(soup.find('h1').get_text())

        # Meta tags
        temp = ""
        for meta_name in ['keywords', 'description']:
            meta = soup.find('meta', attrs={'name': meta_name})
            if meta and meta.get('content'):
                query_parts.append(meta['content'])
                temp += meta['content']

        # If still empty, grab the first significant paragraph
        if not temp:
            # Find the first <p> tag whose text contains more than 150 characters
            for p in body.find_all('p'):
                if len(p.get_text()) > 150:
                    query_parts.append(p.get_text()[:150])
                    break

        return ' '.join(filter(None, query_parts))

    def extract_text_chunks(self, body: Tag, min_word_threshold: int = None) -> List[Tuple[str, str]]:
        """
        Extracts text chunks from a BeautifulSoup body element while preserving order.
        Returns list of tuples (text, tag_name) for classification.

        Args:
            body: BeautifulSoup Tag object representing the body element

        Returns:
            List of (text, tag_name) tuples
        """
        # Tags to ignore - inline elements that shouldn't break text flow
        INLINE_TAGS = {
            'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'br', 'button', 'cite', 'code',
            'dfn', 'em', 'i', 'img', 'input', 'kbd', 'label', 'map', 'object', 'q',
            'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup',
            'textarea', 'time', 'tt', 'var'
        }

        # Tags that typically contain meaningful headers
        HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header'}

        chunks = []
        current_text = []
        chunk_index = 0

        def should_break_chunk(tag: Tag) -> bool:
            """Determine if a tag should cause a break in the current text chunk"""
            return (
                tag.name not in INLINE_TAGS
                and not (tag.name == 'p' and len(current_text) == 0)
            )

        # Use deque for efficient push/pop operations
        stack = deque([(body, False)])

        while stack:
            element, visited = stack.pop()

            if visited:
                # End of block element - flush accumulated text
                if current_text and should_break_chunk(element):
                    text = ' '.join(''.join(current_text).split())
                    if text:
                        tag_type = 'header' if element.name in HEADER_TAGS else 'content'
                        chunks.append((chunk_index, text, tag_type, element))
                        chunk_index += 1
                    current_text = []
                continue

            if isinstance(element, NavigableString):
                if str(element).strip():
                    current_text.append(str(element).strip())
                continue

            # Pre-allocate children to avoid multiple list operations
            children = list(element.children)
            if not children:
                continue

            # Mark block for revisit after processing children
            stack.append((element, True))

            # Add children in reverse order for correct processing
            for child in reversed(children):
                if isinstance(child, (Tag, NavigableString)):
                    stack.append((child, False))

        # Handle any remaining text
        if current_text:
            text = ' '.join(''.join(current_text).split())
            if text:
                chunks.append((chunk_index, text, 'content', body))

        if min_word_threshold:
            chunks = [chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold]

        return chunks

    def _deprecated_extract_text_chunks(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]:
        """Common method for extracting text chunks"""
        _text_cache = {}
        def fast_text(element: Tag) -> str:
            elem_id = id(element)
            if elem_id in _text_cache:
                return _text_cache[elem_id]
            texts = []
            for content in element.contents:
                if isinstance(content, str):
                    text = content.strip()
                    if text:
                        texts.append(text)
            result = ' '.join(texts)
            _text_cache[elem_id] = result
            return result

        candidates = []
        index = 0

        def dfs(element):
            nonlocal index
            if isinstance(element, Tag):
                if element.name in self.included_tags:
                    if not self.is_excluded(element):
                        text = fast_text(element)
                        word_count = len(text.split())

                        # Headers pass through with adjusted minimum
                        if element.name in self.header_tags:
                            if word_count >= 3:  # Minimal sanity check for headers
                                candidates.append((index, text, element))
                                index += 1
                        # Regular content uses standard minimum
                        elif word_count >= self.min_word_count:
                            candidates.append((index, text, element))
                            index += 1

                for child in element.children:
                    dfs(child)

        dfs(soup.body if soup.body else soup)
        return candidates

    def is_excluded(self, tag: Tag) -> bool:
        """Common method for exclusion logic"""
        if tag.name in self.excluded_tags:
            return True
        class_id = ' '.join(filter(None, [
            ' '.join(tag.get('class', [])),
            tag.get('id', '')
        ]))
        return bool(self.negative_patterns.search(class_id))

    def clean_element(self, tag: Tag) -> str:
        """Common method for cleaning HTML elements with minimal overhead"""
        if not tag or not isinstance(tag, Tag):
            return ""

        unwanted_tags = {'script', 'style', 'aside', 'form', 'iframe', 'noscript'}
        unwanted_attrs = {'style', 'onclick', 'onmouseover', 'align', 'bgcolor', 'class', 'id'}

        # Use string builder pattern for better performance
        builder = []

        def render_tag(elem):
            if not isinstance(elem, Tag):
                if isinstance(elem, str):
                    builder.append(elem.strip())
                return

            if elem.name in unwanted_tags:
                return

            # Start tag
            builder.append(f'<{elem.name}')

            # Add cleaned attributes
            attrs = {k: v for k, v in elem.attrs.items() if k not in unwanted_attrs}
            for key, value in attrs.items():
                builder.append(f' {key}="{value}"')

            builder.append('>')

            # Process children
            for child in elem.children:
                render_tag(child)

            # Close tag
            builder.append(f'</{elem.name}>')

        try:
            render_tag(tag)
            return ''.join(builder)
        except Exception:
            return str(tag)  # Fallback to original if anything fails

class BM25ContentFilter(RelevantContentFilter):
    """
    Content filtering using BM25 algorithm with priority tag handling.

    How it works:
    1. Extracts page metadata with fallbacks.
    2. Extracts text chunks from the body element.
    3. Tokenizes the corpus and query.
    4. Applies BM25 algorithm to calculate scores for each chunk.
    5. Filters out chunks below the threshold.
    6. Sorts chunks by score in descending order.
    7. Returns the top N chunks.

    Attributes:
        user_query (str): User query for filtering (optional).
        bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
        language (str): Language for stemming (default: 'english').

    Methods:
        filter_content(self, html: str, min_word_threshold: int = None)
    """
    def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'):
        """
        Initializes the BM25ContentFilter class; if no user query is provided, it falls back to page metadata.

        Note:
            If no query is given and no page metadata is available, it tries to pick up the first significant paragraph.

        Args:
            user_query (str): User query for filtering (optional).
            bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
            language (str): Language for stemming (default: 'english').
        """
        super().__init__(user_query=user_query)
        self.bm25_threshold = bm25_threshold
        self.priority_tags = {
            'h1': 5.0,
            'h2': 4.0,
            'h3': 3.0,
            'title': 4.0,
            'strong': 2.0,
            'b': 1.5,
            'em': 1.5,
            'blockquote': 2.0,
            'code': 2.0,
            'pre': 1.5,
            'th': 1.5,  # Table headers
        }
        self.stemmer = stemmer(language)

    def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
        """
        Implements content filtering using BM25 algorithm with priority tag handling.

        Note:
            This method implements the filtering logic for the BM25ContentFilter class.
            It takes HTML content as input and returns a list of filtered text chunks.

        Args:
            html (str): HTML content to be filtered.
            min_word_threshold (int): Minimum word threshold for filtering (optional).

        Returns:
            List[str]: List of filtered text chunks.
        """
        if not html or not isinstance(html, str):
            return []

        soup = BeautifulSoup(html, 'lxml')

        # Check if body is present
        if not soup.body:
            # Wrap in body tag if missing
            soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')
        body = soup.find('body')

        query = self.extract_page_query(soup, body)

        if not query:
            return []
            # return [self.clean_element(soup)]

        candidates = self.extract_text_chunks(body, min_word_threshold)

        if not candidates:
            return []

        # Tokenize corpus
        # tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in candidates]
        # tokenized_query = query.lower().split()

        # tokenized_corpus = [[ps.stem(word) for word in chunk.lower().split()]
        #                     for _, chunk, _, _ in candidates]
        # tokenized_query = [ps.stem(word) for word in query.lower().split()]

        tokenized_corpus = [[self.stemmer.stemWord(word) for word in chunk.lower().split()]
                            for _, chunk, _, _ in candidates]
        tokenized_query = [self.stemmer.stemWord(word) for word in query.lower().split()]

        # tokenized_corpus = [[self.stemmer.stemWord(word) for word in tokenize_text(chunk.lower())]
        #                     for _, chunk, _, _ in candidates]
        # tokenized_query = [self.stemmer.stemWord(word) for word in tokenize_text(query.lower())]

        # Clean from stop words and noise
        tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus]
        tokenized_query = clean_tokens(tokenized_query)

        bm25 = BM25Okapi(tokenized_corpus)
        scores = bm25.get_scores(tokenized_query)

        # Adjust scores with tag weights
        adjusted_candidates = []
        for score, (index, chunk, tag_type, tag) in zip(scores, candidates):
            tag_weight = self.priority_tags.get(tag.name, 1.0)
            adjusted_score = score * tag_weight
            adjusted_candidates.append((adjusted_score, index, chunk, tag))

        # Filter candidates by threshold
        selected_candidates = [
            (index, chunk, tag) for adjusted_score, index, chunk, tag in adjusted_candidates
            if adjusted_score >= self.bm25_threshold
        ]

        if not selected_candidates:
            return []

        # Sort selected candidates by original document order
        selected_candidates.sort(key=lambda x: x[0])

        return [self.clean_element(tag) for _, _, tag in selected_candidates]

class PruningContentFilter(RelevantContentFilter):
    """
    Content filtering using pruning algorithm with dynamic threshold.

    How it works:
    1. Extracts page metadata with fallbacks.
    2. Extracts text chunks from the body element.
    3. Applies pruning algorithm to calculate scores for each chunk.
    4. Filters out chunks below the threshold.
    5. Sorts chunks by score in descending order.
    6. Returns the top N chunks.

    Attributes:
        user_query (str): User query for filtering (optional), if not provided, falls back to page metadata.
        min_word_threshold (int): Minimum word threshold for filtering (optional).
        threshold_type (str): Threshold type for dynamic threshold (default: 'fixed').
        threshold (float): Fixed threshold value (default: 0.48).

    Methods:
        filter_content(self, html: str, min_word_threshold: int = None):
    """
    def __init__(self, user_query: str = None, min_word_threshold: int = None,
                 threshold_type: str = 'fixed', threshold: float = 0.48):
        """
        Initializes the PruningContentFilter class; if no user query is provided, it falls back to page metadata.

        Note:
            If no query is given and no page metadata is available, it tries to pick up the first significant paragraph.

        Args:
            user_query (str): User query for filtering (optional).
            min_word_threshold (int): Minimum word threshold for filtering (optional).
            threshold_type (str): Threshold type for dynamic threshold (default: 'fixed').
            threshold (float): Fixed threshold value (default: 0.48).
        """
        super().__init__(None)
        self.min_word_threshold = min_word_threshold
        self.threshold_type = threshold_type
        self.threshold = threshold

        # Add tag importance for dynamic threshold
        self.tag_importance = {
            'article': 1.5,
            'main': 1.4,
            'section': 1.3,
            'p': 1.2,
            'h1': 1.4,
            'h2': 1.3,
            'h3': 1.2,
            'div': 0.7,
            'span': 0.6
        }

        # Metric configuration
        self.metric_config = {
            'text_density': True,
            'link_density': True,
            'tag_weight': True,
            'class_id_weight': True,
            'text_length': True,
        }

        self.metric_weights = {
            'text_density': 0.4,
            'link_density': 0.2,
            'tag_weight': 0.2,
            'class_id_weight': 0.1,
            'text_length': 0.1,
        }

        self.tag_weights = {
            'div': 0.5,
            'p': 1.0,
            'article': 1.5,
            'section': 1.0,
            'span': 0.3,
            'li': 0.5,
            'ul': 0.5,
            'ol': 0.5,
            'h1': 1.2,
            'h2': 1.1,
            'h3': 1.0,
            'h4': 0.9,
            'h5': 0.8,
            'h6': 0.7,
        }

    def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
        """
        Implements content filtering using pruning algorithm with dynamic threshold.

        Note:
            This method implements the filtering logic for the PruningContentFilter class.
            It takes HTML content as input and returns a list of filtered text chunks.

        Args:
            html (str): HTML content to be filtered.
            min_word_threshold (int): Minimum word threshold for filtering (optional).

        Returns:
            List[str]: List of filtered text chunks.
        """
        if not html or not isinstance(html, str):
            return []

        soup = BeautifulSoup(html, 'lxml')
        if not soup.body:
            soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')

        # Remove comments and unwanted tags
        self._remove_comments(soup)
        self._remove_unwanted_tags(soup)

        # Prune tree starting from body
        body = soup.find('body')
        self._prune_tree(body)

        # Extract remaining content as list of HTML strings
        content_blocks = []
        for element in body.children:
            if isinstance(element, str) or not hasattr(element, 'name'):
                continue
            if len(element.get_text(strip=True)) > 0:
                content_blocks.append(str(element))

        return content_blocks

    def _remove_comments(self, soup):
        """Removes HTML comments"""
        for element in soup(text=lambda text: isinstance(text, Comment)):
            element.extract()

    def _remove_unwanted_tags(self, soup):
        """Removes unwanted tags"""
        for tag in self.excluded_tags:
            for element in soup.find_all(tag):
                element.decompose()

    def _prune_tree(self, node):
        """
        Prunes the tree starting from the given node.

        Args:
            node (Tag): The node from which the pruning starts.
        """
        if not node or not hasattr(node, 'name') or node.name is None:
            return

        text_len = len(node.get_text(strip=True))
        tag_len = len(node.encode_contents().decode('utf-8'))
        link_text_len = sum(len(s.strip()) for s in (a.string for a in node.find_all('a', recursive=False)) if s)

        metrics = {
            'node': node,
            'tag_name': node.name,
            'text_len': text_len,
            'tag_len': tag_len,
            'link_text_len': link_text_len
        }

        score = self._compute_composite_score(metrics, text_len, tag_len, link_text_len)

        if self.threshold_type == 'fixed':
            should_remove = score < self.threshold
        else:  # dynamic
            tag_importance = self.tag_importance.get(node.name, 0.7)
            text_ratio = text_len / tag_len if tag_len > 0 else 0
            link_ratio = link_text_len / text_len if text_len > 0 else 1

            threshold = self.threshold  # base threshold
            if tag_importance > 1:
                threshold *= 0.8
            if text_ratio > 0.4:
                threshold *= 0.9
            if link_ratio > 0.6:
                threshold *= 1.2

            should_remove = score < threshold

        if should_remove:
            node.decompose()
        else:
            children = [child for child in node.children if hasattr(child, 'name')]
            for child in children:
                self._prune_tree(child)

    def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len):
        """Computes the composite score"""
        if self.min_word_threshold:
            # Get raw text from metrics node - avoid extra processing
            text = metrics['node'].get_text(strip=True)
            word_count = text.count(' ') + 1
            if word_count < self.min_word_threshold:
                return -1.0  # Guaranteed removal
        score = 0.0
        total_weight = 0.0

        if self.metric_config['text_density']:
            density = text_len / tag_len if tag_len > 0 else 0
            score += self.metric_weights['text_density'] * density
            total_weight += self.metric_weights['text_density']

        if self.metric_config['link_density']:
            density = 1 - (link_text_len / text_len if text_len > 0 else 0)
            score += self.metric_weights['link_density'] * density
            total_weight += self.metric_weights['link_density']

        if self.metric_config['tag_weight']:
            tag_score = self.tag_weights.get(metrics['tag_name'], 0.5)
            score += self.metric_weights['tag_weight'] * tag_score
            total_weight += self.metric_weights['tag_weight']

        if self.metric_config['class_id_weight']:
            class_score = self._compute_class_id_weight(metrics['node'])
            score += self.metric_weights['class_id_weight'] * max(0, class_score)
            total_weight += self.metric_weights['class_id_weight']

        if self.metric_config['text_length']:
            score += self.metric_weights['text_length'] * math.log(text_len + 1)
            total_weight += self.metric_weights['text_length']

        return score / total_weight if total_weight > 0 else 0

    def _compute_class_id_weight(self, node):
        """Computes the class ID weight"""
        class_id_score = 0
        if 'class' in node.attrs:
            classes = ' '.join(node['class'])
            if self.negative_patterns.match(classes):
                class_id_score -= 0.5
        if 'id' in node.attrs:
            element_id = node['id']
            if self.negative_patterns.match(element_id):
                class_id_score -= 0.5
        return class_id_score

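Usage note (not part of this commit): a minimal sketch running both filters above over the same small page, assuming the module is importable as `crawl4ai.content_filter_strategy` and that `rank_bm25`, `snowballstemmer`, and `lxml` are installed.

# Minimal sketch: BM25 filtering with an explicit query vs. query-free pruning.
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter

html = """
<html><head><title>Coffee brewing guide</title></head>
<body>
  <nav>Home | About | Contact</nav>
  <article>
    <h1>How to brew pour-over coffee</h1>
    <p>Use a medium grind and a 1:16 coffee-to-water ratio for a balanced cup.</p>
  </article>
  <footer>Example footer text</footer>
</body></html>
"""

bm25_blocks = BM25ContentFilter(user_query="coffee brewing ratio").filter_content(html)
pruned_blocks = PruningContentFilter(threshold=0.48, threshold_type="dynamic").filter_content(html)

print(len(bm25_blocks), len(pruned_blocks))  # nav/footer should be excluded by both filters
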
crawl4ai/content_scraping_strategy.py
ADDED
@@ -0,0 +1,723 @@
import re  # Point 1: Pre-compile regular expressions
import time
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import asyncio, requests, re, os
from .config import *
from bs4 import element, NavigableString, Comment
from bs4 import PageElement, Tag
from urllib.parse import urljoin
from requests.exceptions import InvalidSchema
# from .content_cleaning_strategy import ContentCleaningStrategy
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter  # , HeuristicContentFilter
from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
from .models import MarkdownGenerationResult
from .utils import (
    extract_metadata,
    normalize_url,
    is_external_url,
    get_base_domain,
)


# Pre-compile regular expressions for Open Graph and Twitter metadata
OG_REGEX = re.compile(r'^og:')
TWITTER_REGEX = re.compile(r'^twitter:')
DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")

# Function to parse image height/width value and units
def parse_dimension(dimension):
    if dimension:
        # match = re.match(r"(\d+)(\D*)", dimension)
        match = DIMENSION_REGEX.match(dimension)
        if match:
            number = int(match.group(1))
            unit = match.group(2) or 'px'  # Default unit is 'px' if not specified
            return number, unit
    return None, None

# Fetch image file metadata to extract size and extension
def fetch_image_file_size(img, base_url):
    # If src is a relative path construct the full URL, if not it may be a CDN URL
    img_url = urljoin(base_url, img.get('src'))
    try:
        response = requests.head(img_url)
        if response.status_code == 200:
            return response.headers.get('Content-Length', None)
        else:
            print(f"Failed to retrieve file size for {img_url}")
            return None
    except InvalidSchema:
        return None

class ContentScrapingStrategy(ABC):
    @abstractmethod
    def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
        pass

    @abstractmethod
    async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
        pass

class WebScrapingStrategy(ContentScrapingStrategy):
    """
    Class for web content scraping. Perhaps the most important class.

    How it works:
    1. Extract content from HTML using BeautifulSoup.
    2. Clean the extracted content using a content cleaning strategy.
    3. Filter the cleaned content using a content filtering strategy.
    4. Generate markdown content from the filtered content.
    5. Return the markdown content.
    """

    def __init__(self, logger=None):
        self.logger = logger

    def _log(self, level, message, tag="SCRAPE", **kwargs):
        """Helper method to safely use logger."""
        if self.logger:
            log_method = getattr(self.logger, level)
            log_method(message=message, tag=tag, **kwargs)

    def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
        """
        Main entry point for content scraping.

        Args:
            url (str): The URL of the page to scrape.
            html (str): The HTML content of the page.
            **kwargs: Additional keyword arguments.

        Returns:
            Dict[str, Any]: A dictionary containing the scraped content, with the following keys:

            - 'markdown': The generated markdown content; currently a str, but will soon become a MarkdownGenerationResult via 'markdown.raw_markdown'.
            - 'fit_markdown': The generated markdown with relevant content filtered; will be removed soon and made available as 'markdown.fit_markdown'.
            - 'fit_html': The HTML with relevant content filtered; will be removed soon and made available as 'markdown.fit_html'.
            - 'markdown_v2': The generated markdown with relevant content filtered; temporary and will be replaced by 'markdown'.
        """
        return self._scrap(url, html, is_async=False, **kwargs)

    async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
        """
        Main entry point for asynchronous content scraping.

        Args:
            url (str): The URL of the page to scrape.
            html (str): The HTML content of the page.
            **kwargs: Additional keyword arguments.

        Returns:
            Dict[str, Any]: A dictionary containing the scraped content, with the following keys:

            - 'markdown': The generated markdown content; currently a str, but will soon become a MarkdownGenerationResult via 'markdown.raw_markdown'.
            - 'fit_markdown': The generated markdown with relevant content filtered; will be removed soon and made available as 'markdown.fit_markdown'.
            - 'fit_html': The HTML with relevant content filtered; will be removed soon and made available as 'markdown.fit_html'.
            - 'markdown_v2': The generated markdown with relevant content filtered; temporary and will be replaced by 'markdown'.
        """
        return await asyncio.to_thread(self._scrap, url, html, **kwargs)

    def flatten_nested_elements(self, node):
        """
        Flatten nested elements in an HTML tree.

        Args:
            node (Tag): The root node of the HTML tree.

        Returns:
            Tag: The flattened HTML tree.
        """
        if isinstance(node, NavigableString):
            return node
        if len(node.contents) == 1 and isinstance(node.contents[0], Tag) and node.contents[0].name == node.name:
            return self.flatten_nested_elements(node.contents[0])
        node.contents = [self.flatten_nested_elements(child) for child in node.contents]
        return node

    def find_closest_parent_with_useful_text(self, tag, **kwargs):
        """
        Find the closest parent with useful text.

        Args:
            tag (Tag): The starting tag to search from.
            **kwargs: Additional keyword arguments.

        Returns:
            Tag: The closest parent with useful text, or None if not found.
        """
        image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
        current_tag = tag
        while current_tag:
            current_tag = current_tag.parent
            # Get the text content of the parent tag
            if current_tag:
                text_content = current_tag.get_text(separator=' ', strip=True)
                # Check if the text content has at least word_count_threshold
                if len(text_content.split()) >= image_description_min_word_threshold:
                    return text_content
        return None

    def remove_unwanted_attributes(self, element, important_attrs, keep_data_attributes=False):
        """
        Remove unwanted attributes from an HTML element.

        Args:
            element (Tag): The HTML element to remove attributes from.
            important_attrs (list): List of important attributes to keep.
            keep_data_attributes (bool): Whether to keep data attributes.

        Returns:
            None
        """
        attrs_to_remove = []
        for attr in element.attrs:
            if attr not in important_attrs:
                if keep_data_attributes:
                    if not attr.startswith('data-'):
                        attrs_to_remove.append(attr)
                else:
                    attrs_to_remove.append(attr)

        for attr in attrs_to_remove:
            del element[attr]

    def process_image(self, img, url, index, total_images, **kwargs):
        """
        Process an image element.

        How it works:
        1. Check whether the image has a valid display and is not inside undesired HTML elements.
        2. Score the image for its usefulness.
        3. Extract image file metadata to determine size and extension.
        4. Generate a dictionary with the processed image information.
        5. Return the processed image information.

        Args:
            img (Tag): The image element to process.
            url (str): The URL of the page containing the image.
            index (int): The index of the image in the list of images.
            total_images (int): The total number of images in the list.
            **kwargs: Additional keyword arguments.

        Returns:
            dict: A dictionary containing the processed image information.
        """
        parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w')
                                   if ' ' in u else None}
                                  for u in [f"http{p}" for p in s.split("http") if p]]

        # Constants for checks
        classes_to_check = frozenset(['button', 'icon', 'logo'])
        tags_to_check = frozenset(['button', 'input'])
        image_formats = frozenset(['jpg', 'jpeg', 'png', 'webp', 'avif', 'gif'])

        # Pre-fetch commonly used attributes
        style = img.get('style', '')
        alt = img.get('alt', '')
        src = img.get('src', '')
        data_src = img.get('data-src', '')
        srcset = img.get('srcset', '')
        data_srcset = img.get('data-srcset', '')
        width = img.get('width')
        height = img.get('height')
        parent = img.parent
        parent_classes = parent.get('class', [])

        # Quick validation checks
        if ('display:none' in style or
                parent.name in tags_to_check or
                any(c in cls for c in parent_classes for cls in classes_to_check) or
                any(c in src for c in classes_to_check) or
                any(c in alt for c in classes_to_check)):
            return None

        # Quick score calculation
        score = 0
        if width and width.isdigit():
            width_val = int(width)
            score += 1 if width_val > 150 else 0
        if height and height.isdigit():
            height_val = int(height)
            score += 1 if height_val > 150 else 0
        if alt:
            score += 1
        score += index / total_images < 0.5

        # image_format = ''
        # if "data:image/" in src:
        #     image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0]
        # else:
        #     image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0]

        # if image_format in ('jpg', 'png', 'webp', 'avif'):
        #     score += 1

        # Check for image format in all possible sources
        def has_image_format(url):
            return any(fmt in url.lower() for fmt in image_formats)

        # Score for having proper image sources
        if any(has_image_format(url) for url in [src, data_src, srcset, data_srcset]):
            score += 1
        if srcset or data_srcset:
            score += 1
        if img.find_parent('picture'):
            score += 1

        # Detect format from any available source
        detected_format = None
        for url in [src, data_src, srcset, data_srcset]:
            if url:
                format_matches = [fmt for fmt in image_formats if fmt in url.lower()]
                if format_matches:
                    detected_format = format_matches[0]
                    break

        if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD):
            return None

        # Use set for deduplication
        unique_urls = set()
        image_variants = []

        # Generate a unique group ID for this set of variants
        group_id = index

        # Base image info template
        image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
        base_info = {
            'alt': alt,
            'desc': self.find_closest_parent_with_useful_text(img, **kwargs),
            'score': score,
            'type': 'image',
            'group_id': group_id,  # Group ID for this set of variants
            'format': detected_format,
        }

        # Inline function for adding variants
        def add_variant(src, width=None):
            if src and not src.startswith('data:') and src not in unique_urls:
                unique_urls.add(src)
                image_variants.append({**base_info, 'src': src, 'width': width})

        # Process all sources
        add_variant(src)
        add_variant(data_src)

        # Handle srcset and data-srcset in one pass
        for attr in ('srcset', 'data-srcset'):
            if value := img.get(attr):
                for source in parse_srcset(value):
                    add_variant(source['url'], source['width'])

        # Quick picture element check
        if picture := img.find_parent('picture'):
            for source in picture.find_all('source'):
                if srcset := source.get('srcset'):
                    for src in parse_srcset(srcset):
                        add_variant(src['url'], src['width'])

        # Framework-specific attributes in one pass
        for attr, value in img.attrs.items():
            if attr.startswith('data-') and ('src' in attr or 'srcset' in attr) and 'http' in value:
                add_variant(value)

        return image_variants if image_variants else None

    def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:
        """
        Process an HTML element.

        How it works:
        1. Check if the element is an image, video, or audio.
        2. Extract the element's attributes and content.
        3. Process the element based on its type.
        4. Return the processed element information.

        Args:
            url (str): The URL of the page containing the element.
            element (Tag): The HTML element to process.
            **kwargs: Additional keyword arguments.

        Returns:
            dict: A dictionary containing the processed element information.
        """
        media = {'images': [], 'videos': [], 'audios': []}
        internal_links_dict = {}
        external_links_dict = {}
        self._process_element(
            url,
            element,
            media,
            internal_links_dict,
            external_links_dict,
            **kwargs
        )
        return {
            'media': media,
            'internal_links_dict': internal_links_dict,
            'external_links_dict': external_links_dict
        }

    def _process_element(self, url, element: PageElement, media: Dict[str, Any], internal_links_dict: Dict[str, Any], external_links_dict: Dict[str, Any], **kwargs) -> bool:
        """
        Process an HTML element.
        """
        try:
            if isinstance(element, NavigableString):
                if isinstance(element, Comment):
                    element.extract()
                return False

            # if element.name == 'img':
            #     process_image(element, url, 0, 1)
            #     return True
            base_domain = kwargs.get("base_domain", get_base_domain(url))

            if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
                element.decompose()
                return False

            keep_element = False

            exclude_domains = kwargs.get('exclude_domains', [])
            # exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS))
            # exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', [])
            # exclude_social_media_domains = list(set(exclude_social_media_domains))

            try:
                if element.name == 'a' and element.get('href'):
                    href = element.get('href', '').strip()
                    if not href:  # Skip empty hrefs
                        return False

                    url_base = url.split('/')[2]

                    # Normalize the URL
                    try:
                        normalized_href = normalize_url(href, url)
                    except ValueError as e:
                        # logging.warning(f"Invalid URL format: {href}, Error: {str(e)}")
                        return False

                    link_data = {
                        'href': normalized_href,
                        'text': element.get_text().strip(),
                        'title': element.get('title', '').strip(),
                        'base_domain': base_domain
                    }

                    is_external = is_external_url(normalized_href, base_domain)

                    keep_element = True

                    # Handle external link exclusions
                    if is_external:
                        link_base_domain = get_base_domain(normalized_href)
                        link_data['base_domain'] = link_base_domain
                        if kwargs.get('exclude_external_links', False):
                            element.decompose()
                            return False
                        # elif kwargs.get('exclude_social_media_links', False):
                        #     if link_base_domain in exclude_social_media_domains:
                        #         element.decompose()
                        #         return False
                        #     if any(domain in normalized_href.lower() for domain in exclude_social_media_domains):
                        #         element.decompose()
                        #         return False
                        elif exclude_domains:
                            if link_base_domain in exclude_domains:
                                element.decompose()
                                return False
                            # if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])):
                            #     element.decompose()
                            #     return False

                    if is_external:
                        if normalized_href not in external_links_dict:
                            external_links_dict[normalized_href] = link_data
                    else:
                        if normalized_href not in internal_links_dict:
                            internal_links_dict[normalized_href] = link_data

            except Exception as e:
                raise Exception(f"Error processing links: {str(e)}")

            try:
                if element.name == 'img':
                    potential_sources = ['src', 'data-src', 'srcset', 'data-lazy-src', 'data-original']
                    src = element.get('src', '')
                    while not src and potential_sources:
                        src = element.get(potential_sources.pop(0), '')
                    if not src:
                        element.decompose()
                        return False

                    # If it is srcset pick up the first image
                    if 'srcset' in element.attrs:
                        src = element.attrs['srcset'].split(',')[0].split(' ')[0]

                    # If image src is internal, then skip
                    if not is_external_url(src, base_domain):
                        return True

                    image_src_base_domain = get_base_domain(src)

                    # Check flag if we should remove external images
                    if kwargs.get('exclude_external_images', False):
                        element.decompose()
                        return False
                    # src_url_base = src.split('/')[2]
                    # url_base = url.split('/')[2]
                    # if url_base not in src_url_base:
                    #     element.decompose()
                    #     return False

                    # if kwargs.get('exclude_social_media_links', False):
                    #     if image_src_base_domain in exclude_social_media_domains:
                    #         element.decompose()
                    #         return False
                    #     src_url_base = src.split('/')[2]
                    #     url_base = url.split('/')[2]
                    #     if any(domain in src for domain in exclude_social_media_domains):
                    #         element.decompose()
                    #         return False

                    # Handle exclude domains
                    if exclude_domains:
                        if image_src_base_domain in exclude_domains:
                            element.decompose()
                            return False
                        # if any(domain in src for domain in kwargs.get('exclude_domains', [])):
                        #     element.decompose()
                        #     return False

                    return True  # Always keep image elements
            except Exception as e:
                raise Exception(f"Error processing images: {str(e)}")

            # Check if flag to remove all forms is set
            if kwargs.get('remove_forms', False) and element.name == 'form':
                element.decompose()
                return False

            if element.name in ['video', 'audio']:
                media[f"{element.name}s"].append({
                    'src': element.get('src'),
                    'alt': element.get('alt'),
                    'type': element.name,
                    'description': self.find_closest_parent_with_useful_text(element, **kwargs)
                })
                source_tags = element.find_all('source')
                for source_tag in source_tags:
                    media[f"{element.name}s"].append({
                        'src': source_tag.get('src'),
                        'alt': element.get('alt'),
                        'type': element.name,
                        'description': self.find_closest_parent_with_useful_text(element, **kwargs)
                    })
                return True  # Always keep video and audio elements

            if element.name in ONLY_TEXT_ELIGIBLE_TAGS:
                if kwargs.get('only_text', False):
                    element.replace_with(element.get_text())

            try:
                self.remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False))
            except Exception as e:
                # print('Error removing unwanted attributes:', str(e))
                self._log('error',
                    message="Error removing unwanted attributes: {error}",
                    tag="SCRAPE",
                    params={"error": str(e)}
                )
            # Process children
            for child in list(element.children):
                if isinstance(child, NavigableString) and not isinstance(child, Comment):
                    if len(child.strip()) > 0:
                        keep_element = True
                else:
                    if self._process_element(url, child, media, internal_links_dict, external_links_dict, **kwargs):
                        keep_element = True

            # Check word count
            word_count_threshold = kwargs.get('word_count_threshold', MIN_WORD_THRESHOLD)
            if not keep_element:
                word_count = len(element.get_text(strip=True).split())
                keep_element = word_count >= word_count_threshold

            if not keep_element:
                element.decompose()

            return keep_element
        except Exception as e:
            # print('Error processing element:', str(e))
            self._log('error',
                message="Error processing element: {error}",
                tag="SCRAPE",
                params={"error": str(e)}
            )
            return False

    def _scrap(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
        """
        Extract content from HTML using BeautifulSoup.

        Args:
            url (str): The URL of the page to scrape.
            html (str): The HTML content of the page to scrape.
            word_count_threshold (int): The minimum word count threshold for content extraction.
            css_selector (str): The CSS selector to use for content extraction.
            **kwargs: Additional keyword arguments.

        Returns:
            dict: A dictionary containing the extracted content.
        """
        success = True
        if not html:
            return None

        parser_type = kwargs.get('parser', 'lxml')
        soup = BeautifulSoup(html, parser_type)
        body = soup.body
        base_domain = get_base_domain(url)

        try:
            meta = extract_metadata("", soup)
        except Exception as e:
            self._log('error',
                message="Error extracting metadata: {error}",
                tag="SCRAPE",
                params={"error": str(e)}
            )
            meta = {}

        # Handle tag-based removal first - faster than CSS selection
        excluded_tags = set(kwargs.get('excluded_tags', []) or [])
        if excluded_tags:
            for element in body.find_all(lambda tag: tag.name in excluded_tags):
                element.extract()

        # Handle CSS selector-based removal
        excluded_selector = kwargs.get('excluded_selector', '')
        if excluded_selector:
            is_single_selector = ',' not in excluded_selector and ' ' not in excluded_selector
            if is_single_selector:
                while element := body.select_one(excluded_selector):
                    element.extract()
            else:
                for element in body.select(excluded_selector):
                    element.extract()

        if css_selector:
            selected_elements = body.select(css_selector)
            if not selected_elements:
                return {
                    'markdown': '',
                    'cleaned_html': '',
                    'success': True,
                    'media': {'images': [], 'videos': [], 'audios': []},
                    'links': {'internal': [], 'external': []},
                    'metadata': {},
                    'message': f"No elements found for CSS selector: {css_selector}"
                }
                # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
            body = soup.new_tag('div')
            for el in selected_elements:
                body.append(el)

        kwargs['exclude_social_media_domains'] = set(kwargs.get('exclude_social_media_domains', []) + SOCIAL_MEDIA_DOMAINS)
        kwargs['exclude_domains'] = set(kwargs.get('exclude_domains', []))
        if kwargs.get('exclude_social_media_links', False):
            kwargs['exclude_domains'] = kwargs['exclude_domains'].union(kwargs['exclude_social_media_domains'])

        result_obj = self.process_element(
            url,
            body,
            word_count_threshold=word_count_threshold,
            base_domain=base_domain,
            **kwargs
        )

        links = {'internal': [], 'external': []}
        media = result_obj['media']
        internal_links_dict = result_obj['internal_links_dict']
        external_links_dict = result_obj['external_links_dict']

        # Update the links dictionary with unique links
        links['internal'] = list(internal_links_dict.values())
        links['external'] = list(external_links_dict.values())

        # # Process images using ThreadPoolExecutor
        imgs = body.find_all('img')

        media['images'] = [
            img for result in (self.process_image(img, url, i, len(imgs))
                               for i, img in enumerate(imgs))
            if result is not None
            for img in result
        ]

        body = self.flatten_nested_elements(body)
        base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
        for img in imgs:
            src = img.get('src', '')
            if base64_pattern.match(src):
                # Replace base64 data with empty string
                img['src'] = base64_pattern.sub('', src)

        str_body = ""
        try:
            str_body = body.encode_contents().decode('utf-8')
        except Exception as e:
            # Reset body to the original HTML
            success = False
            body = BeautifulSoup(html, 'html.parser')

            # Create a new div with a special ID
            error_div = body.new_tag('div', id='crawl4ai_error_message')
            error_div.string = '''
            Crawl4AI Error: This page is not fully supported.

            Possible reasons:
            1. The page may have restrictions that prevent crawling.
            2. The page might not be fully loaded.

            Suggestions:
            - Try calling the crawl function with these parameters:
            magic=True,
            - Set headless=False to visualize what's happening on the page.

            If the issue persists, please check the page's structure and any potential anti-crawling measures.
            '''

            # Append the error div to the body
            body.body.append(error_div)
            str_body = body.encode_contents().decode('utf-8')

            print("[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.")
            self._log('error',
                message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.",
                tag="SCRAPE"
            )

        cleaned_html = str_body.replace('\n\n', '\n').replace('  ', ' ')

        return {
            # **markdown_content,
            'cleaned_html': cleaned_html,
            'success': success,
            'media': media,
            'links': links,
            'metadata': meta
        }
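Example usage (not part of the file above): a minimal sketch of calling WebScrapingStrategy.scrap() directly, assuming the crawl4ai package and the default lxml parser are installed and that the signature shown in this file is unchanged; the URL and HTML are placeholders.

# Minimal sketch: feed raw HTML to the scraping strategy and inspect the result.
from crawl4ai.content_scraping_strategy import WebScrapingStrategy

html = "<html><body><article><h1>Hello</h1><p>A paragraph long enough to pass the word-count filter.</p></article></body></html>"  # placeholder
strategy = WebScrapingStrategy()
result = strategy.scrap("https://example.com/post", html, word_count_threshold=2)  # placeholder URL

print(result["cleaned_html"])        # cleaned HTML fragment
print(result["links"]["internal"])   # deduplicated internal links
print(result["media"]["images"])     # scored image variants (may be empty for this toy page)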
crawl4ai/crawler_strategy.py
ADDED
@@ -0,0 +1,360 @@
from abc import ABC, abstractmethod
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import InvalidArgumentException, WebDriverException
# from selenium.webdriver.chrome.service import Service as ChromeService
# from webdriver_manager.chrome import ChromeDriverManager
# from urllib3.exceptions import MaxRetryError

from .config import *
import logging, time
import base64
from PIL import Image, ImageDraw, ImageFont
from io import BytesIO
from typing import List, Callable
import requests
import os
from pathlib import Path
from .utils import *

logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
logger.setLevel(logging.WARNING)

logger_driver = logging.getLogger('selenium.webdriver.common.service')
logger_driver.setLevel(logging.WARNING)

urllib3_logger = logging.getLogger('urllib3.connectionpool')
urllib3_logger.setLevel(logging.WARNING)

# Disable http.client logging
http_client_logger = logging.getLogger('http.client')
http_client_logger.setLevel(logging.WARNING)

# Disable driver_finder and service logging
driver_finder_logger = logging.getLogger('selenium.webdriver.common.driver_finder')
driver_finder_logger.setLevel(logging.WARNING)


class CrawlerStrategy(ABC):
    @abstractmethod
    def crawl(self, url: str, **kwargs) -> str:
        pass

    @abstractmethod
    def take_screenshot(self, save_path: str):
        pass

    @abstractmethod
    def update_user_agent(self, user_agent: str):
        pass

    @abstractmethod
    def set_hook(self, hook_type: str, hook: Callable):
        pass

class CloudCrawlerStrategy(CrawlerStrategy):
    def __init__(self, use_cached_html=False):
        super().__init__()
        self.use_cached_html = use_cached_html

    def crawl(self, url: str) -> str:
        data = {
            "urls": [url],
            "include_raw_html": True,
            "forced": True,
            "extract_blocks": False,
        }

        response = requests.post("http://crawl4ai.uccode.io/crawl", json=data)
        response = response.json()
        html = response["results"][0]["html"]
        return sanitize_input_encode(html)

class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
    def __init__(self, use_cached_html=False, js_code=None, **kwargs):
        super().__init__()
        print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
        self.options = Options()
        self.options.headless = True
        if kwargs.get("proxy"):
            self.options.add_argument("--proxy-server={}".format(kwargs.get("proxy")))
        if kwargs.get("user_agent"):
            self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
        else:
            user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
            self.options.add_argument(f"--user-agent={user_agent}")
            self.options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

        self.options.headless = kwargs.get("headless", True)
        if self.options.headless:
            self.options.add_argument("--headless")

        self.options.add_argument("--disable-gpu")
        self.options.add_argument("--window-size=1920,1080")
        self.options.add_argument("--no-sandbox")
        self.options.add_argument("--disable-dev-shm-usage")
        self.options.add_argument("--disable-blink-features=AutomationControlled")

        # self.options.add_argument("--disable-dev-shm-usage")
        self.options.add_argument("--disable-gpu")
        # self.options.add_argument("--disable-extensions")
        # self.options.add_argument("--disable-infobars")
        # self.options.add_argument("--disable-logging")
        # self.options.add_argument("--disable-popup-blocking")
        # self.options.add_argument("--disable-translate")
        # self.options.add_argument("--disable-default-apps")
        # self.options.add_argument("--disable-background-networking")
        # self.options.add_argument("--disable-sync")
        # self.options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess")
        # self.options.add_argument("--disable-browser-side-navigation")
        # self.options.add_argument("--dns-prefetch-disable")
        # self.options.add_argument("--disable-web-security")
        self.options.add_argument("--log-level=3")
        self.use_cached_html = use_cached_html
        self.js_code = js_code
        self.verbose = kwargs.get("verbose", False)

        # Hooks
        self.hooks = {
            'on_driver_created': None,
            'on_user_agent_updated': None,
            'before_get_url': None,
            'after_get_url': None,
            'before_return_html': None
        }

        # chromedriver_autoinstaller.install()
        # import chromedriver_autoinstaller
        # crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
        # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)
        # chromedriver_path = chromedriver_autoinstaller.install()
        # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver()
        # self.service = Service(chromedriver_autoinstaller.install())

        # chromedriver_path = ChromeDriverManager().install()
        # self.service = Service(chromedriver_path)
        # self.service.log_path = "NUL"
        # self.driver = webdriver.Chrome(service=self.service, options=self.options)

        # Use selenium-manager (built into Selenium 4.10.0+)
        self.service = Service()
        self.driver = webdriver.Chrome(options=self.options)

        self.driver = self.execute_hook('on_driver_created', self.driver)

        if kwargs.get("cookies"):
            for cookie in kwargs.get("cookies"):
                self.driver.add_cookie(cookie)

    def set_hook(self, hook_type: str, hook: Callable):
        if hook_type in self.hooks:
            self.hooks[hook_type] = hook
        else:
            raise ValueError(f"Invalid hook type: {hook_type}")

    def execute_hook(self, hook_type: str, *args):
        hook = self.hooks.get(hook_type)
        if hook:
            result = hook(*args)
            if result is not None:
                if isinstance(result, webdriver.Chrome):
                    return result
                else:
                    raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.")
        # If the hook returns None or there is no hook, return self.driver
        return self.driver

    def update_user_agent(self, user_agent: str):
        self.options.add_argument(f"user-agent={user_agent}")
        self.driver.quit()
        self.driver = webdriver.Chrome(service=self.service, options=self.options)
        self.driver = self.execute_hook('on_user_agent_updated', self.driver)

    def set_custom_headers(self, headers: dict):
        # Enable Network domain for sending headers
        self.driver.execute_cdp_cmd('Network.enable', {})
        # Set extra HTTP headers
        self.driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers})

    def _ensure_page_load(self, max_checks=6, check_interval=0.01):
        initial_length = len(self.driver.page_source)

        for ix in range(max_checks):
            # print(f"Checking page load: {ix}")
            time.sleep(check_interval)
            current_length = len(self.driver.page_source)

            if current_length != initial_length:
                break

        return self.driver.page_source

    def crawl(self, url: str, **kwargs) -> str:
        # Create md5 hash of the URL
        import hashlib
        url_hash = hashlib.md5(url.encode()).hexdigest()

        if self.use_cached_html:
            cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
            if os.path.exists(cache_file_path):
                with open(cache_file_path, "r") as f:
                    return sanitize_input_encode(f.read())

        try:
            self.driver = self.execute_hook('before_get_url', self.driver)
            if self.verbose:
                print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
            self.driver.get(url)  # <html><head></head><body></body></html>

            WebDriverWait(self.driver, 20).until(
                lambda d: d.execute_script('return document.readyState') == 'complete'
            )
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_all_elements_located((By.TAG_NAME, "body"))
            )

            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            self.driver = self.execute_hook('after_get_url', self.driver)
            html = sanitize_input_encode(self._ensure_page_load())  # self.driver.page_source
            can_not_be_done_headless = False  # Look at my creativity for naming variables

            # TODO: Very ugly approach, but promise to change it!
            if kwargs.get('bypass_headless', False) or html == "<html><head></head><body></body></html>":
                print("[LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode...")
                can_not_be_done_headless = True
                options = Options()
                options.headless = False
                # set window size very small
                options.add_argument("--window-size=5,5")
                driver = webdriver.Chrome(service=self.service, options=options)
                driver.get(url)
                self.driver = self.execute_hook('after_get_url', driver)
                html = sanitize_input_encode(driver.page_source)
                driver.quit()

            # Execute JS code if provided
            self.js_code = kwargs.get("js_code", self.js_code)
            if self.js_code and type(self.js_code) == str:
                self.driver.execute_script(self.js_code)
                # Optionally, wait for some condition after executing the JS code
                WebDriverWait(self.driver, 10).until(
                    lambda driver: driver.execute_script("return document.readyState") == "complete"
                )
            elif self.js_code and type(self.js_code) == list:
                for js in self.js_code:
                    self.driver.execute_script(js)
                    WebDriverWait(self.driver, 10).until(
                        lambda driver: driver.execute_script("return document.readyState") == "complete"
                    )

            # Optionally, wait for some condition after executing the JS code : Contributed by (https://github.com/jonymusky)
            wait_for = kwargs.get('wait_for', False)
            if wait_for:
                if callable(wait_for):
                    print("[LOG] 🔄 Waiting for condition...")
                    WebDriverWait(self.driver, 20).until(wait_for)
                else:
                    print("[LOG] 🔄 Waiting for condition...")
                    WebDriverWait(self.driver, 20).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, wait_for))
                    )

            if not can_not_be_done_headless:
                html = sanitize_input_encode(self.driver.page_source)
            self.driver = self.execute_hook('before_return_html', self.driver, html)

            # Store in cache
            cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
            with open(cache_file_path, "w", encoding="utf-8") as f:
                f.write(html)

            if self.verbose:
                print(f"[LOG] ✅ Crawled {url} successfully!")

            return html
        except InvalidArgumentException as e:
            if not hasattr(e, 'msg'):
                e.msg = sanitize_input_encode(str(e))
            raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
        except WebDriverException as e:
            # If e does not have a msg attribute, create it and set it to str(e)
            if not hasattr(e, 'msg'):
                e.msg = sanitize_input_encode(str(e))
            raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
        except Exception as e:
            if not hasattr(e, 'msg'):
                e.msg = sanitize_input_encode(str(e))
            raise Exception(f"Failed to crawl {url}: {e.msg}")

    def take_screenshot(self) -> str:
        try:
            # Get the dimensions of the page
            total_width = self.driver.execute_script("return document.body.scrollWidth")
            total_height = self.driver.execute_script("return document.body.scrollHeight")

            # Set the window size to the dimensions of the page
            self.driver.set_window_size(total_width, total_height)

            # Take screenshot
            screenshot = self.driver.get_screenshot_as_png()

            # Open the screenshot with PIL
            image = Image.open(BytesIO(screenshot))

            # Convert image to RGB mode (this will handle both RGB and RGBA images)
            rgb_image = image.convert('RGB')

            # Convert to JPEG and compress
            buffered = BytesIO()
            rgb_image.save(buffered, format="JPEG", quality=85)
            img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')

            if self.verbose:
                print("[LOG] 📸 Screenshot taken and converted to base64")

            return img_base64
        except Exception as e:
            error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")
            print(error_message)

            # Generate an image with black background
            img = Image.new('RGB', (800, 600), color='black')
            draw = ImageDraw.Draw(img)

            # Load a font
            try:
                font = ImageFont.truetype("arial.ttf", 40)
            except IOError:
                font = ImageFont.load_default()

            # Define text color and wrap the text
            text_color = (255, 255, 255)
            max_width = 780
            wrapped_text = wrap_text(draw, error_message, font, max_width)

            # Calculate text position
            text_position = (10, 10)

            # Draw the text on the image
            draw.text(text_position, wrapped_text, fill=text_color, font=font)

            # Convert to base64
            buffered = BytesIO()
            img.save(buffered, format="JPEG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')

            return img_base64

    def quit(self):
        self.driver.quit()
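Example usage (not part of the file above): a minimal sketch of LocalSeleniumCrawlerStrategy, assuming a local Chrome install that Selenium 4.10+ can resolve via selenium-manager and that the ~/.crawl4ai/cache directory exists; the URL is a placeholder.

# Minimal sketch: crawl one page, grab a screenshot, and always release the driver.
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy

strategy = LocalSeleniumCrawlerStrategy(use_cached_html=False, verbose=True)
strategy.set_hook('before_get_url', lambda driver: driver)  # hooks must return a Chrome driver or None

try:
    html = strategy.crawl("https://example.com", wait_for="body")  # placeholder URL
    screenshot_b64 = strategy.take_screenshot()
    print(len(html), len(screenshot_b64))
finally:
    strategy.quit()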
crawl4ai/database.py
ADDED
@@ -0,0 +1,135 @@
import os
from pathlib import Path
import sqlite3
from typing import Optional, Tuple

DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
os.makedirs(DB_PATH, exist_ok=True)
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")

def init_db():
    global DB_PATH
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS crawled_data (
            url TEXT PRIMARY KEY,
            html TEXT,
            cleaned_html TEXT,
            markdown TEXT,
            extracted_content TEXT,
            success BOOLEAN,
            media TEXT DEFAULT "{}",
            links TEXT DEFAULT "{}",
            metadata TEXT DEFAULT "{}",
            screenshot TEXT DEFAULT ""
        )
    ''')
    conn.commit()
    conn.close()

def alter_db_add_screenshot(new_column: str = "media"):
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
        conn.commit()
        conn.close()
    except Exception as e:
        print(f"Error altering database to add screenshot column: {e}")

def check_db_path():
    if not DB_PATH:
        raise ValueError("Database path is not set or is empty.")

def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', (url,))
        result = cursor.fetchone()
        conn.close()
        return result
    except Exception as e:
        print(f"Error retrieving cached URL: {e}")
        return None

def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media: str = "{}", links: str = "{}", metadata: str = "{}", screenshot: str = ""):
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('''
            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                html = excluded.html,
                cleaned_html = excluded.cleaned_html,
                markdown = excluded.markdown,
                extracted_content = excluded.extracted_content,
                success = excluded.success,
                media = excluded.media,
                links = excluded.links,
                metadata = excluded.metadata,
                screenshot = excluded.screenshot
        ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
        conn.commit()
        conn.close()
    except Exception as e:
        print(f"Error caching URL: {e}")

def get_total_count() -> int:
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('SELECT COUNT(*) FROM crawled_data')
        result = cursor.fetchone()
        conn.close()
        return result[0]
    except Exception as e:
        print(f"Error getting total count: {e}")
        return 0

def clear_db():
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('DELETE FROM crawled_data')
        conn.commit()
        conn.close()
    except Exception as e:
        print(f"Error clearing database: {e}")

def flush_db():
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('DROP TABLE crawled_data')
        conn.commit()
        conn.close()
    except Exception as e:
        print(f"Error flushing database: {e}")

def update_existing_records(new_column: str = "media", default_value: str = "{}"):
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute(f'UPDATE crawled_data SET {new_column} = "{default_value}" WHERE screenshot IS NULL')
        conn.commit()
        conn.close()
    except Exception as e:
        print(f"Error updating existing records: {e}")

if __name__ == "__main__":
    # Delete the existing database file
    if os.path.exists(DB_PATH):
        os.remove(DB_PATH)
    init_db()
    # alter_db_add_screenshot("COL_NAME")
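Example usage (not part of the file above): a minimal sketch of the synchronous cache helpers, assuming `crawl4ai.database` imports cleanly in your environment; the URL and payload values are placeholders.

# Minimal sketch: create the table, upsert one row, then read it back.
from crawl4ai import database

database.init_db()
database.cache_url(
    url="https://example.com",          # placeholder
    html="<html>...</html>",
    cleaned_html="<div>...</div>",
    markdown="# Example",
    extracted_content="{}",
    success=True,
)
row = database.get_cached_url("https://example.com")
print(database.get_total_count(), row[0] if row else None)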
crawl4ai/docs_manager.py
ADDED
@@ -0,0 +1,67 @@
import requests
import shutil
from pathlib import Path
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.llmtxt import AsyncLLMTextManager

class DocsManager:
    def __init__(self, logger=None):
        self.docs_dir = Path.home() / ".crawl4ai" / "docs"
        self.local_docs = Path(__file__).parent.parent / "docs" / "llm.txt"
        self.docs_dir.mkdir(parents=True, exist_ok=True)
        self.logger = logger or AsyncLogger(verbose=True)
        self.llm_text = AsyncLLMTextManager(self.docs_dir, self.logger)

    async def ensure_docs_exist(self):
        """Fetch docs if not present"""
        if not any(self.docs_dir.iterdir()):
            await self.fetch_docs()

    async def fetch_docs(self) -> bool:
        """Copy from local docs or download from GitHub"""
        try:
            # Try local first
            if self.local_docs.exists() and (any(self.local_docs.glob("*.md")) or any(self.local_docs.glob("*.tokens"))):
                # Empty the local docs directory
                for file_path in self.docs_dir.glob("*.md"):
                    file_path.unlink()
                # for file_path in self.docs_dir.glob("*.tokens"):
                #     file_path.unlink()
                for file_path in self.local_docs.glob("*.md"):
                    shutil.copy2(file_path, self.docs_dir / file_path.name)
                # for file_path in self.local_docs.glob("*.tokens"):
                #     shutil.copy2(file_path, self.docs_dir / file_path.name)
                return True

            # Fallback to GitHub
            response = requests.get(
                "https://api.github.com/repos/unclecode/crawl4ai/contents/docs/llm.txt",
                headers={'Accept': 'application/vnd.github.v3+json'}
            )
            response.raise_for_status()

            for item in response.json():
                if item['type'] == 'file' and item['name'].endswith('.md'):
                    content = requests.get(item['download_url']).text
                    with open(self.docs_dir / item['name'], 'w', encoding='utf-8') as f:
                        f.write(content)
            return True

        except Exception as e:
            self.logger.error(f"Failed to fetch docs: {str(e)}")
            raise

    def list(self) -> list[str]:
        """List available topics"""
        names = [file_path.stem for file_path in self.docs_dir.glob("*.md")]
        # Remove [0-9]+_ prefix
        names = [name.split("_", 1)[1] if name[0].isdigit() else name for name in names]
        # Exclude those ending with .xs.md and .q.md
        names = [name for name in names if not name.endswith(".xs") and not name.endswith(".q")]
        return names

    def generate(self, sections, mode="extended"):
        return self.llm_text.generate(sections, mode)

    def search(self, query: str, top_k: int = 5):
        return self.llm_text.search(query, top_k)
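Example usage (not part of the file above): a minimal sketch of DocsManager, assuming AsyncLogger and AsyncLLMTextManager import cleanly and that either the local llm.txt docs or the GitHub fallback used by fetch_docs() is reachable; the search query is a placeholder.

# Minimal sketch: make sure the docs exist locally, then list and search topics.
import asyncio
from crawl4ai.docs_manager import DocsManager

async def main():
    docs = DocsManager()
    await docs.ensure_docs_exist()            # copies local docs or downloads from GitHub
    print(docs.list())                        # available topic names
    print(docs.search("markdown generation", top_k=3))  # placeholder query

asyncio.run(main())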
crawl4ai/extraction_strategy.bak.py
ADDED
@@ -0,0 +1,1440 @@
from abc import ABC, abstractmethod
from typing import Any, List, Dict, Optional, Union
from concurrent.futures import ThreadPoolExecutor, as_completed
import json, time
# from optimum.intel import IPEXModel
from .prompts import *
from .config import *
from .utils import *
from .models import *
from functools import partial
from .model_loader import *
import math
import numpy as np
import re
from bs4 import BeautifulSoup
from lxml import html, etree
from dataclasses import dataclass

class ExtractionStrategy(ABC):
    """
    Abstract base class for all extraction strategies.
    """

    def __init__(self, input_format: str = "markdown", **kwargs):
        """
        Initialize the extraction strategy.

        Args:
            input_format: Content format to use for extraction.
                          Options: "markdown" (default), "html", "fit_markdown"
            **kwargs: Additional keyword arguments
        """
        self.input_format = input_format
        self.DEL = "<|DEL|>"
        self.name = self.__class__.__name__
        self.verbose = kwargs.get("verbose", False)

    @abstractmethod
    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Extract meaningful blocks or chunks from the given HTML.

        :param url: The URL of the webpage.
        :param html: The HTML content of the webpage.
        :return: A list of extracted blocks or chunks.
        """
        pass

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Process sections of text in parallel by default.

        :param url: The URL of the webpage.
        :param sections: List of sections (strings) to process.
        :return: A list of processed JSON blocks.
        """
        extracted_content = []
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(self.extract, url, section, **kwargs) for section in sections]
            for future in as_completed(futures):
                extracted_content.extend(future.result())
        return extracted_content

class NoExtractionStrategy(ExtractionStrategy):
    """
    A strategy that does not extract any meaningful content from the HTML. It simply returns the entire HTML as a single block.
    """
    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Extract meaningful blocks or chunks from the given HTML.
        """
        return [{"index": 0, "content": html}]

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]

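# --- Illustrative sketch (assumed example, not part of extraction_strategy.bak.py) ---
# Minimal concrete subclass showing the ExtractionStrategy contract: only extract()
# is implemented; the inherited run() fans sections out over a ThreadPoolExecutor.
class _WordCountSketch(ExtractionStrategy):
    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        # One block per section, tagged with a simple word count.
        return [{"index": 0, "tags": [], "content": html, "word_count": len(html.split())}]

# blocks = _WordCountSketch().run("https://example.com", ["first section", "second, longer section"])
# --- end sketch ---
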
#######################################################
# Strategies using LLM-based extraction for text data #
#######################################################
class LLMExtractionStrategy(ExtractionStrategy):
    """
    A strategy that uses an LLM to extract meaningful content from the HTML.

    Attributes:
        provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
        api_token: The API token for the provider.
        instruction: The instruction to use for the LLM model.
        schema: Pydantic model schema for structured data.
        extraction_type: "block" or "schema".
        chunk_token_threshold: Maximum tokens per chunk.
        overlap_rate: Overlap between chunks.
        word_token_rate: Word to token conversion rate.
        apply_chunking: Whether to apply chunking.
        base_url: The base URL for the API request.
        api_base: The base URL for the API request.
        extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
        verbose: Whether to print verbose output.
        usages: List of individual token usages.
        total_usage: Accumulated token usage.
    """

    def __init__(self,
                 provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None,
                 instruction: str = None, schema: Dict = None, extraction_type = "block", **kwargs):
        """
        Initialize the LLM extraction strategy with the given parameters.

        Args:
            provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
            api_token: The API token for the provider.
            instruction: The instruction to use for the LLM model.
            schema: Pydantic model schema for structured data.
            extraction_type: "block" or "schema".
            chunk_token_threshold: Maximum tokens per chunk.
            overlap_rate: Overlap between chunks.
            word_token_rate: Word to token conversion rate.
            apply_chunking: Whether to apply chunking.
            base_url: The base URL for the API request.
            api_base: The base URL for the API request.
            extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
            verbose: Whether to print verbose output.
            usages: List of individual token usages.
            total_usage: Accumulated token usage.
        """
        super().__init__(**kwargs)
        self.provider = provider
        self.api_token = api_token or PROVIDER_MODELS.get(provider, "no-token") or os.getenv("OPENAI_API_KEY")
        self.instruction = instruction
        self.extract_type = extraction_type
        self.schema = schema
        if schema:
            self.extract_type = "schema"

        self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD)
        self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
        self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
        self.apply_chunking = kwargs.get("apply_chunking", True)
        self.base_url = kwargs.get("base_url", None)
        self.api_base = kwargs.get("api_base", kwargs.get("base_url", None))
        self.extra_args = kwargs.get("extra_args", {})
        if not self.apply_chunking:
            self.chunk_token_threshold = 1e9

        self.verbose = kwargs.get("verbose", False)
        self.usages = []  # Store individual usages
        self.total_usage = TokenUsage()  # Accumulated usage

        if not self.api_token:
            raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.")


    def extract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]:
        """
        Extract meaningful blocks or chunks from the given HTML using an LLM.

        How it works:
        1. Construct a prompt with variables.
        2. Make a request to the LLM using the prompt.
        3. Parse the response and extract blocks or chunks.

        Args:
            url: The URL of the webpage.
            ix: Index of the block.
            html: The HTML content of the webpage.

        Returns:
            A list of extracted blocks or chunks.
        """
        if self.verbose:
            # print("[LOG] Extracting blocks from URL:", url)
            print(f"[LOG] Call LLM for {url} - block index: {ix}")

        variable_values = {
            "URL": url,
            "HTML": escape_json_string(sanitize_html(html)),
        }

        prompt_with_variables = PROMPT_EXTRACT_BLOCKS
        if self.instruction:
            variable_values["REQUEST"] = self.instruction
            prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION

        if self.extract_type == "schema" and self.schema:
            variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
            prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION

        for variable in variable_values:
            prompt_with_variables = prompt_with_variables.replace(
                "{" + variable + "}", variable_values[variable]
            )

        response = perform_completion_with_backoff(
            self.provider,
            prompt_with_variables,
            self.api_token,
            base_url=self.api_base or self.base_url,
            extra_args = self.extra_args
        )  # , json_response=self.extract_type == "schema")
        # Track usage
        usage = TokenUsage(
            completion_tokens=response.usage.completion_tokens,
            prompt_tokens=response.usage.prompt_tokens,
            total_tokens=response.usage.total_tokens,
            completion_tokens_details=response.usage.completion_tokens_details.__dict__ if response.usage.completion_tokens_details else {},
            prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ if response.usage.prompt_tokens_details else {}
        )
        self.usages.append(usage)

        # Update totals
        self.total_usage.completion_tokens += usage.completion_tokens
        self.total_usage.prompt_tokens += usage.prompt_tokens
        self.total_usage.total_tokens += usage.total_tokens

        try:
            blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
            blocks = json.loads(blocks)
            for block in blocks:
                block['error'] = False
        except Exception as e:
            parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
            blocks = parsed
            if unparsed:
                blocks.append({
                    "index": 0,
                    "error": True,
                    "tags": ["error"],
                    "content": unparsed
                })

        if self.verbose:
            print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix)
        return blocks

    def _merge(self, documents, chunk_token_threshold, overlap):
        """
        Merge documents into sections based on chunk_token_threshold and overlap.
        """
        chunks = []
        sections = []
        total_tokens = 0

        # Calculate the total tokens across all documents
        for document in documents:
            total_tokens += len(document.split(' ')) * self.word_token_rate

        # Calculate the number of sections needed
        num_sections = math.floor(total_tokens / chunk_token_threshold)
        if num_sections < 1:
            num_sections = 1  # Ensure there is at least one section
        adjusted_chunk_threshold = total_tokens / num_sections

        total_token_so_far = 0
        current_chunk = []

        for document in documents:
            tokens = document.split(' ')
            token_count = len(tokens) * self.word_token_rate

            if total_token_so_far + token_count <= adjusted_chunk_threshold:
                current_chunk.extend(tokens)
                total_token_so_far += token_count
            else:
                # Ensure to handle the last section properly
                if len(sections) == num_sections - 1:
                    current_chunk.extend(tokens)
                    continue

                # Add overlap if specified
                if overlap > 0 and current_chunk:
                    overlap_tokens = current_chunk[-overlap:]
                    current_chunk.extend(overlap_tokens)

                sections.append(' '.join(current_chunk))
                current_chunk = tokens
                total_token_so_far = token_count

        # Add the last chunk
        if current_chunk:
            sections.append(' '.join(current_chunk))

        return sections


    def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
        """
        Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy.

        Args:
            url: The URL of the webpage.
            sections: List of sections (strings) to process.

        Returns:
            A list of extracted blocks or chunks.
        """

        merged_sections = self._merge(
            sections, self.chunk_token_threshold,
            overlap= int(self.chunk_token_threshold * self.overlap_rate)
        )
        extracted_content = []
        if self.provider.startswith("groq/"):
            # Sequential processing with a delay
            for ix, section in enumerate(merged_sections):
                extract_func = partial(self.extract, url)
                extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
                time.sleep(0.5)  # 500 ms delay between each processing
        else:
            # Parallel processing using ThreadPoolExecutor
            # extract_func = partial(self.extract, url)
            # for ix, section in enumerate(merged_sections):
            #     extracted_content.append(extract_func(ix, section))

            with ThreadPoolExecutor(max_workers=4) as executor:
                extract_func = partial(self.extract, url)
                futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)]

                for future in as_completed(futures):
                    try:
                        extracted_content.extend(future.result())
                    except Exception as e:
                        if self.verbose:
                            print(f"Error in thread execution: {e}")
                        # Add error information to extracted_content
                        extracted_content.append({
                            "index": 0,
                            "error": True,
                            "tags": ["error"],
                            "content": str(e)
                        })


        return extracted_content


    def show_usage(self) -> None:
        """Print a detailed token usage report showing total and per-request usage."""
        print("\n=== Token Usage Summary ===")
        print(f"{'Type':<15} {'Count':>12}")
        print("-" * 30)
        print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}")
        print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}")
        print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}")

        print("\n=== Usage History ===")
        print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}")
        print("-" * 48)
        for i, usage in enumerate(self.usages, 1):
            print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} {usage.total_tokens:>12,}")

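# --- Illustrative sketch (assumed example, not part of extraction_strategy.bak.py) ---
# Typical configuration of the class above: a <provider>/<model> string, a token, and a
# free-form instruction ("block" mode); passing `schema=` switches it to "schema" mode.
# The provider id and token below are placeholders, not values mandated by the library.
_llm_sketch = LLMExtractionStrategy(
    provider="ollama/llama3.3",        # placeholder, format <provider_name>/<model_name>
    api_token="no-token",              # placeholder; replace with a real key for hosted providers
    instruction="List every product name and price mentioned on the page.",
    chunk_token_threshold=2048,        # chunking knobs documented in the class docstring above
    overlap_rate=0.1,
    verbose=True,
)
# blocks = _llm_sketch.run("https://example.com/shop", ["...page markdown or HTML..."])
# _llm_sketch.show_usage()             # prints the token-usage report implemented above
# --- end sketch ---
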
#######################################################
# Strategies using clustering for text data extraction #
#######################################################

class CosineStrategy(ExtractionStrategy):
    """
    Extract meaningful blocks or chunks from the given HTML using cosine similarity.

    How it works:
    1. Pre-filter documents using embeddings and semantic_filter.
    2. Perform clustering using cosine similarity.
    3. Organize texts by their cluster labels, retaining order.
    4. Filter clusters by word count.
    5. Extract meaningful blocks or chunks from the filtered clusters.

    Attributes:
        semantic_filter (str): A keyword filter for document filtering.
        word_count_threshold (int): Minimum number of words per cluster.
        max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
        linkage_method (str): The linkage method for hierarchical clustering.
        top_k (int): Number of top categories to extract.
        model_name (str): The name of the sentence-transformers model.
        sim_threshold (float): The similarity threshold for clustering.
    """
    def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs):
        """
        Initialize the strategy with clustering parameters.

        Args:
            semantic_filter (str): A keyword filter for document filtering.
            word_count_threshold (int): Minimum number of words per cluster.
            max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
            linkage_method (str): The linkage method for hierarchical clustering.
            top_k (int): Number of top categories to extract.
        """
        super().__init__(**kwargs)

        import numpy as np

        self.semantic_filter = semantic_filter
        self.word_count_threshold = word_count_threshold
        self.max_dist = max_dist
        self.linkage_method = linkage_method
        self.top_k = top_k
        self.sim_threshold = sim_threshold
        self.timer = time.time()
        self.verbose = kwargs.get("verbose", False)

        self.buffer_embeddings = np.array([])
        self.get_embedding_method = "direct"

        self.device = get_device()
        # import torch
        # self.device = torch.device('cpu')

        self.default_batch_size = calculate_batch_size(self.device)

        if self.verbose:
            print(f"[LOG] Loading Extraction Model for {self.device.type} device.")

        # if False and self.device.type == "cpu":
        #     self.model = load_onnx_all_MiniLM_l6_v2()
        #     self.tokenizer = self.model.tokenizer
        #     self.get_embedding_method = "direct"
        # else:

        self.tokenizer, self.model = load_HF_embedding_model(model_name)
        self.model.to(self.device)
        self.model.eval()

        self.get_embedding_method = "batch"

        self.buffer_embeddings = np.array([])

        # if model_name == "bert-base-uncased":
        #     self.tokenizer, self.model = load_bert_base_uncased()
        #     self.model.eval()  # Ensure the model is in evaluation mode
        #     self.get_embedding_method = "batch"
        # elif model_name == "BAAI/bge-small-en-v1.5":
        #     self.tokenizer, self.model = load_bge_small_en_v1_5()
        #     self.model.eval()  # Ensure the model is in evaluation mode
        #     self.get_embedding_method = "batch"
        # elif model_name == "sentence-transformers/all-MiniLM-L6-v2":
        #     self.model = load_onnx_all_MiniLM_l6_v2()
        #     self.tokenizer = self.model.tokenizer
        #     self.get_embedding_method = "direct"


        if self.verbose:
            print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.")

        self.nlp, _ = load_text_multilabel_classifier()
        # self.default_batch_size = 16 if self.device.type == 'cpu' else 64

        if self.verbose:
            print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")

    def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, at_least_k: int = 20) -> List[str]:
        """
        Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding.

        Args:
            documents (List[str]): A list of document texts.
            semantic_filter (str): A keyword filter for document filtering.
            at_least_k (int): The minimum number of documents to return.

        Returns:
            List[str]: A list of filtered and sorted document texts.
        """

        if not semantic_filter:
            return documents

        if len(documents) < at_least_k:
            at_least_k = len(documents) // 2

        from sklearn.metrics.pairwise import cosine_similarity

        # Compute embedding for the keyword filter
        query_embedding = self.get_embeddings([semantic_filter])[0]

        # Compute embeddings for the documents
        document_embeddings = self.get_embeddings(documents)

        # Calculate cosine similarity between the query embedding and document embeddings
        similarities = cosine_similarity([query_embedding], document_embeddings).flatten()

        # Filter documents based on the similarity threshold
        filtered_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim >= self.sim_threshold]

        # If the number of filtered documents is less than at_least_k, sort remaining documents by similarity
        if len(filtered_docs) < at_least_k:
            remaining_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim < self.sim_threshold]
            remaining_docs.sort(key=lambda x: x[1], reverse=True)
            filtered_docs.extend(remaining_docs[:at_least_k - len(filtered_docs)])

        # Extract the document texts from the tuples
        filtered_docs = [doc for doc, _ in filtered_docs]

        return filtered_docs[:at_least_k]

    def get_embeddings(self, sentences: List[str], batch_size=None, bypass_buffer=False):
        """
        Get BERT embeddings for a list of sentences.

        Args:
            sentences (List[str]): A list of text chunks (sentences).

        Returns:
            NumPy array of embeddings.
        """
        # if self.buffer_embeddings.any() and not bypass_buffer:
        #     return self.buffer_embeddings

        if self.device.type in ["cpu", "gpu", "cuda", "mps"]:
            import torch
            # Tokenize sentences and convert to tensor
            if batch_size is None:
                batch_size = self.default_batch_size

            all_embeddings = []
            for i in range(0, len(sentences), batch_size):
                batch_sentences = sentences[i:i + batch_size]
                encoded_input = self.tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')
                encoded_input = {key: tensor.to(self.device) for key, tensor in encoded_input.items()}

                # Ensure no gradients are calculated
                with torch.no_grad():
                    model_output = self.model(**encoded_input)

                # Get embeddings from the last hidden state (mean pooling)
                embeddings = model_output.last_hidden_state.mean(dim=1).cpu().numpy()
                all_embeddings.append(embeddings)

            self.buffer_embeddings = np.vstack(all_embeddings)
        elif self.device.type == "cpu":
            # self.buffer_embeddings = self.model(sentences)
            if batch_size is None:
                batch_size = self.default_batch_size

            all_embeddings = []
            for i in range(0, len(sentences), batch_size):
                batch_sentences = sentences[i:i + batch_size]
                embeddings = self.model(batch_sentences)
                all_embeddings.append(embeddings)

            self.buffer_embeddings = np.vstack(all_embeddings)
        return self.buffer_embeddings

    def hierarchical_clustering(self, sentences: List[str], embeddings = None):
        """
        Perform hierarchical clustering on sentences and return cluster labels.

        Args:
            sentences (List[str]): A list of text chunks (sentences).

        Returns:
            NumPy array of cluster labels.
        """
        # Get embeddings
        from scipy.cluster.hierarchy import linkage, fcluster
        from scipy.spatial.distance import pdist
        self.timer = time.time()
        embeddings = self.get_embeddings(sentences, bypass_buffer=True)
        # print(f"[LOG] 🚀 Embeddings computed in {time.time() - self.timer:.2f} seconds")
        # Compute pairwise cosine distances
        distance_matrix = pdist(embeddings, 'cosine')
        # Perform agglomerative clustering respecting order
        linked = linkage(distance_matrix, method=self.linkage_method)
        # Form flat clusters
        labels = fcluster(linked, self.max_dist, criterion='distance')
        return labels

    def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]) -> Dict[int, List[str]]:
        """
        Filter clusters to remove those with a word count below the threshold.

        Args:
            clusters (Dict[int, List[str]]): Dictionary of clusters.

        Returns:
            Dict[int, List[str]]: Filtered dictionary of clusters.
        """
        filtered_clusters = {}
        for cluster_id, texts in clusters.items():
            # Concatenate texts for analysis
            full_text = " ".join(texts)
            # Count words
            word_count = len(full_text.split())

            # Keep clusters with word count above the threshold
            if word_count >= self.word_count_threshold:
                filtered_clusters[cluster_id] = texts

        return filtered_clusters

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Extract clusters from HTML content using hierarchical clustering.

        Args:
            url (str): The URL of the webpage.
            html (str): The HTML content of the webpage.

        Returns:
            List[Dict[str, Any]]: A list of processed JSON blocks.
        """
        # Assume `html` is a list of text chunks for this strategy
        t = time.time()
        text_chunks = html.split(self.DEL)  # Split by lines or paragraphs as needed

        # Pre-filter documents using embeddings and semantic_filter
        text_chunks = self.filter_documents_embeddings(text_chunks, self.semantic_filter)

        if not text_chunks:
            return []

        # Perform clustering
        labels = self.hierarchical_clustering(text_chunks)
        # print(f"[LOG] 🚀 Clustering done in {time.time() - t:.2f} seconds")

        # Organize texts by their cluster labels, retaining order
        t = time.time()
        clusters = {}
        for index, label in enumerate(labels):
            clusters.setdefault(label, []).append(text_chunks[index])

        # Filter clusters by word count
        filtered_clusters = self.filter_clusters_by_word_count(clusters)

        # Convert filtered clusters to a sorted list of dictionaries
        cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]

        if self.verbose:
            print(f"[LOG] 🚀 Assign tags using {self.device}")

        if self.device.type in ["gpu", "cuda", "mps", "cpu"]:
            labels = self.nlp([cluster['content'] for cluster in cluster_list])

            for cluster, label in zip(cluster_list, labels):
                cluster['tags'] = label
        # elif self.device.type == "cpu":
        #     # Process the text with the loaded model
        #     texts = [cluster['content'] for cluster in cluster_list]
        #     # Batch process texts
        #     docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"])

        #     for doc, cluster in zip(docs, cluster_list):
        #         tok_k = self.top_k
        #         top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
        #         cluster['tags'] = [cat for cat, _ in top_categories]

        #     for cluster in cluster_list:
        #         doc = self.nlp(cluster['content'])
        #         tok_k = self.top_k
        #         top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
        #         cluster['tags'] = [cat for cat, _ in top_categories]

        if self.verbose:
            print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")

        return cluster_list

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Process sections using hierarchical clustering.

        Args:
            url (str): The URL of the webpage.
            sections (List[str]): List of sections (strings) to process.

        Returns:
            List[Dict[str, Any]]: A list of processed JSON blocks.
        """
        # This strategy processes all sections together

        return self.extract(url, self.DEL.join(sections), **kwargs)

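# --- Illustrative sketch (assumed example, not part of extraction_strategy.bak.py) ---
# The clustering strategy above works on plain text chunks joined with self.DEL, so
# run() takes pre-split sections and semantic_filter biases the embedding pre-filter.
# Note: construction loads the embedding and classifier models (downloaded on first use).
_cosine_sketch = CosineStrategy(
    semantic_filter="pricing and subscription plans",   # hypothetical filter phrase
    word_count_threshold=5,
    max_dist=0.2,
    top_k=3,
    verbose=True,
)
# clusters = _cosine_sketch.run("https://example.com", ["chunk one ...", "chunk two ..."])
# --- end sketch ---
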
667 |
+
|
668 |
+
#######################################################
|
669 |
+
# New extraction strategies for JSON-based extraction #
|
670 |
+
#######################################################
|
671 |
+
|
672 |
+
class JsonElementExtractionStrategy(ExtractionStrategy):
|
673 |
+
"""
|
674 |
+
Abstract base class for extracting structured JSON from HTML content.
|
675 |
+
|
676 |
+
How it works:
|
677 |
+
1. Parses HTML content using the `_parse_html` method.
|
678 |
+
2. Uses a schema to define base selectors, fields, and transformations.
|
679 |
+
3. Extracts data hierarchically, supporting nested fields and lists.
|
680 |
+
4. Handles computed fields with expressions or functions.
|
681 |
+
|
682 |
+
Attributes:
|
683 |
+
DEL (str): Delimiter used to combine HTML sections. Defaults to '\n'.
|
684 |
+
schema (Dict[str, Any]): The schema defining the extraction rules.
|
685 |
+
verbose (bool): Enables verbose logging for debugging purposes.
|
686 |
+
|
687 |
+
Methods:
|
688 |
+
extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content.
|
689 |
+
_extract_item(element, fields): Extracts fields from a single element.
|
690 |
+
_extract_single_field(element, field): Extracts a single field based on its type.
|
691 |
+
_apply_transform(value, transform): Applies a transformation to a value.
|
692 |
+
_compute_field(item, field): Computes a field value using an expression or function.
|
693 |
+
run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy.
|
694 |
+
|
695 |
+
Abstract Methods:
|
696 |
+
_parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml).
|
697 |
+
_get_base_elements(parsed_html, selector): Retrieves base elements using a selector.
|
698 |
+
_get_elements(element, selector): Retrieves child elements using a selector.
|
699 |
+
_get_element_text(element): Extracts text content from an element.
|
700 |
+
_get_element_html(element): Extracts raw HTML from an element.
|
701 |
+
_get_element_attribute(element, attribute): Extracts an attribute's value from an element.
|
702 |
+
"""
|
703 |
+
|
704 |
+
|
705 |
+
DEL = '\n'
|
706 |
+
|
707 |
+
def __init__(self, schema: Dict[str, Any], **kwargs):
|
708 |
+
"""
|
709 |
+
Initialize the JSON element extraction strategy with a schema.
|
710 |
+
|
711 |
+
Args:
|
712 |
+
schema (Dict[str, Any]): The schema defining the extraction rules.
|
713 |
+
"""
|
714 |
+
super().__init__(**kwargs)
|
715 |
+
self.schema = schema
|
716 |
+
self.verbose = kwargs.get('verbose', False)
|
717 |
+
|
718 |
+
def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
|
719 |
+
"""
|
720 |
+
Extract structured data from HTML content.
|
721 |
+
|
722 |
+
How it works:
|
723 |
+
1. Parses the HTML content using the `_parse_html` method.
|
724 |
+
2. Identifies base elements using the schema's base selector.
|
725 |
+
3. Extracts fields from each base element using `_extract_item`.
|
726 |
+
|
727 |
+
Args:
|
728 |
+
url (str): The URL of the page being processed.
|
729 |
+
html_content (str): The raw HTML content to parse and extract.
|
730 |
+
*q: Additional positional arguments.
|
731 |
+
**kwargs: Additional keyword arguments for custom extraction.
|
732 |
+
|
733 |
+
Returns:
|
734 |
+
List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary.
|
735 |
+
"""
|
736 |
+
|
737 |
+
parsed_html = self._parse_html(html_content)
|
738 |
+
base_elements = self._get_base_elements(parsed_html, self.schema['baseSelector'])
|
739 |
+
|
740 |
+
results = []
|
741 |
+
for element in base_elements:
|
742 |
+
# Extract base element attributes
|
743 |
+
item = {}
|
744 |
+
if 'baseFields' in self.schema:
|
745 |
+
for field in self.schema['baseFields']:
|
746 |
+
value = self._extract_single_field(element, field)
|
747 |
+
if value is not None:
|
748 |
+
item[field['name']] = value
|
749 |
+
|
750 |
+
# Extract child fields
|
751 |
+
field_data = self._extract_item(element, self.schema['fields'])
|
752 |
+
item.update(field_data)
|
753 |
+
|
754 |
+
if item:
|
755 |
+
results.append(item)
|
756 |
+
|
757 |
+
return results
|
758 |
+
|
759 |
+
@abstractmethod
|
760 |
+
def _parse_html(self, html_content: str):
|
761 |
+
"""Parse HTML content into appropriate format"""
|
762 |
+
pass
|
763 |
+
|
764 |
+
@abstractmethod
|
765 |
+
def _get_base_elements(self, parsed_html, selector: str):
|
766 |
+
"""Get all base elements using the selector"""
|
767 |
+
pass
|
768 |
+
|
769 |
+
@abstractmethod
|
770 |
+
def _get_elements(self, element, selector: str):
|
771 |
+
"""Get child elements using the selector"""
|
772 |
+
pass
|
773 |
+
|
774 |
+
def _extract_field(self, element, field):
|
775 |
+
try:
|
776 |
+
if field['type'] == 'nested':
|
777 |
+
nested_elements = self._get_elements(element, field['selector'])
|
778 |
+
nested_element = nested_elements[0] if nested_elements else None
|
779 |
+
return self._extract_item(nested_element, field['fields']) if nested_element else {}
|
780 |
+
|
781 |
+
if field['type'] == 'list':
|
782 |
+
elements = self._get_elements(element, field['selector'])
|
783 |
+
return [self._extract_list_item(el, field['fields']) for el in elements]
|
784 |
+
|
785 |
+
if field['type'] == 'nested_list':
|
786 |
+
elements = self._get_elements(element, field['selector'])
|
787 |
+
return [self._extract_item(el, field['fields']) for el in elements]
|
788 |
+
|
789 |
+
return self._extract_single_field(element, field)
|
790 |
+
except Exception as e:
|
791 |
+
if self.verbose:
|
792 |
+
print(f"Error extracting field {field['name']}: {str(e)}")
|
793 |
+
return field.get('default')
|
794 |
+
|
795 |
+
def _extract_single_field(self, element, field):
|
796 |
+
"""
|
797 |
+
Extract a single field based on its type.
|
798 |
+
|
799 |
+
How it works:
|
800 |
+
1. Selects the target element using the field's selector.
|
801 |
+
2. Extracts the field value based on its type (e.g., text, attribute, regex).
|
802 |
+
3. Applies transformations if defined in the schema.
|
803 |
+
|
804 |
+
Args:
|
805 |
+
element: The base element to extract the field from.
|
806 |
+
field (Dict[str, Any]): The field definition in the schema.
|
807 |
+
|
808 |
+
Returns:
|
809 |
+
Any: The extracted field value.
|
810 |
+
"""
|
811 |
+
|
812 |
+
if 'selector' in field:
|
813 |
+
selected = self._get_elements(element, field['selector'])
|
814 |
+
if not selected:
|
815 |
+
return field.get('default')
|
816 |
+
selected = selected[0]
|
817 |
+
else:
|
818 |
+
selected = element
|
819 |
+
|
820 |
+
value = None
|
821 |
+
if field['type'] == 'text':
|
822 |
+
value = self._get_element_text(selected)
|
823 |
+
elif field['type'] == 'attribute':
|
824 |
+
value = self._get_element_attribute(selected, field['attribute'])
|
825 |
+
elif field['type'] == 'html':
|
826 |
+
value = self._get_element_html(selected)
|
827 |
+
elif field['type'] == 'regex':
|
828 |
+
text = self._get_element_text(selected)
|
829 |
+
match = re.search(field['pattern'], text)
|
830 |
+
value = match.group(1) if match else None
|
831 |
+
|
832 |
+
if 'transform' in field:
|
833 |
+
value = self._apply_transform(value, field['transform'])
|
834 |
+
|
835 |
+
return value if value is not None else field.get('default')
|
836 |
+
|
837 |
+
def _extract_list_item(self, element, fields):
|
838 |
+
item = {}
|
839 |
+
for field in fields:
|
840 |
+
value = self._extract_single_field(element, field)
|
841 |
+
if value is not None:
|
842 |
+
item[field['name']] = value
|
843 |
+
return item
|
844 |
+
|
845 |
+
def _extract_item(self, element, fields):
|
846 |
+
"""
|
847 |
+
Extracts fields from a given element.
|
848 |
+
|
849 |
+
How it works:
|
850 |
+
1. Iterates through the fields defined in the schema.
|
851 |
+
2. Handles computed, single, and nested field types.
|
852 |
+
3. Updates the item dictionary with extracted field values.
|
853 |
+
|
854 |
+
Args:
|
855 |
+
element: The base element to extract fields from.
|
856 |
+
fields (List[Dict[str, Any]]): The list of fields to extract.
|
857 |
+
|
858 |
+
Returns:
|
859 |
+
Dict[str, Any]: A dictionary representing the extracted item.
|
860 |
+
"""
|
861 |
+
|
862 |
+
item = {}
|
863 |
+
for field in fields:
|
864 |
+
if field['type'] == 'computed':
|
865 |
+
value = self._compute_field(item, field)
|
866 |
+
else:
|
867 |
+
value = self._extract_field(element, field)
|
868 |
+
if value is not None:
|
869 |
+
item[field['name']] = value
|
870 |
+
return item
|
871 |
+
|
872 |
+
def _apply_transform(self, value, transform):
|
873 |
+
"""
|
874 |
+
Apply a transformation to a value.
|
875 |
+
|
876 |
+
How it works:
|
877 |
+
1. Checks the transformation type (e.g., `lowercase`, `strip`).
|
878 |
+
2. Applies the transformation to the value.
|
879 |
+
3. Returns the transformed value.
|
880 |
+
|
881 |
+
Args:
|
882 |
+
value (str): The value to transform.
|
883 |
+
transform (str): The type of transformation to apply.
|
884 |
+
|
885 |
+
Returns:
|
886 |
+
str: The transformed value.
|
887 |
+
"""
|
888 |
+
|
889 |
+
if transform == 'lowercase':
|
890 |
+
return value.lower()
|
891 |
+
elif transform == 'uppercase':
|
892 |
+
return value.upper()
|
893 |
+
elif transform == 'strip':
|
894 |
+
return value.strip()
|
895 |
+
return value
|
896 |
+
|
897 |
+
def _compute_field(self, item, field):
|
898 |
+
try:
|
899 |
+
if 'expression' in field:
|
900 |
+
return eval(field['expression'], {}, item)
|
901 |
+
elif 'function' in field:
|
902 |
+
return field['function'](item)
|
903 |
+
except Exception as e:
|
904 |
+
if self.verbose:
|
905 |
+
print(f"Error computing field {field['name']}: {str(e)}")
|
906 |
+
return field.get('default')
|
907 |
+
|
908 |
+
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
|
909 |
+
"""
|
910 |
+
Run the extraction strategy on a combined HTML content.
|
911 |
+
|
912 |
+
How it works:
|
913 |
+
1. Combines multiple HTML sections using the `DEL` delimiter.
|
914 |
+
2. Calls the `extract` method with the combined HTML.
|
915 |
+
|
916 |
+
Args:
|
917 |
+
url (str): The URL of the page being processed.
|
918 |
+
sections (List[str]): A list of HTML sections.
|
919 |
+
*q: Additional positional arguments.
|
920 |
+
**kwargs: Additional keyword arguments for custom extraction.
|
921 |
+
|
922 |
+
Returns:
|
923 |
+
List[Dict[str, Any]]: A list of extracted items.
|
924 |
+
"""
|
925 |
+
|
926 |
+
combined_html = self.DEL.join(sections)
|
927 |
+
return self.extract(url, combined_html, **kwargs)
|
928 |
+
|
929 |
+
@abstractmethod
|
930 |
+
def _get_element_text(self, element) -> str:
|
931 |
+
"""Get text content from element"""
|
932 |
+
pass
|
933 |
+
|
934 |
+
@abstractmethod
|
935 |
+
def _get_element_html(self, element) -> str:
|
936 |
+
"""Get HTML content from element"""
|
937 |
+
pass
|
938 |
+
|
939 |
+
@abstractmethod
|
940 |
+
def _get_element_attribute(self, element, attribute: str):
|
941 |
+
"""Get attribute value from element"""
|
942 |
+
pass
|
943 |
+
|
944 |
+
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
|
945 |
+
"""
|
946 |
+
Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.
|
947 |
+
|
948 |
+
How it works:
|
949 |
+
1. Parses HTML content with BeautifulSoup.
|
950 |
+
2. Selects elements using CSS selectors defined in the schema.
|
951 |
+
3. Extracts field data and applies transformations as defined.
|
952 |
+
|
953 |
+
Attributes:
|
954 |
+
schema (Dict[str, Any]): The schema defining the extraction rules.
|
955 |
+
verbose (bool): Enables verbose logging for debugging purposes.
|
956 |
+
|
957 |
+
Methods:
|
958 |
+
_parse_html(html_content): Parses HTML content into a BeautifulSoup object.
|
959 |
+
_get_base_elements(parsed_html, selector): Selects base elements using a CSS selector.
|
960 |
+
_get_elements(element, selector): Selects child elements using a CSS selector.
|
961 |
+
_get_element_text(element): Extracts text content from a BeautifulSoup element.
|
962 |
+
_get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element.
|
963 |
+
_get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element.
|
964 |
+
"""
|
965 |
+
|
966 |
+
def __init__(self, schema: Dict[str, Any], **kwargs):
|
967 |
+
kwargs['input_format'] = 'html' # Force HTML input
|
968 |
+
super().__init__(schema, **kwargs)
|
969 |
+
|
970 |
+
def _parse_html(self, html_content: str):
|
971 |
+
return BeautifulSoup(html_content, 'html.parser')
|
972 |
+
|
973 |
+
def _get_base_elements(self, parsed_html, selector: str):
|
974 |
+
return parsed_html.select(selector)
|
975 |
+
|
976 |
+
def _get_elements(self, element, selector: str):
|
977 |
+
selected = element.select_one(selector)
|
978 |
+
return [selected] if selected else []
|
979 |
+
|
980 |
+
def _get_element_text(self, element) -> str:
|
981 |
+
return element.get_text(strip=True)
|
982 |
+
|
983 |
+
def _get_element_html(self, element) -> str:
|
984 |
+
return str(element)
|
985 |
+
|
986 |
+
def _get_element_attribute(self, element, attribute: str):
|
987 |
+
return element.get(attribute)
|
988 |
+
|
989 |
+
class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
|
990 |
+
"""
|
991 |
+
Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors.
|
992 |
+
|
993 |
+
How it works:
|
994 |
+
1. Parses HTML content into an lxml tree.
|
995 |
+
2. Selects elements using XPath expressions.
|
996 |
+
3. Converts CSS selectors to XPath when needed.
|
997 |
+
|
998 |
+
Attributes:
|
999 |
+
schema (Dict[str, Any]): The schema defining the extraction rules.
|
1000 |
+
verbose (bool): Enables verbose logging for debugging purposes.
|
1001 |
+
|
1002 |
+
Methods:
|
1003 |
+
_parse_html(html_content): Parses HTML content into an lxml tree.
|
1004 |
+
_get_base_elements(parsed_html, selector): Selects base elements using an XPath selector.
|
1005 |
+
_css_to_xpath(css_selector): Converts a CSS selector to an XPath expression.
|
1006 |
+
_get_elements(element, selector): Selects child elements using an XPath selector.
|
1007 |
+
_get_element_text(element): Extracts text content from an lxml element.
|
1008 |
+
_get_element_html(element): Extracts the raw HTML content of an lxml element.
|
1009 |
+
_get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element.
|
1010 |
+
"""
|
1011 |
+
|
1012 |
+
def __init__(self, schema: Dict[str, Any], **kwargs):
|
1013 |
+
kwargs['input_format'] = 'html' # Force HTML input
|
1014 |
+
super().__init__(schema, **kwargs)
|
1015 |
+
|
1016 |
+
def _parse_html(self, html_content: str):
|
1017 |
+
return html.fromstring(html_content)
|
1018 |
+
|
1019 |
+
def _get_base_elements(self, parsed_html, selector: str):
|
1020 |
+
return parsed_html.xpath(selector)
|
1021 |
+
|
1022 |
+
def _css_to_xpath(self, css_selector: str) -> str:
|
1023 |
+
"""Convert CSS selector to XPath if needed"""
|
1024 |
+
if '/' in css_selector: # Already an XPath
|
1025 |
+
return css_selector
|
1026 |
+
return self._basic_css_to_xpath(css_selector)
|
1027 |
+
|
1028 |
+
def _basic_css_to_xpath(self, css_selector: str) -> str:
|
1029 |
+
"""Basic CSS to XPath conversion for common cases"""
|
1030 |
+
if ' > ' in css_selector:
|
1031 |
+
parts = css_selector.split(' > ')
|
1032 |
+
return '//' + '/'.join(parts)
|
1033 |
+
if ' ' in css_selector:
|
1034 |
+
parts = css_selector.split(' ')
|
1035 |
+
return '//' + '//'.join(parts)
|
1036 |
+
return '//' + css_selector
|
1037 |
+
|
1038 |
+
def _get_elements(self, element, selector: str):
|
1039 |
+
xpath = self._css_to_xpath(selector)
|
1040 |
+
if not xpath.startswith('.'):
|
1041 |
+
xpath = '.' + xpath
|
1042 |
+
return element.xpath(xpath)
|
1043 |
+
|
1044 |
+
def _get_element_text(self, element) -> str:
|
1045 |
+
return ''.join(element.xpath('.//text()')).strip()
|
1046 |
+
|
1047 |
+
def _get_element_html(self, element) -> str:
|
1048 |
+
return etree.tostring(element, encoding='unicode')
|
1049 |
+
|
1050 |
+
def _get_element_attribute(self, element, attribute: str):
|
1051 |
+
return element.get(attribute)
|
1052 |
+
|
1053 |
+
|
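# --- Illustrative sketch (assumed example, not part of extraction_strategy.bak.py) ---
# The same schema style works with the XPath variant; _css_to_xpath above applies only a
# basic heuristic (' > ' -> '/', ' ' -> '//'), so selectors may also be written as XPath.
print(JsonXPathExtractionStrategy(
    {"baseSelector": "//li[@class='item']",
     "fields": [{"name": "label", "selector": ".//span", "type": "text"}]},
    verbose=True,
).extract(
    "https://example.com",
    "<ul><li class='item'><span>alpha</span></li><li class='item'><span>beta</span></li></ul>",
))
# Expected: [{'label': 'alpha'}, {'label': 'beta'}]
# --- end sketch ---
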
#######################################################
# Strategies based on the extraction of specific types#
#######################################################

class TopicExtractionStrategy(ExtractionStrategy):
    def __init__(self, num_keywords: int = 3, **kwargs):
        """
        Initialize the topic extraction strategy with parameters for topic segmentation.

        :param num_keywords: Number of keywords to represent each topic segment.
        """
        import nltk
        super().__init__(**kwargs)
        self.num_keywords = num_keywords
        self.tokenizer = nltk.TextTilingTokenizer()

    def extract_keywords(self, text: str) -> List[str]:
        """
        Extract keywords from a given text segment using simple frequency analysis.

        :param text: The text segment from which to extract keywords.
        :return: A list of keyword strings.
        """
        import nltk
        # Tokenize the text and compute word frequency
        words = nltk.word_tokenize(text)
        freq_dist = nltk.FreqDist(words)
        # Get the most common words as keywords
        keywords = [word for (word, _) in freq_dist.most_common(self.num_keywords)]
        return keywords

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Extract topics from HTML content using TextTiling for segmentation and keyword extraction.

        :param url: The URL of the webpage.
        :param html: The HTML content of the webpage.
        :param provider: The provider to be used for extraction (not used here).
        :param api_token: Optional API token for the provider (not used here).
        :return: A list of dictionaries representing the topics.
        """
        # Use TextTiling to segment the text into topics
        segmented_topics = html.split(self.DEL)  # Split by lines or paragraphs as needed

        # Prepare the output as a list of dictionaries
        topic_list = []
        for i, segment in enumerate(segmented_topics):
            # Extract keywords for each segment
            keywords = self.extract_keywords(segment)
            topic_list.append({
                "index": i,
                "content": segment,
                "keywords": keywords
            })

        return topic_list

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Process sections using topic segmentation and keyword extraction.

        :param url: The URL of the webpage.
        :param sections: List of sections (strings) to process.
        :param provider: The provider to be used for extraction (not used here).
        :param api_token: Optional API token for the provider (not used here).
        :return: A list of processed JSON blocks.
        """
        # Concatenate sections into a single text for coherent topic segmentation

        return self.extract(url, self.DEL.join(sections), **kwargs)

class ContentSummarizationStrategy(ExtractionStrategy):
    def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6", **kwargs):
        """
        Initialize the content summarization strategy with a specific model.

        :param model_name: The model to use for summarization.
        """
        super().__init__(**kwargs)
        from transformers import pipeline
        self.summarizer = pipeline("summarization", model=model_name)

    def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Summarize a single section of text.

        :param url: The URL of the webpage.
        :param text: A section of text to summarize.
        :param provider: The provider to be used for extraction (not used here).
        :param api_token: Optional API token for the provider (not used here).
        :return: A dictionary with the summary.
        """
        try:
            summary = self.summarizer(text, max_length=130, min_length=30, do_sample=False)
            return {"summary": summary[0]['summary_text']}
        except Exception as e:
            print(f"Error summarizing text: {e}")
            return {"summary": text}  # Fallback to original text if summarization fails

    def run(self, url: str, sections: List[str], provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Process each section in parallel to produce summaries.

        :param url: The URL of the webpage.
        :param sections: List of sections (strings) to summarize.
        :param provider: The provider to be used for extraction (not used here).
        :param api_token: Optional API token for the provider (not used here).
        :return: A list of dictionaries with summaries for each section.
        """
        # Use a ThreadPoolExecutor to summarize in parallel
        summaries = []
        with ThreadPoolExecutor() as executor:
            # Create a future for each section's summarization
            future_to_section = {executor.submit(self.extract, url, section, provider, api_token): i for i, section in enumerate(sections)}
            for future in as_completed(future_to_section):
                section_index = future_to_section[future]
                try:
                    summary_result = future.result()
                    summaries.append((section_index, summary_result))
                except Exception as e:
                    print(f"Error processing section {section_index}: {e}")
                    summaries.append((section_index, {"summary": sections[section_index]}))  # Fallback to original text

        # Sort summaries by the original section index to maintain order
        summaries.sort(key=lambda x: x[0])
        return [summary for _, summary in summaries]

#######################################################
# Deprecated strategies
#######################################################

class _JsonCssExtractionStrategy(ExtractionStrategy):
    def __init__(self, schema: Dict[str, Any], **kwargs):
        kwargs['input_format'] = 'html'  # Force HTML input
        super().__init__(**kwargs)
        self.schema = schema

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        soup = BeautifulSoup(html, 'html.parser')
        base_elements = soup.select(self.schema['baseSelector'])

        results = []
        for element in base_elements:
            # Extract base element attributes first
            item = {}
            if 'baseFields' in self.schema:
                for field in self.schema['baseFields']:
                    value = self._extract_single_field(element, field)
                    if value is not None:
                        item[field['name']] = value

            # Then extract child fields
            field_data = self._extract_item(element, self.schema['fields'])
            item.update(field_data)

            results.append(item)

        return results

    def _extract_field(self, element, field):
        try:
            if field['type'] == 'nested':
                nested_element = element.select_one(field['selector'])
                return self._extract_item(nested_element, field['fields']) if nested_element else {}

            if field['type'] == 'list':
                elements = element.select(field['selector'])
                return [self._extract_list_item(el, field['fields']) for el in elements]

            if field['type'] == 'nested_list':
                elements = element.select(field['selector'])
                return [self._extract_item(el, field['fields']) for el in elements]

            return self._extract_single_field(element, field)
        except Exception as e:
            if self.verbose:
                print(f"Error extracting field {field['name']}: {str(e)}")
            return field.get('default')

    def _extract_list_item(self, element, fields):
        item = {}
        for field in fields:
            value = self._extract_single_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _extract_single_field(self, element, field):
        if 'selector' in field:
            selected = element.select_one(field['selector'])
            if not selected:
                return field.get('default')
        else:
            selected = element

        value = None
        if field['type'] == 'text':
            value = selected.get_text(strip=True)
        elif field['type'] == 'attribute':
            value = selected.get(field['attribute'])
        elif field['type'] == 'html':
            value = str(selected)
        elif field['type'] == 'regex':
            text = selected.get_text(strip=True)
            match = re.search(field['pattern'], text)
            value = match.group(1) if match else None

        if 'transform' in field:
            value = self._apply_transform(value, field['transform'])

        return value if value is not None else field.get('default')

    def _extract_item(self, element, fields):
        item = {}
        for field in fields:
            if field['type'] == 'computed':
                value = self._compute_field(item, field)
            else:
                value = self._extract_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _apply_transform(self, value, transform):
        if transform == 'lowercase':
            return value.lower()
        elif transform == 'uppercase':
            return value.upper()
        elif transform == 'strip':
            return value.strip()
        return value

    def _compute_field(self, item, field):
        try:
            if 'expression' in field:
                return eval(field['expression'], {}, item)
            elif 'function' in field:
                return field['function'](item)
        except Exception as e:
            if self.verbose:
                print(f"Error computing field {field['name']}: {str(e)}")
            return field.get('default')

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        combined_html = self.DEL.join(sections)
        return self.extract(url, combined_html, **kwargs)

class _JsonXPathExtractionStrategy(ExtractionStrategy):
    def __init__(self, schema: Dict[str, Any], **kwargs):
        kwargs['input_format'] = 'html'  # Force HTML input
        super().__init__(**kwargs)
        self.schema = schema

    def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
        tree = html.fromstring(html_content)
        base_xpath = self.schema['baseSelector']
        base_elements = tree.xpath(base_xpath)

        results = []
        for element in base_elements:
            # Extract base element attributes first
            item = {}
            if 'baseFields' in self.schema:
                for field in self.schema['baseFields']:
                    value = self._extract_single_field(element, field)
                    if value is not None:
                        item[field['name']] = value

            # Then extract child fields
|
1323 |
+
field_data = self._extract_item(element, self.schema['fields'])
|
1324 |
+
item.update(field_data)
|
1325 |
+
|
1326 |
+
results.append(item)
|
1327 |
+
|
1328 |
+
return results
|
1329 |
+
|
1330 |
+
def _css_to_xpath(self, css_selector: str) -> str:
|
1331 |
+
"""Convert CSS selector to XPath if needed"""
|
1332 |
+
if '/' in css_selector: # Already an XPath
|
1333 |
+
return css_selector
|
1334 |
+
else:
|
1335 |
+
# Fallback to basic conversion for common cases
|
1336 |
+
return self._basic_css_to_xpath(css_selector)
|
1337 |
+
|
1338 |
+
def _basic_css_to_xpath(self, css_selector: str) -> str:
|
1339 |
+
"""Basic CSS to XPath conversion for common cases"""
|
1340 |
+
# Handle basic cases
|
1341 |
+
if ' > ' in css_selector:
|
1342 |
+
parts = css_selector.split(' > ')
|
1343 |
+
return '//' + '/'.join(parts)
|
1344 |
+
if ' ' in css_selector:
|
1345 |
+
parts = css_selector.split(' ')
|
1346 |
+
return '//' + '//'.join(parts)
|
1347 |
+
return '//' + css_selector
|
1348 |
+
|
1349 |
+
def _extract_field(self, element, field):
|
1350 |
+
try:
|
1351 |
+
if field['type'] == 'nested':
|
1352 |
+
xpath = self._css_to_xpath(field['selector'])
|
1353 |
+
nested_element = element.xpath(xpath)[0] if element.xpath(xpath) else None
|
1354 |
+
return self._extract_item(nested_element, field['fields']) if nested_element is not None else {}
|
1355 |
+
|
1356 |
+
if field['type'] == 'list':
|
1357 |
+
xpath = self._css_to_xpath(field['selector'])
|
1358 |
+
elements = element.xpath(xpath)
|
1359 |
+
return [self._extract_list_item(el, field['fields']) for el in elements]
|
1360 |
+
|
1361 |
+
if field['type'] == 'nested_list':
|
1362 |
+
xpath = self._css_to_xpath(field['selector'])
|
1363 |
+
elements = element.xpath(xpath)
|
1364 |
+
return [self._extract_item(el, field['fields']) for el in elements]
|
1365 |
+
|
1366 |
+
return self._extract_single_field(element, field)
|
1367 |
+
except Exception as e:
|
1368 |
+
if self.verbose:
|
1369 |
+
print(f"Error extracting field {field['name']}: {str(e)}")
|
1370 |
+
return field.get('default')
|
1371 |
+
|
1372 |
+
def _extract_list_item(self, element, fields):
|
1373 |
+
item = {}
|
1374 |
+
for field in fields:
|
1375 |
+
value = self._extract_single_field(element, field)
|
1376 |
+
if value is not None:
|
1377 |
+
item[field['name']] = value
|
1378 |
+
return item
|
1379 |
+
|
1380 |
+
def _extract_single_field(self, element, field):
|
1381 |
+
if 'selector' in field:
|
1382 |
+
xpath = self._css_to_xpath(field['selector'])
|
1383 |
+
selected = element.xpath(xpath)
|
1384 |
+
if not selected:
|
1385 |
+
return field.get('default')
|
1386 |
+
selected = selected[0]
|
1387 |
+
else:
|
1388 |
+
selected = element
|
1389 |
+
|
1390 |
+
value = None
|
1391 |
+
if field['type'] == 'text':
|
1392 |
+
value = ''.join(selected.xpath('.//text()')).strip()
|
1393 |
+
elif field['type'] == 'attribute':
|
1394 |
+
value = selected.get(field['attribute'])
|
1395 |
+
elif field['type'] == 'html':
|
1396 |
+
value = etree.tostring(selected, encoding='unicode')
|
1397 |
+
elif field['type'] == 'regex':
|
1398 |
+
text = ''.join(selected.xpath('.//text()')).strip()
|
1399 |
+
match = re.search(field['pattern'], text)
|
1400 |
+
value = match.group(1) if match else None
|
1401 |
+
|
1402 |
+
if 'transform' in field:
|
1403 |
+
value = self._apply_transform(value, field['transform'])
|
1404 |
+
|
1405 |
+
return value if value is not None else field.get('default')
|
1406 |
+
|
1407 |
+
def _extract_item(self, element, fields):
|
1408 |
+
item = {}
|
1409 |
+
for field in fields:
|
1410 |
+
if field['type'] == 'computed':
|
1411 |
+
value = self._compute_field(item, field)
|
1412 |
+
else:
|
1413 |
+
value = self._extract_field(element, field)
|
1414 |
+
if value is not None:
|
1415 |
+
item[field['name']] = value
|
1416 |
+
return item
|
1417 |
+
|
1418 |
+
def _apply_transform(self, value, transform):
|
1419 |
+
if transform == 'lowercase':
|
1420 |
+
return value.lower()
|
1421 |
+
elif transform == 'uppercase':
|
1422 |
+
return value.upper()
|
1423 |
+
elif transform == 'strip':
|
1424 |
+
return value.strip()
|
1425 |
+
return value
|
1426 |
+
|
1427 |
+
def _compute_field(self, item, field):
|
1428 |
+
try:
|
1429 |
+
if 'expression' in field:
|
1430 |
+
return eval(field['expression'], {}, item)
|
1431 |
+
elif 'function' in field:
|
1432 |
+
return field['function'](item)
|
1433 |
+
except Exception as e:
|
1434 |
+
if self.verbose:
|
1435 |
+
print(f"Error computing field {field['name']}: {str(e)}")
|
1436 |
+
return field.get('default')
|
1437 |
+
|
1438 |
+
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
|
1439 |
+
combined_html = self.DEL.join(sections)
|
1440 |
+
return self.extract(url, combined_html, **kwargs)
|
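For reference, this is the shape of schema these (now deprecated) CSS/XPath strategies consume. The snippet below is a sketch only: the selectors, field names, and values are invented for illustration, but each field type maps onto a handler shown above (`text`, `attribute`, `regex`, `list`, `computed`, plus optional `transform` and `default` keys). The same schema shape is reused unchanged by the non-deprecated `JsonCssExtractionStrategy` introduced later in this diff.

```python
# Illustrative schema for the JSON CSS/XPath extraction strategies.
schema = {
    "baseSelector": "div.product",          # repeating element to iterate over
    "baseFields": [
        {"name": "id", "type": "attribute", "attribute": "data-id"},
    ],
    "fields": [
        {"name": "title", "type": "text", "selector": "h2", "transform": "strip"},
        {"name": "price", "type": "regex", "selector": ".price", "pattern": r"(\d+\.\d+)"},
        {"name": "tags", "type": "list", "selector": "span.tag",
         "fields": [{"name": "label", "type": "text"}]},
        # Computed fields run after the fields above, so "title" is available here.
        {"name": "shout", "type": "computed", "expression": "title.upper()"},
    ],
}
```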
crawl4ai/extraction_strategy.py
ADDED
@@ -0,0 +1,1052 @@
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
from typing import Any, List, Dict, Optional, Union
|
3 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
4 |
+
import json, time, os
|
5 |
+
# from optimum.intel import IPEXModel
|
6 |
+
from .prompts import *
|
7 |
+
from .config import *
|
8 |
+
from .utils import *
|
9 |
+
from .models import *
|
10 |
+
from functools import partial
|
11 |
+
from .model_loader import *
|
12 |
+
import math
|
13 |
+
import numpy as np
|
14 |
+
import re
|
15 |
+
from bs4 import BeautifulSoup
|
16 |
+
from lxml import html, etree
|
17 |
+
from dataclasses import dataclass
|
18 |
+
|
19 |
+
class ExtractionStrategy(ABC):
|
20 |
+
"""
|
21 |
+
Abstract base class for all extraction strategies.
|
22 |
+
"""
|
23 |
+
|
24 |
+
def __init__(self, input_format: str = "markdown", **kwargs):
|
25 |
+
"""
|
26 |
+
Initialize the extraction strategy.
|
27 |
+
|
28 |
+
Args:
|
29 |
+
input_format: Content format to use for extraction.
|
30 |
+
Options: "markdown" (default), "html", "fit_markdown"
|
31 |
+
**kwargs: Additional keyword arguments
|
32 |
+
"""
|
33 |
+
self.input_format = input_format
|
34 |
+
self.DEL = "<|DEL|>"
|
35 |
+
self.name = self.__class__.__name__
|
36 |
+
self.verbose = kwargs.get("verbose", False)
|
37 |
+
|
38 |
+
@abstractmethod
|
39 |
+
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
|
40 |
+
"""
|
41 |
+
Extract meaningful blocks or chunks from the given HTML.
|
42 |
+
|
43 |
+
:param url: The URL of the webpage.
|
44 |
+
:param html: The HTML content of the webpage.
|
45 |
+
:return: A list of extracted blocks or chunks.
|
46 |
+
"""
|
47 |
+
pass
|
48 |
+
|
49 |
+
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
|
50 |
+
"""
|
51 |
+
Process sections of text in parallel by default.
|
52 |
+
|
53 |
+
:param url: The URL of the webpage.
|
54 |
+
:param sections: List of sections (strings) to process.
|
55 |
+
:return: A list of processed JSON blocks.
|
56 |
+
"""
|
57 |
+
extracted_content = []
|
58 |
+
with ThreadPoolExecutor() as executor:
|
59 |
+
futures = [executor.submit(self.extract, url, section, **kwargs) for section in sections]
|
60 |
+
for future in as_completed(futures):
|
61 |
+
extracted_content.extend(future.result())
|
62 |
+
return extracted_content
|
63 |
+
|
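To make the base-class contract concrete, here is a minimal sketch of a custom strategy: only `extract` must be implemented, and the inherited `run` fans it out over the sections in a thread pool. The word-count logic is a toy chosen so the example has no external dependencies; it is not part of the library.

```python
from typing import Any, Dict, List

from crawl4ai.extraction_strategy import ExtractionStrategy


class WordCountStrategy(ExtractionStrategy):
    """Toy strategy: one block per section, annotated with its word count."""

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        return [{"index": 0, "content": html, "word_count": len(html.split())}]


# The inherited run() submits extract() for each section to a ThreadPoolExecutor.
strategy = WordCountStrategy(verbose=True)
blocks = strategy.run("https://example.com", ["first chunk of text", "second chunk"])
print(blocks)
```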
64 |
+
class NoExtractionStrategy(ExtractionStrategy):
|
65 |
+
"""
|
66 |
+
A strategy that does not extract any meaningful content from the HTML. It simply returns the entire HTML as a single block.
|
67 |
+
"""
|
68 |
+
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
|
69 |
+
"""
|
70 |
+
Extract meaningful blocks or chunks from the given HTML.
|
71 |
+
"""
|
72 |
+
return [{"index": 0, "content": html}]
|
73 |
+
|
74 |
+
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
|
75 |
+
return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]
|
76 |
+
|
77 |
+
#######################################################
|
78 |
+
# Strategies using LLM-based extraction for text data #
|
79 |
+
#######################################################
|
80 |
+
class LLMExtractionStrategy(ExtractionStrategy):
|
81 |
+
"""
|
82 |
+
A strategy that uses an LLM to extract meaningful content from the HTML.
|
83 |
+
|
84 |
+
Attributes:
|
85 |
+
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
|
86 |
+
api_token: The API token for the provider.
|
87 |
+
instruction: The instruction to use for the LLM model.
|
88 |
+
schema: Pydantic model schema for structured data.
|
89 |
+
extraction_type: "block" or "schema".
|
90 |
+
chunk_token_threshold: Maximum tokens per chunk.
|
91 |
+
overlap_rate: Overlap between chunks.
|
92 |
+
word_token_rate: Word to token conversion rate.
|
93 |
+
apply_chunking: Whether to apply chunking.
|
94 |
+
base_url: The base URL for the API request.
|
95 |
+
api_base: The base URL for the API request.
|
96 |
+
extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
|
97 |
+
verbose: Whether to print verbose output.
|
98 |
+
usages: List of individual token usages.
|
99 |
+
total_usage: Accumulated token usage.
|
100 |
+
"""
|
101 |
+
|
102 |
+
def __init__(self,
|
103 |
+
provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None,
|
104 |
+
instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs):
|
105 |
+
"""
|
106 |
+
Initialize the strategy with clustering parameters.
|
107 |
+
|
108 |
+
Args:
|
109 |
+
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
|
110 |
+
api_token: The API token for the provider.
|
111 |
+
instruction: The instruction to use for the LLM model.
|
112 |
+
schema: Pydantic model schema for structured data.
|
113 |
+
extraction_type: "block" or "schema".
|
114 |
+
chunk_token_threshold: Maximum tokens per chunk.
|
115 |
+
overlap_rate: Overlap between chunks.
|
116 |
+
word_token_rate: Word to token conversion rate.
|
117 |
+
apply_chunking: Whether to apply chunking.
|
118 |
+
base_url: The base URL for the API request.
|
119 |
+
api_base: The base URL for the API request.
|
120 |
+
extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
|
121 |
+
verbose: Whether to print verbose output.
|
122 |
+
usages: List of individual token usages.
|
123 |
+
total_usage: Accumulated token usage.
|
124 |
+
|
125 |
+
"""
|
126 |
+
super().__init__(**kwargs)
|
127 |
+
self.provider = provider
|
128 |
+
self.api_token = api_token or PROVIDER_MODELS.get(provider, "no-token") or os.getenv("OPENAI_API_KEY")
|
129 |
+
self.instruction = instruction
|
130 |
+
self.extract_type = extraction_type
|
131 |
+
self.schema = schema
|
132 |
+
if schema:
|
133 |
+
self.extract_type = "schema"
|
134 |
+
|
135 |
+
self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD)
|
136 |
+
self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
|
137 |
+
self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
|
138 |
+
self.apply_chunking = kwargs.get("apply_chunking", True)
|
139 |
+
self.base_url = kwargs.get("base_url", None)
|
140 |
+
self.api_base = kwargs.get("api_base", kwargs.get("base_url", None))
|
141 |
+
self.extra_args = kwargs.get("extra_args", {})
|
142 |
+
if not self.apply_chunking:
|
143 |
+
self.chunk_token_threshold = 1e9
|
144 |
+
|
145 |
+
self.verbose = kwargs.get("verbose", False)
|
146 |
+
self.usages = [] # Store individual usages
|
147 |
+
self.total_usage = TokenUsage() # Accumulated usage
|
148 |
+
|
149 |
+
if not self.api_token:
|
150 |
+
raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.")
|
151 |
+
|
152 |
+
|
153 |
+
def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]:
|
154 |
+
"""
|
155 |
+
Extract meaningful blocks or chunks from the given HTML using an LLM.
|
156 |
+
|
157 |
+
How it works:
|
158 |
+
1. Construct a prompt with variables.
|
159 |
+
2. Make a request to the LLM using the prompt.
|
160 |
+
3. Parse the response and extract blocks or chunks.
|
161 |
+
|
162 |
+
Args:
|
163 |
+
url: The URL of the webpage.
|
164 |
+
ix: Index of the block.
|
165 |
+
html: The HTML content of the webpage.
|
166 |
+
|
167 |
+
Returns:
|
168 |
+
A list of extracted blocks or chunks.
|
169 |
+
"""
|
170 |
+
if self.verbose:
|
171 |
+
# print("[LOG] Extracting blocks from URL:", url)
|
172 |
+
print(f"[LOG] Call LLM for {url} - block index: {ix}")
|
173 |
+
|
174 |
+
variable_values = {
|
175 |
+
"URL": url,
|
176 |
+
"HTML": escape_json_string(sanitize_html(html)),
|
177 |
+
}
|
178 |
+
|
179 |
+
prompt_with_variables = PROMPT_EXTRACT_BLOCKS
|
180 |
+
if self.instruction:
|
181 |
+
variable_values["REQUEST"] = self.instruction
|
182 |
+
prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
|
183 |
+
|
184 |
+
if self.extract_type == "schema" and self.schema:
|
185 |
+
variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
|
186 |
+
prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
|
187 |
+
|
188 |
+
for variable in variable_values:
|
189 |
+
prompt_with_variables = prompt_with_variables.replace(
|
190 |
+
"{" + variable + "}", variable_values[variable]
|
191 |
+
)
|
192 |
+
|
193 |
+
response = perform_completion_with_backoff(
|
194 |
+
self.provider,
|
195 |
+
prompt_with_variables,
|
196 |
+
self.api_token,
|
197 |
+
base_url=self.api_base or self.base_url,
|
198 |
+
extra_args = self.extra_args
|
199 |
+
) # , json_response=self.extract_type == "schema")
|
200 |
+
# Track usage
|
201 |
+
usage = TokenUsage(
|
202 |
+
completion_tokens=response.usage.completion_tokens,
|
203 |
+
prompt_tokens=response.usage.prompt_tokens,
|
204 |
+
total_tokens=response.usage.total_tokens,
|
205 |
+
completion_tokens_details=response.usage.completion_tokens_details.__dict__ if response.usage.completion_tokens_details else {},
|
206 |
+
prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ if response.usage.prompt_tokens_details else {}
|
207 |
+
)
|
208 |
+
self.usages.append(usage)
|
209 |
+
|
210 |
+
# Update totals
|
211 |
+
self.total_usage.completion_tokens += usage.completion_tokens
|
212 |
+
self.total_usage.prompt_tokens += usage.prompt_tokens
|
213 |
+
self.total_usage.total_tokens += usage.total_tokens
|
214 |
+
|
215 |
+
try:
|
216 |
+
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
|
217 |
+
blocks = json.loads(blocks)
|
218 |
+
for block in blocks:
|
219 |
+
block['error'] = False
|
220 |
+
except Exception as e:
|
221 |
+
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
|
222 |
+
blocks = parsed
|
223 |
+
if unparsed:
|
224 |
+
blocks.append({
|
225 |
+
"index": 0,
|
226 |
+
"error": True,
|
227 |
+
"tags": ["error"],
|
228 |
+
"content": unparsed
|
229 |
+
})
|
230 |
+
|
231 |
+
if self.verbose:
|
232 |
+
print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix)
|
233 |
+
return blocks
|
234 |
+
|
235 |
+
def _merge(self, documents, chunk_token_threshold, overlap):
|
236 |
+
"""
|
237 |
+
Merge documents into sections based on chunk_token_threshold and overlap.
|
238 |
+
"""
|
239 |
+
chunks = []
|
240 |
+
sections = []
|
241 |
+
total_tokens = 0
|
242 |
+
|
243 |
+
# Calculate the total tokens across all documents
|
244 |
+
for document in documents:
|
245 |
+
total_tokens += len(document.split(' ')) * self.word_token_rate
|
246 |
+
|
247 |
+
# Calculate the number of sections needed
|
248 |
+
num_sections = math.floor(total_tokens / chunk_token_threshold)
|
249 |
+
if num_sections < 1:
|
250 |
+
num_sections = 1 # Ensure there is at least one section
|
251 |
+
adjusted_chunk_threshold = total_tokens / num_sections
|
252 |
+
|
253 |
+
total_token_so_far = 0
|
254 |
+
current_chunk = []
|
255 |
+
|
256 |
+
for document in documents:
|
257 |
+
tokens = document.split(' ')
|
258 |
+
token_count = len(tokens) * self.word_token_rate
|
259 |
+
|
260 |
+
if total_token_so_far + token_count <= adjusted_chunk_threshold:
|
261 |
+
current_chunk.extend(tokens)
|
262 |
+
total_token_so_far += token_count
|
263 |
+
else:
|
264 |
+
# Ensure to handle the last section properly
|
265 |
+
if len(sections) == num_sections - 1:
|
266 |
+
current_chunk.extend(tokens)
|
267 |
+
continue
|
268 |
+
|
269 |
+
# Add overlap if specified
|
270 |
+
if overlap > 0 and current_chunk:
|
271 |
+
overlap_tokens = current_chunk[-overlap:]
|
272 |
+
current_chunk.extend(overlap_tokens)
|
273 |
+
|
274 |
+
sections.append(' '.join(current_chunk))
|
275 |
+
current_chunk = tokens
|
276 |
+
total_token_so_far = token_count
|
277 |
+
|
278 |
+
# Add the last chunk
|
279 |
+
if current_chunk:
|
280 |
+
sections.append(' '.join(current_chunk))
|
281 |
+
|
282 |
+
return sections
|
283 |
+
|
284 |
+
|
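The section count in `_merge` is driven by a floor division, which spreads tokens evenly rather than leaving a small trailing chunk. A quick worked example (numbers chosen only for illustration):

```python
import math

total_tokens = 5500            # sum over documents of word count * word_token_rate
chunk_token_threshold = 2000

num_sections = max(1, math.floor(total_tokens / chunk_token_threshold))  # -> 2
adjusted_chunk_threshold = total_tokens / num_sections                   # -> 2750.0
# Instead of two 2000-token sections plus a 1500-token remainder, the documents
# are packed into two sections of roughly 2750 tokens each.
```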
285 |
+
def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
|
286 |
+
"""
|
287 |
+
Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy.
|
288 |
+
|
289 |
+
Args:
|
290 |
+
url: The URL of the webpage.
|
291 |
+
sections: List of sections (strings) to process.
|
292 |
+
|
293 |
+
Returns:
|
294 |
+
A list of extracted blocks or chunks.
|
295 |
+
"""
|
296 |
+
|
297 |
+
merged_sections = self._merge(
|
298 |
+
sections, self.chunk_token_threshold,
|
299 |
+
overlap= int(self.chunk_token_threshold * self.overlap_rate)
|
300 |
+
)
|
301 |
+
extracted_content = []
|
302 |
+
if self.provider.startswith("groq/"):
|
303 |
+
# Sequential processing with a delay
|
304 |
+
for ix, section in enumerate(merged_sections):
|
305 |
+
extract_func = partial(self.extract, url)
|
306 |
+
extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
|
307 |
+
time.sleep(0.5) # 500 ms delay between each processing
|
308 |
+
else:
|
309 |
+
# Parallel processing using ThreadPoolExecutor
|
310 |
+
# extract_func = partial(self.extract, url)
|
311 |
+
# for ix, section in enumerate(merged_sections):
|
312 |
+
# extracted_content.append(extract_func(ix, section))
|
313 |
+
|
314 |
+
with ThreadPoolExecutor(max_workers=4) as executor:
|
315 |
+
extract_func = partial(self.extract, url)
|
316 |
+
futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)]
|
317 |
+
|
318 |
+
for future in as_completed(futures):
|
319 |
+
try:
|
320 |
+
extracted_content.extend(future.result())
|
321 |
+
except Exception as e:
|
322 |
+
if self.verbose:
|
323 |
+
print(f"Error in thread execution: {e}")
|
324 |
+
# Add error information to extracted_content
|
325 |
+
extracted_content.append({
|
326 |
+
"index": 0,
|
327 |
+
"error": True,
|
328 |
+
"tags": ["error"],
|
329 |
+
"content": str(e)
|
330 |
+
})
|
331 |
+
|
332 |
+
|
333 |
+
return extracted_content
|
334 |
+
|
335 |
+
|
336 |
+
def show_usage(self) -> None:
|
337 |
+
"""Print a detailed token usage report showing total and per-request usage."""
|
338 |
+
print("\n=== Token Usage Summary ===")
|
339 |
+
print(f"{'Type':<15} {'Count':>12}")
|
340 |
+
print("-" * 30)
|
341 |
+
print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}")
|
342 |
+
print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}")
|
343 |
+
print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}")
|
344 |
+
|
345 |
+
print("\n=== Usage History ===")
|
346 |
+
print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}")
|
347 |
+
print("-" * 48)
|
348 |
+
for i, usage in enumerate(self.usages, 1):
|
349 |
+
print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} {usage.total_tokens:>12,}")
|
350 |
+
|
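A minimal usage sketch for this class, assuming an OpenAI-compatible provider. The provider string, API key, instruction, and sections below are placeholders; in practice the sections come from the crawler, and the constructor raises a `ValueError` if no API token can be resolved.

```python
from crawl4ai.extraction_strategy import LLMExtractionStrategy

strategy = LLMExtractionStrategy(
    provider="openai/gpt-4o-mini",        # <provider_name>/<model_name>, placeholder
    api_token="<your-api-key>",           # or rely on the OPENAI_API_KEY env var
    instruction="List every product name and price mentioned on the page.",
    chunk_token_threshold=2000,
    overlap_rate=0.1,
    verbose=True,
)

sections = ["...markdown of the crawled page, split into sections..."]
blocks = strategy.run("https://example.com/products", sections)  # makes real LLM calls
strategy.show_usage()  # prints the token usage summary defined above
```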
351 |
+
#######################################################
|
352 |
+
# Strategies using clustering for text data extraction #
|
353 |
+
#######################################################
|
354 |
+
|
355 |
+
class CosineStrategy(ExtractionStrategy):
|
356 |
+
"""
|
357 |
+
Extract meaningful blocks or chunks from the given HTML using cosine similarity.
|
358 |
+
|
359 |
+
How it works:
|
360 |
+
1. Pre-filter documents using embeddings and semantic_filter.
|
361 |
+
2. Perform clustering using cosine similarity.
|
362 |
+
3. Organize texts by their cluster labels, retaining order.
|
363 |
+
4. Filter clusters by word count.
|
364 |
+
5. Extract meaningful blocks or chunks from the filtered clusters.
|
365 |
+
|
366 |
+
Attributes:
|
367 |
+
semantic_filter (str): A keyword filter for document filtering.
|
368 |
+
word_count_threshold (int): Minimum number of words per cluster.
|
369 |
+
max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
|
370 |
+
linkage_method (str): The linkage method for hierarchical clustering.
|
371 |
+
top_k (int): Number of top categories to extract.
|
372 |
+
model_name (str): The name of the sentence-transformers model.
|
373 |
+
sim_threshold (float): The similarity threshold for clustering.
|
374 |
+
"""
|
375 |
+
def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs):
|
376 |
+
"""
|
377 |
+
Initialize the strategy with clustering parameters.
|
378 |
+
|
379 |
+
Args:
|
380 |
+
semantic_filter (str): A keyword filter for document filtering.
|
381 |
+
word_count_threshold (int): Minimum number of words per cluster.
|
382 |
+
max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
|
383 |
+
linkage_method (str): The linkage method for hierarchical clustering.
|
384 |
+
top_k (int): Number of top categories to extract.
|
385 |
+
"""
|
386 |
+
super().__init__(**kwargs)
|
387 |
+
|
388 |
+
import numpy as np
|
389 |
+
|
390 |
+
self.semantic_filter = semantic_filter
|
391 |
+
self.word_count_threshold = word_count_threshold
|
392 |
+
self.max_dist = max_dist
|
393 |
+
self.linkage_method = linkage_method
|
394 |
+
self.top_k = top_k
|
395 |
+
self.sim_threshold = sim_threshold
|
396 |
+
self.timer = time.time()
|
397 |
+
self.verbose = kwargs.get("verbose", False)
|
398 |
+
|
399 |
+
self.buffer_embeddings = np.array([])
|
400 |
+
self.get_embedding_method = "direct"
|
401 |
+
|
402 |
+
self.device = get_device()
|
403 |
+
# import torch
|
404 |
+
# self.device = torch.device('cpu')
|
405 |
+
|
406 |
+
self.default_batch_size = calculate_batch_size(self.device)
|
407 |
+
|
408 |
+
if self.verbose:
|
409 |
+
print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
|
410 |
+
|
411 |
+
# if False and self.device.type == "cpu":
|
412 |
+
# self.model = load_onnx_all_MiniLM_l6_v2()
|
413 |
+
# self.tokenizer = self.model.tokenizer
|
414 |
+
# self.get_embedding_method = "direct"
|
415 |
+
# else:
|
416 |
+
|
417 |
+
self.tokenizer, self.model = load_HF_embedding_model(model_name)
|
418 |
+
self.model.to(self.device)
|
419 |
+
self.model.eval()
|
420 |
+
|
421 |
+
self.get_embedding_method = "batch"
|
422 |
+
|
423 |
+
self.buffer_embeddings = np.array([])
|
424 |
+
|
425 |
+
# if model_name == "bert-base-uncased":
|
426 |
+
# self.tokenizer, self.model = load_bert_base_uncased()
|
427 |
+
# self.model.eval() # Ensure the model is in evaluation mode
|
428 |
+
# self.get_embedding_method = "batch"
|
429 |
+
# elif model_name == "BAAI/bge-small-en-v1.5":
|
430 |
+
# self.tokenizer, self.model = load_bge_small_en_v1_5()
|
431 |
+
# self.model.eval() # Ensure the model is in evaluation mode
|
432 |
+
# self.get_embedding_method = "batch"
|
433 |
+
# elif model_name == "sentence-transformers/all-MiniLM-L6-v2":
|
434 |
+
# self.model = load_onnx_all_MiniLM_l6_v2()
|
435 |
+
# self.tokenizer = self.model.tokenizer
|
436 |
+
# self.get_embedding_method = "direct"
|
437 |
+
|
438 |
+
|
439 |
+
if self.verbose:
|
440 |
+
print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.")
|
441 |
+
|
442 |
+
self.nlp, _ = load_text_multilabel_classifier()
|
443 |
+
# self.default_batch_size = 16 if self.device.type == 'cpu' else 64
|
444 |
+
|
445 |
+
if self.verbose:
|
446 |
+
print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
|
447 |
+
|
448 |
+
def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, at_least_k: int = 20) -> List[str]:
|
449 |
+
"""
|
450 |
+
Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
|
451 |
+
|
452 |
+
Args:
|
453 |
+
documents (List[str]): A list of document texts.
|
454 |
+
semantic_filter (str): A keyword filter for document filtering.
|
455 |
+
at_least_k (int): The minimum number of documents to return.
|
456 |
+
|
457 |
+
Returns:
|
458 |
+
List[str]: A list of filtered and sorted document texts.
|
459 |
+
"""
|
460 |
+
|
461 |
+
if not semantic_filter:
|
462 |
+
return documents
|
463 |
+
|
464 |
+
if len(documents) < at_least_k:
|
465 |
+
at_least_k = len(documents) // 2
|
466 |
+
|
467 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
468 |
+
|
469 |
+
# Compute embedding for the keyword filter
|
470 |
+
query_embedding = self.get_embeddings([semantic_filter])[0]
|
471 |
+
|
472 |
+
# Compute embeddings for the documents
|
473 |
+
document_embeddings = self.get_embeddings(documents)
|
474 |
+
|
475 |
+
# Calculate cosine similarity between the query embedding and document embeddings
|
476 |
+
similarities = cosine_similarity([query_embedding], document_embeddings).flatten()
|
477 |
+
|
478 |
+
# Filter documents based on the similarity threshold
|
479 |
+
filtered_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim >= self.sim_threshold]
|
480 |
+
|
481 |
+
# If the number of filtered documents is less than at_least_k, sort remaining documents by similarity
|
482 |
+
if len(filtered_docs) < at_least_k:
|
483 |
+
remaining_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim < self.sim_threshold]
|
484 |
+
remaining_docs.sort(key=lambda x: x[1], reverse=True)
|
485 |
+
filtered_docs.extend(remaining_docs[:at_least_k - len(filtered_docs)])
|
486 |
+
|
487 |
+
# Extract the document texts from the tuples
|
488 |
+
filtered_docs = [doc for doc, _ in filtered_docs]
|
489 |
+
|
490 |
+
return filtered_docs[:at_least_k]
|
491 |
+
|
492 |
+
def get_embeddings(self, sentences: List[str], batch_size=None, bypass_buffer=False):
|
493 |
+
"""
|
494 |
+
Get BERT embeddings for a list of sentences.
|
495 |
+
|
496 |
+
Args:
|
497 |
+
sentences (List[str]): A list of text chunks (sentences).
|
498 |
+
|
499 |
+
Returns:
|
500 |
+
NumPy array of embeddings.
|
501 |
+
"""
|
502 |
+
# if self.buffer_embeddings.any() and not bypass_buffer:
|
503 |
+
# return self.buffer_embeddings
|
504 |
+
|
505 |
+
if self.device.type in [ "cpu", "gpu", "cuda", "mps"]:
|
506 |
+
import torch
|
507 |
+
# Tokenize sentences and convert to tensor
|
508 |
+
if batch_size is None:
|
509 |
+
batch_size = self.default_batch_size
|
510 |
+
|
511 |
+
all_embeddings = []
|
512 |
+
for i in range(0, len(sentences), batch_size):
|
513 |
+
batch_sentences = sentences[i:i + batch_size]
|
514 |
+
encoded_input = self.tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')
|
515 |
+
encoded_input = {key: tensor.to(self.device) for key, tensor in encoded_input.items()}
|
516 |
+
|
517 |
+
# Ensure no gradients are calculated
|
518 |
+
with torch.no_grad():
|
519 |
+
model_output = self.model(**encoded_input)
|
520 |
+
|
521 |
+
# Get embeddings from the last hidden state (mean pooling)
|
522 |
+
embeddings = model_output.last_hidden_state.mean(dim=1).cpu().numpy()
|
523 |
+
all_embeddings.append(embeddings)
|
524 |
+
|
525 |
+
self.buffer_embeddings = np.vstack(all_embeddings)
|
526 |
+
elif self.device.type == "cpu":
|
527 |
+
# self.buffer_embeddings = self.model(sentences)
|
528 |
+
if batch_size is None:
|
529 |
+
batch_size = self.default_batch_size
|
530 |
+
|
531 |
+
all_embeddings = []
|
532 |
+
for i in range(0, len(sentences), batch_size):
|
533 |
+
batch_sentences = sentences[i:i + batch_size]
|
534 |
+
embeddings = self.model(batch_sentences)
|
535 |
+
all_embeddings.append(embeddings)
|
536 |
+
|
537 |
+
self.buffer_embeddings = np.vstack(all_embeddings)
|
538 |
+
return self.buffer_embeddings
|
539 |
+
|
540 |
+
def hierarchical_clustering(self, sentences: List[str], embeddings = None):
|
541 |
+
"""
|
542 |
+
Perform hierarchical clustering on sentences and return cluster labels.
|
543 |
+
|
544 |
+
Args:
|
545 |
+
sentences (List[str]): A list of text chunks (sentences).
|
546 |
+
|
547 |
+
Returns:
|
548 |
+
NumPy array of cluster labels.
|
549 |
+
"""
|
550 |
+
# Get embeddings
|
551 |
+
from scipy.cluster.hierarchy import linkage, fcluster
|
552 |
+
from scipy.spatial.distance import pdist
|
553 |
+
self.timer = time.time()
|
554 |
+
embeddings = self.get_embeddings(sentences, bypass_buffer=True)
|
555 |
+
# print(f"[LOG] 🚀 Embeddings computed in {time.time() - self.timer:.2f} seconds")
|
556 |
+
# Compute pairwise cosine distances
|
557 |
+
distance_matrix = pdist(embeddings, 'cosine')
|
558 |
+
# Perform agglomerative clustering respecting order
|
559 |
+
linked = linkage(distance_matrix, method=self.linkage_method)
|
560 |
+
# Form flat clusters
|
561 |
+
labels = fcluster(linked, self.max_dist, criterion='distance')
|
562 |
+
return labels
|
563 |
+
|
564 |
+
def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]) -> Dict[int, List[str]]:
|
565 |
+
"""
|
566 |
+
Filter clusters to remove those with a word count below the threshold.
|
567 |
+
|
568 |
+
Args:
|
569 |
+
clusters (Dict[int, List[str]]): Dictionary of clusters.
|
570 |
+
|
571 |
+
Returns:
|
572 |
+
Dict[int, List[str]]: Filtered dictionary of clusters.
|
573 |
+
"""
|
574 |
+
filtered_clusters = {}
|
575 |
+
for cluster_id, texts in clusters.items():
|
576 |
+
# Concatenate texts for analysis
|
577 |
+
full_text = " ".join(texts)
|
578 |
+
# Count words
|
579 |
+
word_count = len(full_text.split())
|
580 |
+
|
581 |
+
# Keep clusters with word count above the threshold
|
582 |
+
if word_count >= self.word_count_threshold:
|
583 |
+
filtered_clusters[cluster_id] = texts
|
584 |
+
|
585 |
+
return filtered_clusters
|
586 |
+
|
587 |
+
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
|
588 |
+
"""
|
589 |
+
Extract clusters from HTML content using hierarchical clustering.
|
590 |
+
|
591 |
+
Args:
|
592 |
+
url (str): The URL of the webpage.
|
593 |
+
html (str): The HTML content of the webpage.
|
594 |
+
|
595 |
+
Returns:
|
596 |
+
List[Dict[str, Any]]: A list of processed JSON blocks.
|
597 |
+
"""
|
598 |
+
# Assume `html` is a list of text chunks for this strategy
|
599 |
+
t = time.time()
|
600 |
+
text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed
|
601 |
+
|
602 |
+
# Pre-filter documents using embeddings and semantic_filter
|
603 |
+
text_chunks = self.filter_documents_embeddings(text_chunks, self.semantic_filter)
|
604 |
+
|
605 |
+
if not text_chunks:
|
606 |
+
return []
|
607 |
+
|
608 |
+
# Perform clustering
|
609 |
+
labels = self.hierarchical_clustering(text_chunks)
|
610 |
+
# print(f"[LOG] 🚀 Clustering done in {time.time() - t:.2f} seconds")
|
611 |
+
|
612 |
+
# Organize texts by their cluster labels, retaining order
|
613 |
+
t = time.time()
|
614 |
+
clusters = {}
|
615 |
+
for index, label in enumerate(labels):
|
616 |
+
clusters.setdefault(label, []).append(text_chunks[index])
|
617 |
+
|
618 |
+
# Filter clusters by word count
|
619 |
+
filtered_clusters = self.filter_clusters_by_word_count(clusters)
|
620 |
+
|
621 |
+
# Convert filtered clusters to a sorted list of dictionaries
|
622 |
+
cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
|
623 |
+
|
624 |
+
if self.verbose:
|
625 |
+
print(f"[LOG] 🚀 Assign tags using {self.device}")
|
626 |
+
|
627 |
+
if self.device.type in ["gpu", "cuda", "mps", "cpu"]:
|
628 |
+
labels = self.nlp([cluster['content'] for cluster in cluster_list])
|
629 |
+
|
630 |
+
for cluster, label in zip(cluster_list, labels):
|
631 |
+
cluster['tags'] = label
|
632 |
+
# elif self.device.type == "cpu":
|
633 |
+
# # Process the text with the loaded model
|
634 |
+
# texts = [cluster['content'] for cluster in cluster_list]
|
635 |
+
# # Batch process texts
|
636 |
+
# docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"])
|
637 |
+
|
638 |
+
# for doc, cluster in zip(docs, cluster_list):
|
639 |
+
# tok_k = self.top_k
|
640 |
+
# top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
641 |
+
# cluster['tags'] = [cat for cat, _ in top_categories]
|
642 |
+
|
643 |
+
# for cluster in cluster_list:
|
644 |
+
# doc = self.nlp(cluster['content'])
|
645 |
+
# tok_k = self.top_k
|
646 |
+
# top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
647 |
+
# cluster['tags'] = [cat for cat, _ in top_categories]
|
648 |
+
|
649 |
+
if self.verbose:
|
650 |
+
print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
|
651 |
+
|
652 |
+
return cluster_list
|
653 |
+
|
654 |
+
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
|
655 |
+
"""
|
656 |
+
Process sections using hierarchical clustering.
|
657 |
+
|
658 |
+
Args:
|
659 |
+
url (str): The URL of the webpage.
|
660 |
+
sections (List[str]): List of sections (strings) to process.
|
661 |
+
|
662 |
+
Returns:
|
663 |
+
"""
|
664 |
+
# This strategy processes all sections together
|
665 |
+
|
666 |
+
return self.extract(url, self.DEL.join(sections), **kwargs)
|
667 |
+
|
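A usage sketch for the clustering strategy. Note that constructing it loads the sentence-transformers embedding model and the multilabel classifier, so the first run downloads models and is comparatively slow; the URL and text chunks here are invented for illustration.

```python
from crawl4ai.extraction_strategy import CosineStrategy

strategy = CosineStrategy(
    semantic_filter="machine learning",   # pre-filter chunks by embedding similarity
    word_count_threshold=20,              # drop clusters with fewer words than this
    max_dist=0.2,                         # cophenetic distance cut-off for clustering
    top_k=3,
    verbose=True,
)

chunks = [
    "Transformers dominate modern NLP benchmarks.",
    "Gradient boosting is still strong on tabular data.",
    "Our cafeteria menu changes every Tuesday.",
]
clusters = strategy.run("https://example.com/blog", chunks)
for cluster in clusters:
    print(cluster["index"], cluster["tags"], cluster["content"][:60])
```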
668 |
+
#######################################################
|
669 |
+
# New extraction strategies for JSON-based extraction #
|
670 |
+
#######################################################
|
671 |
+
|
672 |
+
class JsonElementExtractionStrategy(ExtractionStrategy):
|
673 |
+
"""
|
674 |
+
Abstract base class for extracting structured JSON from HTML content.
|
675 |
+
|
676 |
+
How it works:
|
677 |
+
1. Parses HTML content using the `_parse_html` method.
|
678 |
+
2. Uses a schema to define base selectors, fields, and transformations.
|
679 |
+
3. Extracts data hierarchically, supporting nested fields and lists.
|
680 |
+
4. Handles computed fields with expressions or functions.
|
681 |
+
|
682 |
+
Attributes:
|
683 |
+
DEL (str): Delimiter used to combine HTML sections. Defaults to '\n'.
|
684 |
+
schema (Dict[str, Any]): The schema defining the extraction rules.
|
685 |
+
verbose (bool): Enables verbose logging for debugging purposes.
|
686 |
+
|
687 |
+
Methods:
|
688 |
+
extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content.
|
689 |
+
_extract_item(element, fields): Extracts fields from a single element.
|
690 |
+
_extract_single_field(element, field): Extracts a single field based on its type.
|
691 |
+
_apply_transform(value, transform): Applies a transformation to a value.
|
692 |
+
_compute_field(item, field): Computes a field value using an expression or function.
|
693 |
+
run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy.
|
694 |
+
|
695 |
+
Abstract Methods:
|
696 |
+
_parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml).
|
697 |
+
_get_base_elements(parsed_html, selector): Retrieves base elements using a selector.
|
698 |
+
_get_elements(element, selector): Retrieves child elements using a selector.
|
699 |
+
_get_element_text(element): Extracts text content from an element.
|
700 |
+
_get_element_html(element): Extracts raw HTML from an element.
|
701 |
+
_get_element_attribute(element, attribute): Extracts an attribute's value from an element.
|
702 |
+
"""
|
703 |
+
|
704 |
+
|
705 |
+
DEL = '\n'
|
706 |
+
|
707 |
+
def __init__(self, schema: Dict[str, Any], **kwargs):
|
708 |
+
"""
|
709 |
+
Initialize the JSON element extraction strategy with a schema.
|
710 |
+
|
711 |
+
Args:
|
712 |
+
schema (Dict[str, Any]): The schema defining the extraction rules.
|
713 |
+
"""
|
714 |
+
super().__init__(**kwargs)
|
715 |
+
self.schema = schema
|
716 |
+
self.verbose = kwargs.get('verbose', False)
|
717 |
+
|
718 |
+
def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
|
719 |
+
"""
|
720 |
+
Extract structured data from HTML content.
|
721 |
+
|
722 |
+
How it works:
|
723 |
+
1. Parses the HTML content using the `_parse_html` method.
|
724 |
+
2. Identifies base elements using the schema's base selector.
|
725 |
+
3. Extracts fields from each base element using `_extract_item`.
|
726 |
+
|
727 |
+
Args:
|
728 |
+
url (str): The URL of the page being processed.
|
729 |
+
html_content (str): The raw HTML content to parse and extract.
|
730 |
+
*q: Additional positional arguments.
|
731 |
+
**kwargs: Additional keyword arguments for custom extraction.
|
732 |
+
|
733 |
+
Returns:
|
734 |
+
List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary.
|
735 |
+
"""
|
736 |
+
|
737 |
+
parsed_html = self._parse_html(html_content)
|
738 |
+
base_elements = self._get_base_elements(parsed_html, self.schema['baseSelector'])
|
739 |
+
|
740 |
+
results = []
|
741 |
+
for element in base_elements:
|
742 |
+
# Extract base element attributes
|
743 |
+
item = {}
|
744 |
+
if 'baseFields' in self.schema:
|
745 |
+
for field in self.schema['baseFields']:
|
746 |
+
value = self._extract_single_field(element, field)
|
747 |
+
if value is not None:
|
748 |
+
item[field['name']] = value
|
749 |
+
|
750 |
+
# Extract child fields
|
751 |
+
field_data = self._extract_item(element, self.schema['fields'])
|
752 |
+
item.update(field_data)
|
753 |
+
|
754 |
+
if item:
|
755 |
+
results.append(item)
|
756 |
+
|
757 |
+
return results
|
758 |
+
|
759 |
+
@abstractmethod
|
760 |
+
def _parse_html(self, html_content: str):
|
761 |
+
"""Parse HTML content into appropriate format"""
|
762 |
+
pass
|
763 |
+
|
764 |
+
@abstractmethod
|
765 |
+
def _get_base_elements(self, parsed_html, selector: str):
|
766 |
+
"""Get all base elements using the selector"""
|
767 |
+
pass
|
768 |
+
|
769 |
+
@abstractmethod
|
770 |
+
def _get_elements(self, element, selector: str):
|
771 |
+
"""Get child elements using the selector"""
|
772 |
+
pass
|
773 |
+
|
774 |
+
def _extract_field(self, element, field):
|
775 |
+
try:
|
776 |
+
if field['type'] == 'nested':
|
777 |
+
nested_elements = self._get_elements(element, field['selector'])
|
778 |
+
nested_element = nested_elements[0] if nested_elements else None
|
779 |
+
return self._extract_item(nested_element, field['fields']) if nested_element else {}
|
780 |
+
|
781 |
+
if field['type'] == 'list':
|
782 |
+
elements = self._get_elements(element, field['selector'])
|
783 |
+
return [self._extract_list_item(el, field['fields']) for el in elements]
|
784 |
+
|
785 |
+
if field['type'] == 'nested_list':
|
786 |
+
elements = self._get_elements(element, field['selector'])
|
787 |
+
return [self._extract_item(el, field['fields']) for el in elements]
|
788 |
+
|
789 |
+
return self._extract_single_field(element, field)
|
790 |
+
except Exception as e:
|
791 |
+
if self.verbose:
|
792 |
+
print(f"Error extracting field {field['name']}: {str(e)}")
|
793 |
+
return field.get('default')
|
794 |
+
|
795 |
+
def _extract_single_field(self, element, field):
|
796 |
+
"""
|
797 |
+
Extract a single field based on its type.
|
798 |
+
|
799 |
+
How it works:
|
800 |
+
1. Selects the target element using the field's selector.
|
801 |
+
2. Extracts the field value based on its type (e.g., text, attribute, regex).
|
802 |
+
3. Applies transformations if defined in the schema.
|
803 |
+
|
804 |
+
Args:
|
805 |
+
element: The base element to extract the field from.
|
806 |
+
field (Dict[str, Any]): The field definition in the schema.
|
807 |
+
|
808 |
+
Returns:
|
809 |
+
Any: The extracted field value.
|
810 |
+
"""
|
811 |
+
|
812 |
+
if 'selector' in field:
|
813 |
+
selected = self._get_elements(element, field['selector'])
|
814 |
+
if not selected:
|
815 |
+
return field.get('default')
|
816 |
+
selected = selected[0]
|
817 |
+
else:
|
818 |
+
selected = element
|
819 |
+
|
820 |
+
value = None
|
821 |
+
if field['type'] == 'text':
|
822 |
+
value = self._get_element_text(selected)
|
823 |
+
elif field['type'] == 'attribute':
|
824 |
+
value = self._get_element_attribute(selected, field['attribute'])
|
825 |
+
elif field['type'] == 'html':
|
826 |
+
value = self._get_element_html(selected)
|
827 |
+
elif field['type'] == 'regex':
|
828 |
+
text = self._get_element_text(selected)
|
829 |
+
match = re.search(field['pattern'], text)
|
830 |
+
value = match.group(1) if match else None
|
831 |
+
|
832 |
+
if 'transform' in field:
|
833 |
+
value = self._apply_transform(value, field['transform'])
|
834 |
+
|
835 |
+
return value if value is not None else field.get('default')
|
836 |
+
|
837 |
+
def _extract_list_item(self, element, fields):
|
838 |
+
item = {}
|
839 |
+
for field in fields:
|
840 |
+
value = self._extract_single_field(element, field)
|
841 |
+
if value is not None:
|
842 |
+
item[field['name']] = value
|
843 |
+
return item
|
844 |
+
|
845 |
+
def _extract_item(self, element, fields):
|
846 |
+
"""
|
847 |
+
Extracts fields from a given element.
|
848 |
+
|
849 |
+
How it works:
|
850 |
+
1. Iterates through the fields defined in the schema.
|
851 |
+
2. Handles computed, single, and nested field types.
|
852 |
+
3. Updates the item dictionary with extracted field values.
|
853 |
+
|
854 |
+
Args:
|
855 |
+
element: The base element to extract fields from.
|
856 |
+
fields (List[Dict[str, Any]]): The list of fields to extract.
|
857 |
+
|
858 |
+
Returns:
|
859 |
+
Dict[str, Any]: A dictionary representing the extracted item.
|
860 |
+
"""
|
861 |
+
|
862 |
+
item = {}
|
863 |
+
for field in fields:
|
864 |
+
if field['type'] == 'computed':
|
865 |
+
value = self._compute_field(item, field)
|
866 |
+
else:
|
867 |
+
value = self._extract_field(element, field)
|
868 |
+
if value is not None:
|
869 |
+
item[field['name']] = value
|
870 |
+
return item
|
871 |
+
|
872 |
+
def _apply_transform(self, value, transform):
|
873 |
+
"""
|
874 |
+
Apply a transformation to a value.
|
875 |
+
|
876 |
+
How it works:
|
877 |
+
1. Checks the transformation type (e.g., `lowercase`, `strip`).
|
878 |
+
2. Applies the transformation to the value.
|
879 |
+
3. Returns the transformed value.
|
880 |
+
|
881 |
+
Args:
|
882 |
+
value (str): The value to transform.
|
883 |
+
transform (str): The type of transformation to apply.
|
884 |
+
|
885 |
+
Returns:
|
886 |
+
str: The transformed value.
|
887 |
+
"""
|
888 |
+
|
889 |
+
if transform == 'lowercase':
|
890 |
+
return value.lower()
|
891 |
+
elif transform == 'uppercase':
|
892 |
+
return value.upper()
|
893 |
+
elif transform == 'strip':
|
894 |
+
return value.strip()
|
895 |
+
return value
|
896 |
+
|
897 |
+
def _compute_field(self, item, field):
|
898 |
+
try:
|
899 |
+
if 'expression' in field:
|
900 |
+
return eval(field['expression'], {}, item)
|
901 |
+
elif 'function' in field:
|
902 |
+
return field['function'](item)
|
903 |
+
except Exception as e:
|
904 |
+
if self.verbose:
|
905 |
+
print(f"Error computing field {field['name']}: {str(e)}")
|
906 |
+
return field.get('default')
|
907 |
+
|
908 |
+
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
|
909 |
+
"""
|
910 |
+
Run the extraction strategy on a combined HTML content.
|
911 |
+
|
912 |
+
How it works:
|
913 |
+
1. Combines multiple HTML sections using the `DEL` delimiter.
|
914 |
+
2. Calls the `extract` method with the combined HTML.
|
915 |
+
|
916 |
+
Args:
|
917 |
+
url (str): The URL of the page being processed.
|
918 |
+
sections (List[str]): A list of HTML sections.
|
919 |
+
*q: Additional positional arguments.
|
920 |
+
**kwargs: Additional keyword arguments for custom extraction.
|
921 |
+
|
922 |
+
Returns:
|
923 |
+
List[Dict[str, Any]]: A list of extracted items.
|
924 |
+
"""
|
925 |
+
|
926 |
+
combined_html = self.DEL.join(sections)
|
927 |
+
return self.extract(url, combined_html, **kwargs)
|
928 |
+
|
929 |
+
@abstractmethod
|
930 |
+
def _get_element_text(self, element) -> str:
|
931 |
+
"""Get text content from element"""
|
932 |
+
pass
|
933 |
+
|
934 |
+
@abstractmethod
|
935 |
+
def _get_element_html(self, element) -> str:
|
936 |
+
"""Get HTML content from element"""
|
937 |
+
pass
|
938 |
+
|
939 |
+
@abstractmethod
|
940 |
+
def _get_element_attribute(self, element, attribute: str):
|
941 |
+
"""Get attribute value from element"""
|
942 |
+
pass
|
943 |
+
|
944 |
+
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
|
945 |
+
"""
|
946 |
+
Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.
|
947 |
+
|
948 |
+
How it works:
|
949 |
+
1. Parses HTML content with BeautifulSoup.
|
950 |
+
2. Selects elements using CSS selectors defined in the schema.
|
951 |
+
3. Extracts field data and applies transformations as defined.
|
952 |
+
|
953 |
+
Attributes:
|
954 |
+
schema (Dict[str, Any]): The schema defining the extraction rules.
|
955 |
+
verbose (bool): Enables verbose logging for debugging purposes.
|
956 |
+
|
957 |
+
Methods:
|
958 |
+
_parse_html(html_content): Parses HTML content into a BeautifulSoup object.
|
959 |
+
_get_base_elements(parsed_html, selector): Selects base elements using a CSS selector.
|
960 |
+
_get_elements(element, selector): Selects child elements using a CSS selector.
|
961 |
+
_get_element_text(element): Extracts text content from a BeautifulSoup element.
|
962 |
+
_get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element.
|
963 |
+
_get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element.
|
964 |
+
"""
|
965 |
+
|
966 |
+
def __init__(self, schema: Dict[str, Any], **kwargs):
|
967 |
+
kwargs['input_format'] = 'html' # Force HTML input
|
968 |
+
super().__init__(schema, **kwargs)
|
969 |
+
|
970 |
+
def _parse_html(self, html_content: str):
|
971 |
+
return BeautifulSoup(html_content, 'html.parser')
|
972 |
+
|
973 |
+
def _get_base_elements(self, parsed_html, selector: str):
|
974 |
+
return parsed_html.select(selector)
|
975 |
+
|
976 |
+
def _get_elements(self, element, selector: str):
|
977 |
+
# Return all matching elements using select() instead of select_one()
|
978 |
+
# This ensures that we get all elements that match the selector, not just the first one
|
979 |
+
return element.select(selector)
|
980 |
+
|
981 |
+
def _get_element_text(self, element) -> str:
|
982 |
+
return element.get_text(strip=True)
|
983 |
+
|
984 |
+
def _get_element_html(self, element) -> str:
|
985 |
+
return str(element)
|
986 |
+
|
987 |
+
def _get_element_attribute(self, element, attribute: str):
|
988 |
+
return element.get(attribute)
|
989 |
+
|
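A self-contained sketch of the CSS-based strategy run against a tiny inline HTML snippet; the markup, selectors, and field names are invented for illustration.

```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

schema = {
    "baseSelector": "div.book",
    "fields": [
        {"name": "title", "type": "text", "selector": "h3"},
        {"name": "isbn", "type": "attribute", "selector": "a", "attribute": "data-isbn"},
        {"name": "price", "type": "regex", "selector": ".price", "pattern": r"(\d+\.\d+)"},
    ],
}

html_snippet = """
<div class="book">
  <h3>Deep Crawling 101</h3>
  <a data-isbn="978-0-00-000000-0" href="#">details</a>
  <span class="price">USD 42.00</span>
</div>
"""

strategy = JsonCssExtractionStrategy(schema, verbose=True)
items = strategy.run("https://example.com/catalog", [html_snippet])
print(items)
# -> [{'title': 'Deep Crawling 101', 'isbn': '978-0-00-000000-0', 'price': '42.00'}]
```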
990 |
+
class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
|
991 |
+
"""
|
992 |
+
Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors.
|
993 |
+
|
994 |
+
How it works:
|
995 |
+
1. Parses HTML content into an lxml tree.
|
996 |
+
2. Selects elements using XPath expressions.
|
997 |
+
3. Converts CSS selectors to XPath when needed.
|
998 |
+
|
999 |
+
Attributes:
|
1000 |
+
schema (Dict[str, Any]): The schema defining the extraction rules.
|
1001 |
+
verbose (bool): Enables verbose logging for debugging purposes.
|
1002 |
+
|
1003 |
+
Methods:
|
1004 |
+
_parse_html(html_content): Parses HTML content into an lxml tree.
|
1005 |
+
_get_base_elements(parsed_html, selector): Selects base elements using an XPath selector.
|
1006 |
+
_css_to_xpath(css_selector): Converts a CSS selector to an XPath expression.
|
1007 |
+
_get_elements(element, selector): Selects child elements using an XPath selector.
|
1008 |
+
_get_element_text(element): Extracts text content from an lxml element.
|
1009 |
+
_get_element_html(element): Extracts the raw HTML content of an lxml element.
|
1010 |
+
_get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element.
|
1011 |
+
"""
|
1012 |
+
|
1013 |
+
def __init__(self, schema: Dict[str, Any], **kwargs):
|
1014 |
+
kwargs['input_format'] = 'html' # Force HTML input
|
1015 |
+
super().__init__(schema, **kwargs)
|
1016 |
+
|
1017 |
+
def _parse_html(self, html_content: str):
|
1018 |
+
return html.fromstring(html_content)
|
1019 |
+
|
1020 |
+
def _get_base_elements(self, parsed_html, selector: str):
|
1021 |
+
return parsed_html.xpath(selector)
|
1022 |
+
|
1023 |
+
def _css_to_xpath(self, css_selector: str) -> str:
|
1024 |
+
"""Convert CSS selector to XPath if needed"""
|
1025 |
+
if '/' in css_selector: # Already an XPath
|
1026 |
+
return css_selector
|
1027 |
+
return self._basic_css_to_xpath(css_selector)
|
1028 |
+
|
1029 |
+
def _basic_css_to_xpath(self, css_selector: str) -> str:
|
1030 |
+
"""Basic CSS to XPath conversion for common cases"""
|
1031 |
+
if ' > ' in css_selector:
|
1032 |
+
parts = css_selector.split(' > ')
|
1033 |
+
return '//' + '/'.join(parts)
|
1034 |
+
if ' ' in css_selector:
|
1035 |
+
parts = css_selector.split(' ')
|
1036 |
+
return '//' + '//'.join(parts)
|
1037 |
+
return '//' + css_selector
|
1038 |
+
|
1039 |
+
def _get_elements(self, element, selector: str):
|
1040 |
+
xpath = self._css_to_xpath(selector)
|
1041 |
+
if not xpath.startswith('.'):
|
1042 |
+
xpath = '.' + xpath
|
1043 |
+
return element.xpath(xpath)
|
1044 |
+
|
1045 |
+
def _get_element_text(self, element) -> str:
|
1046 |
+
return ''.join(element.xpath('.//text()')).strip()
|
1047 |
+
|
1048 |
+
def _get_element_html(self, element) -> str:
|
1049 |
+
return etree.tostring(element, encoding='unicode')
|
1050 |
+
|
1051 |
+
def _get_element_attribute(self, element, attribute: str):
|
1052 |
+
return element.get(attribute)
|
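A minimal usage sketch for the XPath strategy defined above. The schema layout (baseSelector / fields) and the extract() call follow the JsonElementExtractionStrategy base class earlier in this file; the URL, selectors, and field names below are illustrative placeholders, not part of the commit.

from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy

# Hypothetical schema: one repeating block, two fields per block.
schema = {
    "name": "Articles",
    "baseSelector": "//div[@class='article']",  # XPath selecting each repeated element
    "fields": [
        {"name": "title", "selector": ".//h2", "type": "text"},
        {"name": "url", "selector": ".//a", "type": "attribute", "attribute": "href"},
    ],
}

strategy = JsonXPathExtractionStrategy(schema, verbose=True)
# extract() parses the raw HTML and returns a list of dicts, one per base element.
items = strategy.extract("https://example.com", "<html>...</html>")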
crawl4ai/html2text/__init__.py
ADDED
@@ -0,0 +1,1141 @@
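A short sketch of how the CustomHTML2Text subclass above is intended to be used; the sample HTML string is a placeholder. preserve_tags and handle_code_in_pre are the options introduced by this subclass, and handle() comes from the HTML2Text base class.

from crawl4ai.html2text import CustomHTML2Text

h = CustomHTML2Text(handle_code_in_pre=False)
h.update_params(preserve_tags={"table"})  # emit <table> blocks as raw HTML instead of Markdown

sample = "<h1>Demo</h1><pre><code>print('hi')</code></pre><table><tr><td>cell</td></tr></table>"
markdown = h.handle(sample)
print(markdown)  # the <pre> becomes a fenced code block; the preserved table passes through verbatim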
1 |
+
"""html2text: Turn HTML into equivalent Markdown-structured text."""
|
2 |
+
|
3 |
+
import html.entities
|
4 |
+
import html.parser
|
5 |
+
import re
|
6 |
+
import string
|
7 |
+
import urllib.parse as urlparse
|
8 |
+
from textwrap import wrap
|
9 |
+
from typing import Dict, List, Optional, Tuple, Union
|
10 |
+
|
11 |
+
from . import config
|
12 |
+
from ._typing import OutCallback
|
13 |
+
from .elements import AnchorElement, ListElement
|
14 |
+
from .utils import (
|
15 |
+
dumb_css_parser,
|
16 |
+
element_style,
|
17 |
+
escape_md,
|
18 |
+
escape_md_section,
|
19 |
+
google_fixed_width_font,
|
20 |
+
google_has_height,
|
21 |
+
google_list_style,
|
22 |
+
google_text_emphasis,
|
23 |
+
hn,
|
24 |
+
list_numbering_start,
|
25 |
+
pad_tables_in_text,
|
26 |
+
skipwrap,
|
27 |
+
unifiable_n,
|
28 |
+
)
|
29 |
+
|
30 |
+
__version__ = (2024, 2, 26)
|
31 |
+
|
32 |
+
|
33 |
+
# TODO:
|
34 |
+
# Support decoded entities with UNIFIABLE.
|
35 |
+
|
36 |
+
|
37 |
+
class HTML2Text(html.parser.HTMLParser):
|
38 |
+
def __init__(
|
39 |
+
self,
|
40 |
+
out: Optional[OutCallback] = None,
|
41 |
+
baseurl: str = "",
|
42 |
+
bodywidth: int = config.BODY_WIDTH,
|
43 |
+
) -> None:
|
44 |
+
"""
|
45 |
+
Input parameters:
|
46 |
+
out: possible custom replacement for self.outtextf (which
|
47 |
+
appends lines of text).
|
48 |
+
baseurl: base URL of the document we process
|
49 |
+
"""
|
50 |
+
super().__init__(convert_charrefs=False)
|
51 |
+
|
52 |
+
# Config options
|
53 |
+
self.split_next_td = False
|
54 |
+
self.td_count = 0
|
55 |
+
self.table_start = False
|
56 |
+
self.unicode_snob = config.UNICODE_SNOB # covered in cli
|
57 |
+
|
58 |
+
self.escape_snob = config.ESCAPE_SNOB # covered in cli
|
59 |
+
self.escape_backslash = config.ESCAPE_BACKSLASH # covered in cli
|
60 |
+
self.escape_dot = config.ESCAPE_DOT # covered in cli
|
61 |
+
self.escape_plus = config.ESCAPE_PLUS # covered in cli
|
62 |
+
self.escape_dash = config.ESCAPE_DASH # covered in cli
|
63 |
+
|
64 |
+
self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
|
65 |
+
self.body_width = bodywidth # covered in cli
|
66 |
+
self.skip_internal_links = config.SKIP_INTERNAL_LINKS # covered in cli
|
67 |
+
self.inline_links = config.INLINE_LINKS # covered in cli
|
68 |
+
self.protect_links = config.PROTECT_LINKS # covered in cli
|
69 |
+
self.google_list_indent = config.GOOGLE_LIST_INDENT # covered in cli
|
70 |
+
self.ignore_links = config.IGNORE_ANCHORS # covered in cli
|
71 |
+
self.ignore_mailto_links = config.IGNORE_MAILTO_LINKS # covered in cli
|
72 |
+
self.ignore_images = config.IGNORE_IMAGES # covered in cli
|
73 |
+
self.images_as_html = config.IMAGES_AS_HTML # covered in cli
|
74 |
+
self.images_to_alt = config.IMAGES_TO_ALT # covered in cli
|
75 |
+
self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli
|
76 |
+
self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli
|
77 |
+
self.bypass_tables = config.BYPASS_TABLES # covered in cli
|
78 |
+
self.ignore_tables = config.IGNORE_TABLES # covered in cli
|
79 |
+
self.google_doc = False # covered in cli
|
80 |
+
self.ul_item_mark = "*" # covered in cli
|
81 |
+
self.emphasis_mark = "_" # covered in cli
|
82 |
+
self.strong_mark = "**"
|
83 |
+
self.single_line_break = config.SINGLE_LINE_BREAK # covered in cli
|
84 |
+
self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli
|
85 |
+
self.hide_strikethrough = False # covered in cli
|
86 |
+
self.mark_code = config.MARK_CODE
|
87 |
+
self.wrap_list_items = config.WRAP_LIST_ITEMS # covered in cli
|
88 |
+
self.wrap_links = config.WRAP_LINKS # covered in cli
|
89 |
+
self.wrap_tables = config.WRAP_TABLES
|
90 |
+
self.pad_tables = config.PAD_TABLES # covered in cli
|
91 |
+
self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli
|
92 |
+
self.tag_callback = None
|
93 |
+
self.open_quote = config.OPEN_QUOTE # covered in cli
|
94 |
+
self.close_quote = config.CLOSE_QUOTE # covered in cli
|
95 |
+
self.include_sup_sub = config.INCLUDE_SUP_SUB # covered in cli
|
96 |
+
|
97 |
+
if out is None:
|
98 |
+
self.out = self.outtextf
|
99 |
+
else:
|
100 |
+
self.out = out
|
101 |
+
|
102 |
+
# empty list to store output characters before they are "joined"
|
103 |
+
self.outtextlist: List[str] = []
|
104 |
+
|
105 |
+
self.quiet = 0
|
106 |
+
self.p_p = 0 # number of newline character to print before next output
|
107 |
+
self.outcount = 0
|
108 |
+
self.start = True
|
109 |
+
self.space = False
|
110 |
+
self.a: List[AnchorElement] = []
|
111 |
+
self.astack: List[Optional[Dict[str, Optional[str]]]] = []
|
112 |
+
self.maybe_automatic_link: Optional[str] = None
|
113 |
+
self.empty_link = False
|
114 |
+
self.absolute_url_matcher = re.compile(r"^[a-zA-Z+]+://")
|
115 |
+
self.acount = 0
|
116 |
+
self.list: List[ListElement] = []
|
117 |
+
self.blockquote = 0
|
118 |
+
self.pre = False
|
119 |
+
self.startpre = False
|
120 |
+
self.code = False
|
121 |
+
self.quote = False
|
122 |
+
self.br_toggle = ""
|
123 |
+
self.lastWasNL = False
|
124 |
+
self.lastWasList = False
|
125 |
+
self.style = 0
|
126 |
+
self.style_def: Dict[str, Dict[str, str]] = {}
|
127 |
+
self.tag_stack: List[Tuple[str, Dict[str, Optional[str]], Dict[str, str]]] = []
|
128 |
+
self.emphasis = 0
|
129 |
+
self.drop_white_space = 0
|
130 |
+
self.inheader = False
|
131 |
+
# Current abbreviation definition
|
132 |
+
self.abbr_title: Optional[str] = None
|
133 |
+
# Last inner HTML (for abbr being defined)
|
134 |
+
self.abbr_data: Optional[str] = None
|
135 |
+
# Stack of abbreviations to write later
|
136 |
+
self.abbr_list: Dict[str, str] = {}
|
137 |
+
self.baseurl = baseurl
|
138 |
+
self.stressed = False
|
139 |
+
self.preceding_stressed = False
|
140 |
+
self.preceding_data = ""
|
141 |
+
self.current_tag = ""
|
142 |
+
|
143 |
+
config.UNIFIABLE["nbsp"] = " _place_holder;"
|
144 |
+
|
145 |
+
def update_params(self, **kwargs):
|
146 |
+
for key, value in kwargs.items():
|
147 |
+
setattr(self, key, value)
|
148 |
+
|
149 |
+
def feed(self, data: str) -> None:
|
150 |
+
data = data.replace("</' + 'script>", "</ignore>")
|
151 |
+
super().feed(data)
|
152 |
+
|
153 |
+
def handle(self, data: str) -> str:
|
154 |
+
self.start = True
|
155 |
+
self.feed(data)
|
156 |
+
self.feed("")
|
157 |
+
markdown = self.optwrap(self.finish())
|
158 |
+
if self.pad_tables:
|
159 |
+
return pad_tables_in_text(markdown)
|
160 |
+
else:
|
161 |
+
return markdown
|
162 |
+
|
163 |
+
def outtextf(self, s: str) -> None:
|
164 |
+
self.outtextlist.append(s)
|
165 |
+
if s:
|
166 |
+
self.lastWasNL = s[-1] == "\n"
|
167 |
+
|
168 |
+
def finish(self) -> str:
|
169 |
+
self.close()
|
170 |
+
|
171 |
+
self.pbr()
|
172 |
+
self.o("", force="end")
|
173 |
+
|
174 |
+
outtext = "".join(self.outtextlist)
|
175 |
+
|
176 |
+
if self.unicode_snob:
|
177 |
+
nbsp = html.entities.html5["nbsp;"]
|
178 |
+
else:
|
179 |
+
nbsp = " "
|
180 |
+
outtext = outtext.replace(" _place_holder;", nbsp)
|
181 |
+
|
182 |
+
# Clear self.outtextlist to avoid memory leak of its content to
|
183 |
+
# the next handling.
|
184 |
+
self.outtextlist = []
|
185 |
+
|
186 |
+
return outtext
|
187 |
+
|
188 |
+
def handle_charref(self, c: str) -> None:
|
189 |
+
self.handle_data(self.charref(c), True)
|
190 |
+
|
191 |
+
def handle_entityref(self, c: str) -> None:
|
192 |
+
ref = self.entityref(c)
|
193 |
+
|
194 |
+
# ref may be an empty string (e.g. for ‎/‏ markers that should
|
195 |
+
# not contribute to the final output).
|
196 |
+
# self.handle_data cannot handle a zero-length string right after a
|
197 |
+
# stressed tag or mid-text within a stressed tag (text get split and
|
198 |
+
# self.stressed/self.preceding_stressed gets switched after the first
|
199 |
+
# part of that text).
|
200 |
+
if ref:
|
201 |
+
self.handle_data(ref, True)
|
202 |
+
|
203 |
+
def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
|
204 |
+
self.handle_tag(tag, dict(attrs), start=True)
|
205 |
+
|
206 |
+
def handle_endtag(self, tag: str) -> None:
|
207 |
+
self.handle_tag(tag, {}, start=False)
|
208 |
+
|
209 |
+
def previousIndex(self, attrs: Dict[str, Optional[str]]) -> Optional[int]:
|
210 |
+
"""
|
211 |
+
:type attrs: dict
|
212 |
+
|
213 |
+
:returns: The index of certain set of attributes (of a link) in the
|
214 |
+
self.a list. If the set of attributes is not found, returns None
|
215 |
+
:rtype: int
|
216 |
+
"""
|
217 |
+
if "href" not in attrs:
|
218 |
+
return None
|
219 |
+
|
220 |
+
match = False
|
221 |
+
for i, a in enumerate(self.a):
|
222 |
+
if "href" in a.attrs and a.attrs["href"] == attrs["href"]:
|
223 |
+
if "title" in a.attrs or "title" in attrs:
|
224 |
+
if (
|
225 |
+
"title" in a.attrs
|
226 |
+
and "title" in attrs
|
227 |
+
and a.attrs["title"] == attrs["title"]
|
228 |
+
):
|
229 |
+
match = True
|
230 |
+
else:
|
231 |
+
match = True
|
232 |
+
|
233 |
+
if match:
|
234 |
+
return i
|
235 |
+
return None
|
236 |
+
|
237 |
+
def handle_emphasis(
|
238 |
+
self, start: bool, tag_style: Dict[str, str], parent_style: Dict[str, str]
|
239 |
+
) -> None:
|
240 |
+
"""
|
241 |
+
Handles various text emphases
|
242 |
+
"""
|
243 |
+
tag_emphasis = google_text_emphasis(tag_style)
|
244 |
+
parent_emphasis = google_text_emphasis(parent_style)
|
245 |
+
|
246 |
+
# handle Google's text emphasis
|
247 |
+
strikethrough = "line-through" in tag_emphasis and self.hide_strikethrough
|
248 |
+
|
249 |
+
# google and others may mark a font's weight as `bold` or `700`
|
250 |
+
bold = False
|
251 |
+
for bold_marker in config.BOLD_TEXT_STYLE_VALUES:
|
252 |
+
bold = bold_marker in tag_emphasis and bold_marker not in parent_emphasis
|
253 |
+
if bold:
|
254 |
+
break
|
255 |
+
|
256 |
+
italic = "italic" in tag_emphasis and "italic" not in parent_emphasis
|
257 |
+
fixed = (
|
258 |
+
google_fixed_width_font(tag_style)
|
259 |
+
and not google_fixed_width_font(parent_style)
|
260 |
+
and not self.pre
|
261 |
+
)
|
262 |
+
|
263 |
+
if start:
|
264 |
+
# crossed-out text must be handled before other attributes
|
265 |
+
# in order not to output qualifiers unnecessarily
|
266 |
+
if bold or italic or fixed:
|
267 |
+
self.emphasis += 1
|
268 |
+
if strikethrough:
|
269 |
+
self.quiet += 1
|
270 |
+
if italic:
|
271 |
+
self.o(self.emphasis_mark)
|
272 |
+
self.drop_white_space += 1
|
273 |
+
if bold:
|
274 |
+
self.o(self.strong_mark)
|
275 |
+
self.drop_white_space += 1
|
276 |
+
if fixed:
|
277 |
+
self.o("`")
|
278 |
+
self.drop_white_space += 1
|
279 |
+
self.code = True
|
280 |
+
else:
|
281 |
+
if bold or italic or fixed:
|
282 |
+
# there must not be whitespace before closing emphasis mark
|
283 |
+
self.emphasis -= 1
|
284 |
+
self.space = False
|
285 |
+
if fixed:
|
286 |
+
if self.drop_white_space:
|
287 |
+
# empty emphasis, drop it
|
288 |
+
self.drop_white_space -= 1
|
289 |
+
else:
|
290 |
+
self.o("`")
|
291 |
+
self.code = False
|
292 |
+
if bold:
|
293 |
+
if self.drop_white_space:
|
294 |
+
# empty emphasis, drop it
|
295 |
+
self.drop_white_space -= 1
|
296 |
+
else:
|
297 |
+
self.o(self.strong_mark)
|
298 |
+
if italic:
|
299 |
+
if self.drop_white_space:
|
300 |
+
# empty emphasis, drop it
|
301 |
+
self.drop_white_space -= 1
|
302 |
+
else:
|
303 |
+
self.o(self.emphasis_mark)
|
304 |
+
# space is only allowed after *all* emphasis marks
|
305 |
+
if (bold or italic) and not self.emphasis:
|
306 |
+
self.o(" ")
|
307 |
+
if strikethrough:
|
308 |
+
self.quiet -= 1
|
309 |
+
|
310 |
+
def handle_tag(
|
311 |
+
self, tag: str, attrs: Dict[str, Optional[str]], start: bool
|
312 |
+
) -> None:
|
313 |
+
self.current_tag = tag
|
314 |
+
|
315 |
+
if self.tag_callback is not None:
|
316 |
+
if self.tag_callback(self, tag, attrs, start) is True:
|
317 |
+
return
|
318 |
+
|
319 |
+
# first thing inside the anchor tag is another tag
|
320 |
+
# that produces some output
|
321 |
+
if (
|
322 |
+
start
|
323 |
+
and self.maybe_automatic_link is not None
|
324 |
+
and tag not in ["p", "div", "style", "dl", "dt"]
|
325 |
+
and (tag != "img" or self.ignore_images)
|
326 |
+
):
|
327 |
+
self.o("[")
|
328 |
+
self.maybe_automatic_link = None
|
329 |
+
self.empty_link = False
|
330 |
+
|
331 |
+
if self.google_doc:
|
332 |
+
# the attrs parameter is empty for a closing tag. in addition, we
|
333 |
+
# need the attributes of the parent nodes in order to get a
|
334 |
+
# complete style description for the current element. we assume
|
335 |
+
# that google docs export well formed html.
|
336 |
+
parent_style: Dict[str, str] = {}
|
337 |
+
if start:
|
338 |
+
if self.tag_stack:
|
339 |
+
parent_style = self.tag_stack[-1][2]
|
340 |
+
tag_style = element_style(attrs, self.style_def, parent_style)
|
341 |
+
self.tag_stack.append((tag, attrs, tag_style))
|
342 |
+
else:
|
343 |
+
dummy, attrs, tag_style = (
|
344 |
+
self.tag_stack.pop() if self.tag_stack else (None, {}, {})
|
345 |
+
)
|
346 |
+
if self.tag_stack:
|
347 |
+
parent_style = self.tag_stack[-1][2]
|
348 |
+
|
349 |
+
if hn(tag):
|
350 |
+
# check if nh is inside of an 'a' tag (incorrect but found in the wild)
|
351 |
+
if self.astack:
|
352 |
+
if start:
|
353 |
+
self.inheader = True
|
354 |
+
# are inside link name, so only add '#' if it can appear before '['
|
355 |
+
if self.outtextlist and self.outtextlist[-1] == "[":
|
356 |
+
self.outtextlist.pop()
|
357 |
+
self.space = False
|
358 |
+
self.o(hn(tag) * "#" + " ")
|
359 |
+
self.o("[")
|
360 |
+
else:
|
361 |
+
self.p_p = 0 # don't break up link name
|
362 |
+
self.inheader = False
|
363 |
+
return # prevent redundant emphasis marks on headers
|
364 |
+
else:
|
365 |
+
self.p()
|
366 |
+
if start:
|
367 |
+
self.inheader = True
|
368 |
+
self.o(hn(tag) * "#" + " ")
|
369 |
+
else:
|
370 |
+
self.inheader = False
|
371 |
+
return # prevent redundant emphasis marks on headers
|
372 |
+
|
373 |
+
if tag in ["p", "div"]:
|
374 |
+
if self.google_doc:
|
375 |
+
if start and google_has_height(tag_style):
|
376 |
+
self.p()
|
377 |
+
else:
|
378 |
+
self.soft_br()
|
379 |
+
elif self.astack:
|
380 |
+
pass
|
381 |
+
elif self.split_next_td:
|
382 |
+
pass
|
383 |
+
else:
|
384 |
+
self.p()
|
385 |
+
|
386 |
+
if tag == "br" and start:
|
387 |
+
if self.blockquote > 0:
|
388 |
+
self.o(" \n> ")
|
389 |
+
else:
|
390 |
+
self.o(" \n")
|
391 |
+
|
392 |
+
if tag == "hr" and start:
|
393 |
+
self.p()
|
394 |
+
self.o("* * *")
|
395 |
+
self.p()
|
396 |
+
|
397 |
+
if tag in ["head", "style", "script"]:
|
398 |
+
if start:
|
399 |
+
self.quiet += 1
|
400 |
+
else:
|
401 |
+
self.quiet -= 1
|
402 |
+
|
403 |
+
if tag == "style":
|
404 |
+
if start:
|
405 |
+
self.style += 1
|
406 |
+
else:
|
407 |
+
self.style -= 1
|
408 |
+
|
409 |
+
if tag in ["body"]:
|
410 |
+
self.quiet = 0 # sites like 9rules.com never close <head>
|
411 |
+
|
412 |
+
if tag == "blockquote":
|
413 |
+
if start:
|
414 |
+
self.p()
|
415 |
+
self.o("> ", force=True)
|
416 |
+
self.start = True
|
417 |
+
self.blockquote += 1
|
418 |
+
else:
|
419 |
+
self.blockquote -= 1
|
420 |
+
self.p()
|
421 |
+
|
422 |
+
if tag in ["em", "i", "u"] and not self.ignore_emphasis:
|
423 |
+
# Separate with a space if we immediately follow an alphanumeric
|
424 |
+
# character, since otherwise Markdown won't render the emphasis
|
425 |
+
# marks, and we'll be left with eg 'foo_bar_' visible.
|
426 |
+
# (Don't add a space otherwise, though, since there isn't one in the
|
427 |
+
# original HTML.)
|
428 |
+
if (
|
429 |
+
start
|
430 |
+
and self.preceding_data
|
431 |
+
and self.preceding_data[-1] not in string.whitespace
|
432 |
+
and self.preceding_data[-1] not in string.punctuation
|
433 |
+
):
|
434 |
+
emphasis = " " + self.emphasis_mark
|
435 |
+
self.preceding_data += " "
|
436 |
+
else:
|
437 |
+
emphasis = self.emphasis_mark
|
438 |
+
|
439 |
+
self.o(emphasis)
|
440 |
+
if start:
|
441 |
+
self.stressed = True
|
442 |
+
|
443 |
+
if tag in ["strong", "b"] and not self.ignore_emphasis:
|
444 |
+
# Separate with space if we immediately follow an * character, since
|
445 |
+
# without it, Markdown won't render the resulting *** correctly.
|
446 |
+
# (Don't add a space otherwise, though, since there isn't one in the
|
447 |
+
# original HTML.)
|
448 |
+
if (
|
449 |
+
start
|
450 |
+
and self.preceding_data
|
451 |
+
# When `self.strong_mark` is set to empty, the next condition
|
452 |
+
# will cause IndexError since it's trying to match the data
|
453 |
+
# with the first character of the `self.strong_mark`.
|
454 |
+
and len(self.strong_mark) > 0
|
455 |
+
and self.preceding_data[-1] == self.strong_mark[0]
|
456 |
+
):
|
457 |
+
strong = " " + self.strong_mark
|
458 |
+
self.preceding_data += " "
|
459 |
+
else:
|
460 |
+
strong = self.strong_mark
|
461 |
+
|
462 |
+
self.o(strong)
|
463 |
+
if start:
|
464 |
+
self.stressed = True
|
465 |
+
|
466 |
+
if tag in ["del", "strike", "s"]:
|
467 |
+
if start and self.preceding_data and self.preceding_data[-1] == "~":
|
468 |
+
strike = " ~~"
|
469 |
+
self.preceding_data += " "
|
470 |
+
else:
|
471 |
+
strike = "~~"
|
472 |
+
|
473 |
+
self.o(strike)
|
474 |
+
if start:
|
475 |
+
self.stressed = True
|
476 |
+
|
477 |
+
if self.google_doc:
|
478 |
+
if not self.inheader:
|
479 |
+
# handle some font attributes, but leave headers clean
|
480 |
+
self.handle_emphasis(start, tag_style, parent_style)
|
481 |
+
|
482 |
+
if tag in ["kbd", "code", "tt"] and not self.pre:
|
483 |
+
self.o("`") # TODO: `` `this` ``
|
484 |
+
self.code = not self.code
|
485 |
+
|
486 |
+
if tag == "abbr":
|
487 |
+
if start:
|
488 |
+
self.abbr_title = None
|
489 |
+
self.abbr_data = ""
|
490 |
+
if "title" in attrs:
|
491 |
+
self.abbr_title = attrs["title"]
|
492 |
+
else:
|
493 |
+
if self.abbr_title is not None:
|
494 |
+
assert self.abbr_data is not None
|
495 |
+
self.abbr_list[self.abbr_data] = self.abbr_title
|
496 |
+
self.abbr_title = None
|
497 |
+
self.abbr_data = None
|
498 |
+
|
499 |
+
if tag == "q":
|
500 |
+
if not self.quote:
|
501 |
+
self.o(self.open_quote)
|
502 |
+
else:
|
503 |
+
self.o(self.close_quote)
|
504 |
+
self.quote = not self.quote
|
505 |
+
|
506 |
+
def link_url(self: HTML2Text, link: str, title: str = "") -> None:
|
507 |
+
url = urlparse.urljoin(self.baseurl, link)
|
508 |
+
title = ' "{}"'.format(title) if title.strip() else ""
|
509 |
+
self.o("]({url}{title})".format(url=escape_md(url), title=title))
|
510 |
+
|
511 |
+
if tag == "a" and not self.ignore_links:
|
512 |
+
if start:
|
513 |
+
if (
|
514 |
+
"href" in attrs
|
515 |
+
and attrs["href"] is not None
|
516 |
+
and not (self.skip_internal_links and attrs["href"].startswith("#"))
|
517 |
+
and not (
|
518 |
+
self.ignore_mailto_links and attrs["href"].startswith("mailto:")
|
519 |
+
)
|
520 |
+
):
|
521 |
+
self.astack.append(attrs)
|
522 |
+
self.maybe_automatic_link = attrs["href"]
|
523 |
+
self.empty_link = True
|
524 |
+
if self.protect_links:
|
525 |
+
attrs["href"] = "<" + attrs["href"] + ">"
|
526 |
+
else:
|
527 |
+
self.astack.append(None)
|
528 |
+
else:
|
529 |
+
if self.astack:
|
530 |
+
a = self.astack.pop()
|
531 |
+
if self.maybe_automatic_link and not self.empty_link:
|
532 |
+
self.maybe_automatic_link = None
|
533 |
+
elif a:
|
534 |
+
assert a["href"] is not None
|
535 |
+
if self.empty_link:
|
536 |
+
self.o("[")
|
537 |
+
self.empty_link = False
|
538 |
+
self.maybe_automatic_link = None
|
539 |
+
if self.inline_links:
|
540 |
+
self.p_p = 0
|
541 |
+
title = a.get("title") or ""
|
542 |
+
title = escape_md(title)
|
543 |
+
link_url(self, a["href"], title)
|
544 |
+
else:
|
545 |
+
i = self.previousIndex(a)
|
546 |
+
if i is not None:
|
547 |
+
a_props = self.a[i]
|
548 |
+
else:
|
549 |
+
self.acount += 1
|
550 |
+
a_props = AnchorElement(a, self.acount, self.outcount)
|
551 |
+
self.a.append(a_props)
|
552 |
+
self.o("][" + str(a_props.count) + "]")
|
553 |
+
|
554 |
+
if tag == "img" and start and not self.ignore_images:
|
555 |
+
if "src" in attrs and attrs["src"] is not None:
|
556 |
+
if not self.images_to_alt:
|
557 |
+
attrs["href"] = attrs["src"]
|
558 |
+
alt = attrs.get("alt") or self.default_image_alt
|
559 |
+
|
560 |
+
# If we have images_with_size, write raw html including width,
|
561 |
+
# height, and alt attributes
|
562 |
+
if self.images_as_html or (
|
563 |
+
self.images_with_size and ("width" in attrs or "height" in attrs)
|
564 |
+
):
|
565 |
+
self.o("<img src='" + attrs["src"] + "' ")
|
566 |
+
if "width" in attrs and attrs["width"] is not None:
|
567 |
+
self.o("width='" + attrs["width"] + "' ")
|
568 |
+
if "height" in attrs and attrs["height"] is not None:
|
569 |
+
self.o("height='" + attrs["height"] + "' ")
|
570 |
+
if alt:
|
571 |
+
self.o("alt='" + alt + "' ")
|
572 |
+
self.o("/>")
|
573 |
+
return
|
574 |
+
|
575 |
+
# If we have a link to create, output the start
|
576 |
+
if self.maybe_automatic_link is not None:
|
577 |
+
href = self.maybe_automatic_link
|
578 |
+
if (
|
579 |
+
self.images_to_alt
|
580 |
+
and escape_md(alt) == href
|
581 |
+
and self.absolute_url_matcher.match(href)
|
582 |
+
):
|
583 |
+
self.o("<" + escape_md(alt) + ">")
|
584 |
+
self.empty_link = False
|
585 |
+
return
|
586 |
+
else:
|
587 |
+
self.o("[")
|
588 |
+
self.maybe_automatic_link = None
|
589 |
+
self.empty_link = False
|
590 |
+
|
591 |
+
# If we have images_to_alt, we discard the image itself,
|
592 |
+
# considering only the alt text.
|
593 |
+
if self.images_to_alt:
|
594 |
+
self.o(escape_md(alt))
|
595 |
+
else:
|
596 |
+
self.o("![" + escape_md(alt) + "]")
|
597 |
+
if self.inline_links:
|
598 |
+
href = attrs.get("href") or ""
|
599 |
+
self.o(
|
600 |
+
"(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")"
|
601 |
+
)
|
602 |
+
else:
|
603 |
+
i = self.previousIndex(attrs)
|
604 |
+
if i is not None:
|
605 |
+
a_props = self.a[i]
|
606 |
+
else:
|
607 |
+
self.acount += 1
|
608 |
+
a_props = AnchorElement(attrs, self.acount, self.outcount)
|
609 |
+
self.a.append(a_props)
|
610 |
+
self.o("[" + str(a_props.count) + "]")
|
611 |
+
|
612 |
+
if tag == "dl" and start:
|
613 |
+
self.p()
|
614 |
+
if tag == "dt" and not start:
|
615 |
+
self.pbr()
|
616 |
+
if tag == "dd" and start:
|
617 |
+
self.o(" ")
|
618 |
+
if tag == "dd" and not start:
|
619 |
+
self.pbr()
|
620 |
+
|
621 |
+
if tag in ["ol", "ul"]:
|
622 |
+
# Google Docs create sub lists as top level lists
|
623 |
+
if not self.list and not self.lastWasList:
|
624 |
+
self.p()
|
625 |
+
if start:
|
626 |
+
if self.google_doc:
|
627 |
+
list_style = google_list_style(tag_style)
|
628 |
+
else:
|
629 |
+
list_style = tag
|
630 |
+
numbering_start = list_numbering_start(attrs)
|
631 |
+
self.list.append(ListElement(list_style, numbering_start))
|
632 |
+
else:
|
633 |
+
if self.list:
|
634 |
+
self.list.pop()
|
635 |
+
if not self.google_doc and not self.list:
|
636 |
+
self.o("\n")
|
637 |
+
self.lastWasList = True
|
638 |
+
else:
|
639 |
+
self.lastWasList = False
|
640 |
+
|
641 |
+
if tag == "li":
|
642 |
+
self.pbr()
|
643 |
+
if start:
|
644 |
+
if self.list:
|
645 |
+
li = self.list[-1]
|
646 |
+
else:
|
647 |
+
li = ListElement("ul", 0)
|
648 |
+
if self.google_doc:
|
649 |
+
self.o(" " * self.google_nest_count(tag_style))
|
650 |
+
else:
|
651 |
+
# Indent two spaces per list, except use three spaces for an
|
652 |
+
# unordered list inside an ordered list.
|
653 |
+
# https://spec.commonmark.org/0.28/#motivation
|
654 |
+
# TODO: line up <ol><li>s > 9 correctly.
|
655 |
+
parent_list = None
|
656 |
+
for list in self.list:
|
657 |
+
self.o(
|
658 |
+
" " if parent_list == "ol" and list.name == "ul" else " "
|
659 |
+
)
|
660 |
+
parent_list = list.name
|
661 |
+
|
662 |
+
if li.name == "ul":
|
663 |
+
self.o(self.ul_item_mark + " ")
|
664 |
+
elif li.name == "ol":
|
665 |
+
li.num += 1
|
666 |
+
self.o(str(li.num) + ". ")
|
667 |
+
self.start = True
|
668 |
+
|
669 |
+
if tag in ["table", "tr", "td", "th"]:
|
670 |
+
if self.ignore_tables:
|
671 |
+
if tag == "tr":
|
672 |
+
if start:
|
673 |
+
pass
|
674 |
+
else:
|
675 |
+
self.soft_br()
|
676 |
+
else:
|
677 |
+
pass
|
678 |
+
|
679 |
+
elif self.bypass_tables:
|
680 |
+
if start:
|
681 |
+
self.soft_br()
|
682 |
+
if tag in ["td", "th"]:
|
683 |
+
if start:
|
684 |
+
self.o("<{}>\n\n".format(tag))
|
685 |
+
else:
|
686 |
+
self.o("\n</{}>".format(tag))
|
687 |
+
else:
|
688 |
+
if start:
|
689 |
+
self.o("<{}>".format(tag))
|
690 |
+
else:
|
691 |
+
self.o("</{}>".format(tag))
|
692 |
+
|
693 |
+
else:
|
694 |
+
if tag == "table":
|
695 |
+
if start:
|
696 |
+
self.table_start = True
|
697 |
+
if self.pad_tables:
|
698 |
+
self.o("<" + config.TABLE_MARKER_FOR_PAD + ">")
|
699 |
+
self.o(" \n")
|
700 |
+
else:
|
701 |
+
if self.pad_tables:
|
702 |
+
# add break in case the table is empty or its 1 row table
|
703 |
+
self.soft_br()
|
704 |
+
self.o("</" + config.TABLE_MARKER_FOR_PAD + ">")
|
705 |
+
self.o(" \n")
|
706 |
+
if tag in ["td", "th"] and start:
|
707 |
+
if self.split_next_td:
|
708 |
+
self.o("| ")
|
709 |
+
self.split_next_td = True
|
710 |
+
|
711 |
+
if tag == "tr" and start:
|
712 |
+
self.td_count = 0
|
713 |
+
if tag == "tr" and not start:
|
714 |
+
self.split_next_td = False
|
715 |
+
self.soft_br()
|
716 |
+
if tag == "tr" and not start and self.table_start:
|
717 |
+
# Underline table header
|
718 |
+
self.o("|".join(["---"] * self.td_count))
|
719 |
+
self.soft_br()
|
720 |
+
self.table_start = False
|
721 |
+
if tag in ["td", "th"] and start:
|
722 |
+
self.td_count += 1
|
723 |
+
|
724 |
+
if tag == "pre":
|
725 |
+
if start:
|
726 |
+
self.startpre = True
|
727 |
+
self.pre = True
|
728 |
+
else:
|
729 |
+
self.pre = False
|
730 |
+
if self.mark_code:
|
731 |
+
self.out("\n[/code]")
|
732 |
+
self.p()
|
733 |
+
|
734 |
+
if tag in ["sup", "sub"] and self.include_sup_sub:
|
735 |
+
if start:
|
736 |
+
self.o("<{}>".format(tag))
|
737 |
+
else:
|
738 |
+
self.o("</{}>".format(tag))
|
739 |
+
|
740 |
+
# TODO: Add docstring for these one letter functions
|
741 |
+
def pbr(self) -> None:
|
742 |
+
"Pretty print has a line break"
|
743 |
+
if self.p_p == 0:
|
744 |
+
self.p_p = 1
|
745 |
+
|
746 |
+
def p(self) -> None:
|
747 |
+
"Set pretty print to 1 or 2 lines"
|
748 |
+
self.p_p = 1 if self.single_line_break else 2
|
749 |
+
|
750 |
+
def soft_br(self) -> None:
|
751 |
+
"Soft breaks"
|
752 |
+
self.pbr()
|
753 |
+
self.br_toggle = " "
|
754 |
+
|
755 |
+
def o(
|
756 |
+
self, data: str, puredata: bool = False, force: Union[bool, str] = False
|
757 |
+
) -> None:
|
758 |
+
"""
|
759 |
+
Deal with indentation and whitespace
|
760 |
+
"""
|
761 |
+
if self.abbr_data is not None:
|
762 |
+
self.abbr_data += data
|
763 |
+
|
764 |
+
if not self.quiet:
|
765 |
+
if self.google_doc:
|
766 |
+
# prevent white space immediately after 'begin emphasis'
|
767 |
+
# marks ('**' and '_')
|
768 |
+
lstripped_data = data.lstrip()
|
769 |
+
if self.drop_white_space and not (self.pre or self.code):
|
770 |
+
data = lstripped_data
|
771 |
+
if lstripped_data != "":
|
772 |
+
self.drop_white_space = 0
|
773 |
+
|
774 |
+
if puredata and not self.pre:
|
775 |
+
# This is a very dangerous call ... it could mess up
|
776 |
+
# all handling of when not handled properly
|
777 |
+
# (see entityref)
|
778 |
+
data = re.sub(r"\s+", r" ", data)
|
779 |
+
if data and data[0] == " ":
|
780 |
+
self.space = True
|
781 |
+
data = data[1:]
|
782 |
+
if not data and not force:
|
783 |
+
return
|
784 |
+
|
785 |
+
if self.startpre:
|
786 |
+
# self.out(" :") #TODO: not output when already one there
|
787 |
+
if not data.startswith("\n") and not data.startswith("\r\n"):
|
788 |
+
# <pre>stuff...
|
789 |
+
data = "\n" + data
|
790 |
+
if self.mark_code:
|
791 |
+
self.out("\n[code]")
|
792 |
+
self.p_p = 0
|
793 |
+
|
794 |
+
bq = ">" * self.blockquote
|
795 |
+
if not (force and data and data[0] == ">") and self.blockquote:
|
796 |
+
bq += " "
|
797 |
+
|
798 |
+
if self.pre:
|
799 |
+
if not self.list:
|
800 |
+
bq += " "
|
801 |
+
# else: list content is already partially indented
|
802 |
+
bq += " " * len(self.list)
|
803 |
+
data = data.replace("\n", "\n" + bq)
|
804 |
+
|
805 |
+
if self.startpre:
|
806 |
+
self.startpre = False
|
807 |
+
if self.list:
|
808 |
+
# use existing initial indentation
|
809 |
+
data = data.lstrip("\n")
|
810 |
+
|
811 |
+
if self.start:
|
812 |
+
self.space = False
|
813 |
+
self.p_p = 0
|
814 |
+
self.start = False
|
815 |
+
|
816 |
+
if force == "end":
|
817 |
+
# It's the end.
|
818 |
+
self.p_p = 0
|
819 |
+
self.out("\n")
|
820 |
+
self.space = False
|
821 |
+
|
822 |
+
if self.p_p:
|
823 |
+
self.out((self.br_toggle + "\n" + bq) * self.p_p)
|
824 |
+
self.space = False
|
825 |
+
self.br_toggle = ""
|
826 |
+
|
827 |
+
if self.space:
|
828 |
+
if not self.lastWasNL:
|
829 |
+
self.out(" ")
|
830 |
+
self.space = False
|
831 |
+
|
832 |
+
if self.a and (
|
833 |
+
(self.p_p == 2 and self.links_each_paragraph) or force == "end"
|
834 |
+
):
|
835 |
+
if force == "end":
|
836 |
+
self.out("\n")
|
837 |
+
|
838 |
+
newa = []
|
839 |
+
for link in self.a:
|
840 |
+
if self.outcount > link.outcount:
|
841 |
+
self.out(
|
842 |
+
" ["
|
843 |
+
+ str(link.count)
|
844 |
+
+ "]: "
|
845 |
+
+ urlparse.urljoin(self.baseurl, link.attrs["href"])
|
846 |
+
)
|
847 |
+
if "title" in link.attrs and link.attrs["title"] is not None:
|
848 |
+
self.out(" (" + link.attrs["title"] + ")")
|
849 |
+
self.out("\n")
|
850 |
+
else:
|
851 |
+
newa.append(link)
|
852 |
+
|
853 |
+
# Don't need an extra line when nothing was done.
|
854 |
+
if self.a != newa:
|
855 |
+
self.out("\n")
|
856 |
+
|
857 |
+
self.a = newa
|
858 |
+
|
859 |
+
if self.abbr_list and force == "end":
|
860 |
+
for abbr, definition in self.abbr_list.items():
|
861 |
+
self.out(" *[" + abbr + "]: " + definition + "\n")
|
862 |
+
|
863 |
+
self.p_p = 0
|
864 |
+
self.out(data)
|
865 |
+
self.outcount += 1
|
866 |
+
|
867 |
+
def handle_data(self, data: str, entity_char: bool = False) -> None:
|
868 |
+
if not data:
|
869 |
+
# Data may be empty for some HTML entities. For example,
|
870 |
+
# LEFT-TO-RIGHT MARK.
|
871 |
+
return
|
872 |
+
|
873 |
+
if self.stressed:
|
874 |
+
data = data.strip()
|
875 |
+
self.stressed = False
|
876 |
+
self.preceding_stressed = True
|
877 |
+
elif self.preceding_stressed:
|
878 |
+
if (
|
879 |
+
re.match(r"[^][(){}\s.!?]", data[0])
|
880 |
+
and not hn(self.current_tag)
|
881 |
+
and self.current_tag not in ["a", "code", "pre"]
|
882 |
+
):
|
883 |
+
# should match a letter or common punctuation
|
884 |
+
data = " " + data
|
885 |
+
self.preceding_stressed = False
|
886 |
+
|
887 |
+
if self.style:
|
888 |
+
self.style_def.update(dumb_css_parser(data))
|
889 |
+
|
890 |
+
if self.maybe_automatic_link is not None:
|
891 |
+
href = self.maybe_automatic_link
|
892 |
+
if (
|
893 |
+
href == data
|
894 |
+
and self.absolute_url_matcher.match(href)
|
895 |
+
and self.use_automatic_links
|
896 |
+
):
|
897 |
+
self.o("<" + data + ">")
|
898 |
+
self.empty_link = False
|
899 |
+
return
|
900 |
+
else:
|
901 |
+
self.o("[")
|
902 |
+
self.maybe_automatic_link = None
|
903 |
+
self.empty_link = False
|
904 |
+
|
905 |
+
if not self.code and not self.pre and not entity_char:
|
906 |
+
data = escape_md_section(data, snob=self.escape_snob, escape_dot=self.escape_dot, escape_plus=self.escape_plus, escape_dash=self.escape_dash)
|
907 |
+
self.preceding_data = data
|
908 |
+
self.o(data, puredata=True)
|
909 |
+
|
910 |
+
def charref(self, name: str) -> str:
|
911 |
+
if name[0] in ["x", "X"]:
|
912 |
+
c = int(name[1:], 16)
|
913 |
+
else:
|
914 |
+
c = int(name)
|
915 |
+
|
916 |
+
if not self.unicode_snob and c in unifiable_n:
|
917 |
+
return unifiable_n[c]
|
918 |
+
else:
|
919 |
+
try:
|
920 |
+
return chr(c)
|
921 |
+
except ValueError: # invalid unicode
|
922 |
+
return ""
|
923 |
+
|
924 |
+
def entityref(self, c: str) -> str:
|
925 |
+
if not self.unicode_snob and c in config.UNIFIABLE:
|
926 |
+
return config.UNIFIABLE[c]
|
927 |
+
try:
|
928 |
+
ch = html.entities.html5[c + ";"]
|
929 |
+
except KeyError:
|
930 |
+
return "&" + c + ";"
|
931 |
+
return config.UNIFIABLE[c] if c == "nbsp" else ch
|
932 |
+
|
933 |
+
def google_nest_count(self, style: Dict[str, str]) -> int:
|
934 |
+
"""
|
935 |
+
Calculate the nesting count of google doc lists
|
936 |
+
|
937 |
+
:type style: dict
|
938 |
+
|
939 |
+
:rtype: int
|
940 |
+
"""
|
941 |
+
nest_count = 0
|
942 |
+
if "margin-left" in style:
|
943 |
+
nest_count = int(style["margin-left"][:-2]) // self.google_list_indent
|
944 |
+
|
945 |
+
return nest_count
|
946 |
+
|
947 |
+
def optwrap(self, text: str) -> str:
|
948 |
+
"""
|
949 |
+
Wrap all paragraphs in the provided text.
|
950 |
+
|
951 |
+
:type text: str
|
952 |
+
|
953 |
+
:rtype: str
|
954 |
+
"""
|
955 |
+
if not self.body_width:
|
956 |
+
return text
|
957 |
+
|
958 |
+
result = ""
|
959 |
+
newlines = 0
|
960 |
+
# I cannot think of a better solution for now.
|
961 |
+
# To avoid the non-wrap behaviour for entire paras
|
962 |
+
# because of the presence of a link in it
|
963 |
+
if not self.wrap_links:
|
964 |
+
self.inline_links = False
|
965 |
+
for para in text.split("\n"):
|
966 |
+
if len(para) > 0:
|
967 |
+
if not skipwrap(
|
968 |
+
para, self.wrap_links, self.wrap_list_items, self.wrap_tables
|
969 |
+
):
|
970 |
+
indent = ""
|
971 |
+
if para.startswith(" " + self.ul_item_mark):
|
972 |
+
# list item continuation: add a double indent to the
|
973 |
+
# new lines
|
974 |
+
indent = " "
|
975 |
+
elif para.startswith("> "):
|
976 |
+
# blockquote continuation: add the greater than symbol
|
977 |
+
# to the new lines
|
978 |
+
indent = "> "
|
979 |
+
wrapped = wrap(
|
980 |
+
para,
|
981 |
+
self.body_width,
|
982 |
+
break_long_words=False,
|
983 |
+
subsequent_indent=indent,
|
984 |
+
)
|
985 |
+
result += "\n".join(wrapped)
|
986 |
+
if para.endswith(" "):
|
987 |
+
result += " \n"
|
988 |
+
newlines = 1
|
989 |
+
elif indent:
|
990 |
+
result += "\n"
|
991 |
+
newlines = 1
|
992 |
+
else:
|
993 |
+
result += "\n\n"
|
994 |
+
newlines = 2
|
995 |
+
else:
|
996 |
+
# Warning for the tempted!!!
|
997 |
+
# Be aware that obvious replacement of this with
|
998 |
+
# line.isspace()
|
999 |
+
# DOES NOT work! Explanations are welcome.
|
1000 |
+
if not config.RE_SPACE.match(para):
|
1001 |
+
result += para + "\n"
|
1002 |
+
newlines = 1
|
1003 |
+
else:
|
1004 |
+
if newlines < 2:
|
1005 |
+
result += "\n"
|
1006 |
+
newlines += 1
|
1007 |
+
return result
|
1008 |
+
|
1009 |
+
def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str:
|
1010 |
+
if bodywidth is None:
|
1011 |
+
bodywidth = config.BODY_WIDTH
|
1012 |
+
h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth)
|
1013 |
+
|
1014 |
+
return h.handle(html)
|
1015 |
+
|
1016 |
+
class CustomHTML2Text(HTML2Text):
|
1017 |
+
def __init__(self, *args, handle_code_in_pre=False, **kwargs):
|
1018 |
+
super().__init__(*args, **kwargs)
|
1019 |
+
self.inside_pre = False
|
1020 |
+
self.inside_code = False
|
1021 |
+
self.preserve_tags = set() # Set of tags to preserve
|
1022 |
+
self.current_preserved_tag = None
|
1023 |
+
self.preserved_content = []
|
1024 |
+
self.preserve_depth = 0
|
1025 |
+
self.handle_code_in_pre = handle_code_in_pre
|
1026 |
+
|
1027 |
+
# Configuration options
|
1028 |
+
self.skip_internal_links = False
|
1029 |
+
self.single_line_break = False
|
1030 |
+
self.mark_code = False
|
1031 |
+
self.include_sup_sub = False
|
1032 |
+
self.body_width = 0
|
1033 |
+
self.ignore_mailto_links = True
|
1034 |
+
self.ignore_links = False
|
1035 |
+
self.escape_backslash = False
|
1036 |
+
self.escape_dot = False
|
1037 |
+
self.escape_plus = False
|
1038 |
+
self.escape_dash = False
|
1039 |
+
self.escape_snob = False
|
1040 |
+
|
1041 |
+
def update_params(self, **kwargs):
|
1042 |
+
"""Update parameters and set preserved tags."""
|
1043 |
+
for key, value in kwargs.items():
|
1044 |
+
if key == 'preserve_tags':
|
1045 |
+
self.preserve_tags = set(value)
|
1046 |
+
elif key == 'handle_code_in_pre':
|
1047 |
+
self.handle_code_in_pre = value
|
1048 |
+
else:
|
1049 |
+
setattr(self, key, value)
|
1050 |
+
|
1051 |
+
def handle_tag(self, tag, attrs, start):
|
1052 |
+
# Handle preserved tags
|
1053 |
+
if tag in self.preserve_tags:
|
1054 |
+
if start:
|
1055 |
+
if self.preserve_depth == 0:
|
1056 |
+
self.current_preserved_tag = tag
|
1057 |
+
self.preserved_content = []
|
1058 |
+
# Format opening tag with attributes
|
1059 |
+
attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
|
1060 |
+
self.preserved_content.append(f'<{tag}{attr_str}>')
|
1061 |
+
self.preserve_depth += 1
|
1062 |
+
return
|
1063 |
+
else:
|
1064 |
+
self.preserve_depth -= 1
|
1065 |
+
if self.preserve_depth == 0:
|
1066 |
+
self.preserved_content.append(f'</{tag}>')
|
1067 |
+
# Output the preserved HTML block with proper spacing
|
1068 |
+
preserved_html = ''.join(self.preserved_content)
|
1069 |
+
self.o('\n' + preserved_html + '\n')
|
1070 |
+
self.current_preserved_tag = None
|
1071 |
+
return
|
1072 |
+
|
1073 |
+
# If we're inside a preserved tag, collect all content
|
1074 |
+
if self.preserve_depth > 0:
|
1075 |
+
if start:
|
1076 |
+
# Format nested tags with attributes
|
1077 |
+
attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
|
1078 |
+
self.preserved_content.append(f'<{tag}{attr_str}>')
|
1079 |
+
else:
|
1080 |
+
self.preserved_content.append(f'</{tag}>')
|
1081 |
+
return
|
1082 |
+
|
1083 |
+
# Handle pre tags
|
1084 |
+
if tag == 'pre':
|
1085 |
+
if start:
|
1086 |
+
self.o('```\n') # Markdown code block start
|
1087 |
+
self.inside_pre = True
|
1088 |
+
else:
|
1089 |
+
self.o('\n```\n') # Markdown code block end
|
1090 |
+
self.inside_pre = False
|
1091 |
+
elif tag == 'code':
|
1092 |
+
if self.inside_pre and not self.handle_code_in_pre:
|
1093 |
+
# Ignore code tags inside pre blocks if handle_code_in_pre is False
|
1094 |
+
return
|
1095 |
+
if start:
|
1096 |
+
self.o('`') # Markdown inline code start
|
1097 |
+
self.inside_code = True
|
1098 |
+
else:
|
1099 |
+
self.o('`') # Markdown inline code end
|
1100 |
+
self.inside_code = False
|
1101 |
+
else:
|
1102 |
+
super().handle_tag(tag, attrs, start)
|
1103 |
+
|
1104 |
+
def handle_data(self, data, entity_char=False):
|
1105 |
+
"""Override handle_data to capture content within preserved tags."""
|
1106 |
+
if self.preserve_depth > 0:
|
1107 |
+
self.preserved_content.append(data)
|
1108 |
+
return
|
1109 |
+
|
1110 |
+
if self.inside_pre:
|
1111 |
+
# Output the raw content for pre blocks, including content inside code tags
|
1112 |
+
self.o(data) # Directly output the data as-is (preserve newlines)
|
1113 |
+
return
|
1114 |
+
if self.inside_code:
|
1115 |
+
# Inline code: no newlines allowed
|
1116 |
+
self.o(data.replace('\n', ' '))
|
1117 |
+
return
|
1118 |
+
|
1119 |
+
# Default behavior for other tags
|
1120 |
+
super().handle_data(data, entity_char)
|
1121 |
+
|
1122 |
+
|
1123 |
+
# # Handle pre tags
|
1124 |
+
# if tag == 'pre':
|
1125 |
+
# if start:
|
1126 |
+
# self.o('```\n')
|
1127 |
+
# self.inside_pre = True
|
1128 |
+
# else:
|
1129 |
+
# self.o('\n```')
|
1130 |
+
# self.inside_pre = False
|
1131 |
+
# # elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
1132 |
+
# # pass
|
1133 |
+
# else:
|
1134 |
+
# super().handle_tag(tag, attrs, start)
|
1135 |
+
|
1136 |
+
# def handle_data(self, data, entity_char=False):
|
1137 |
+
# """Override handle_data to capture content within preserved tags."""
|
1138 |
+
# if self.preserve_depth > 0:
|
1139 |
+
# self.preserved_content.append(data)
|
1140 |
+
# return
|
1141 |
+
# super().handle_data(data, entity_char)
|
crawl4ai/html2text/__main__.py
ADDED
@@ -0,0 +1,3 @@
from .cli import main

main()
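For completeness, the module-level html2text() helper defined in __init__.py above is the simplest programmatic entry point, and is roughly what this __main__ hook exposes on the command line with default options; the input string here is only an example.

from crawl4ai.html2text import html2text

# bodywidth=0 disables line wrapping (the default wraps at config.BODY_WIDTH).
print(html2text("<h1>Title</h1><p>Hello <b>world</b></p>", bodywidth=0))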
crawl4ai/html2text/_typing.py
ADDED
@@ -0,0 +1,2 @@
class OutCallback:
    def __call__(self, s: str) -> None: ...
crawl4ai/html2text/cli.py
ADDED
@@ -0,0 +1,330 @@
1 |
+
import argparse
|
2 |
+
import sys
|
3 |
+
|
4 |
+
from . import HTML2Text, __version__, config
|
5 |
+
|
6 |
+
|
7 |
+
def main() -> None:
|
8 |
+
baseurl = ""
|
9 |
+
|
10 |
+
class bcolors:
|
11 |
+
HEADER = "\033[95m"
|
12 |
+
OKBLUE = "\033[94m"
|
13 |
+
OKGREEN = "\033[92m"
|
14 |
+
WARNING = "\033[93m"
|
15 |
+
FAIL = "\033[91m"
|
16 |
+
ENDC = "\033[0m"
|
17 |
+
BOLD = "\033[1m"
|
18 |
+
UNDERLINE = "\033[4m"
|
19 |
+
|
20 |
+
p = argparse.ArgumentParser()
|
21 |
+
p.add_argument(
|
22 |
+
"--default-image-alt",
|
23 |
+
dest="default_image_alt",
|
24 |
+
default=config.DEFAULT_IMAGE_ALT,
|
25 |
+
help="The default alt string for images with missing ones",
|
26 |
+
)
|
27 |
+
p.add_argument(
|
28 |
+
"--pad-tables",
|
29 |
+
dest="pad_tables",
|
30 |
+
action="store_true",
|
31 |
+
default=config.PAD_TABLES,
|
32 |
+
help="pad the cells to equal column width in tables",
|
33 |
+
)
|
34 |
+
p.add_argument(
|
35 |
+
"--no-wrap-links",
|
36 |
+
dest="wrap_links",
|
37 |
+
action="store_false",
|
38 |
+
default=config.WRAP_LINKS,
|
39 |
+
help="don't wrap links during conversion",
|
40 |
+
)
|
41 |
+
p.add_argument(
|
42 |
+
"--wrap-list-items",
|
43 |
+
dest="wrap_list_items",
|
44 |
+
action="store_true",
|
45 |
+
default=config.WRAP_LIST_ITEMS,
|
46 |
+
help="wrap list items during conversion",
|
47 |
+
)
|
48 |
+
p.add_argument(
|
49 |
+
"--wrap-tables",
|
50 |
+
dest="wrap_tables",
|
51 |
+
action="store_true",
|
52 |
+
default=config.WRAP_TABLES,
|
53 |
+
help="wrap tables",
|
54 |
+
)
|
55 |
+
p.add_argument(
|
56 |
+
"--ignore-emphasis",
|
57 |
+
dest="ignore_emphasis",
|
58 |
+
action="store_true",
|
59 |
+
default=config.IGNORE_EMPHASIS,
|
60 |
+
help="don't include any formatting for emphasis",
|
61 |
+
)
|
62 |
+
p.add_argument(
|
63 |
+
"--reference-links",
|
64 |
+
dest="inline_links",
|
65 |
+
action="store_false",
|
66 |
+
default=config.INLINE_LINKS,
|
67 |
+
help="use reference style links instead of inline links",
|
68 |
+
)
|
69 |
+
p.add_argument(
|
70 |
+
"--ignore-links",
|
71 |
+
dest="ignore_links",
|
72 |
+
action="store_true",
|
73 |
+
default=config.IGNORE_ANCHORS,
|
74 |
+
help="don't include any formatting for links",
|
75 |
+
)
|
76 |
+
p.add_argument(
|
77 |
+
"--ignore-mailto-links",
|
78 |
+
action="store_true",
|
79 |
+
dest="ignore_mailto_links",
|
80 |
+
default=config.IGNORE_MAILTO_LINKS,
|
81 |
+
help="don't include mailto: links",
|
82 |
+
)
|
83 |
+
p.add_argument(
|
84 |
+
"--protect-links",
|
85 |
+
dest="protect_links",
|
86 |
+
action="store_true",
|
87 |
+
default=config.PROTECT_LINKS,
|
88 |
+
help="protect links from line breaks surrounding them with angle brackets",
|
89 |
+
)
|
90 |
+
p.add_argument(
|
91 |
+
"--ignore-images",
|
92 |
+
dest="ignore_images",
|
93 |
+
action="store_true",
|
94 |
+
default=config.IGNORE_IMAGES,
|
95 |
+
help="don't include any formatting for images",
|
96 |
+
)
|
97 |
+
p.add_argument(
|
98 |
+
"--images-as-html",
|
99 |
+
dest="images_as_html",
|
100 |
+
action="store_true",
|
101 |
+
default=config.IMAGES_AS_HTML,
|
102 |
+
help=(
|
103 |
+
"Always write image tags as raw html; preserves `height`, `width` and "
|
104 |
+
"`alt` if possible."
|
105 |
+
),
|
106 |
+
)
|
107 |
+
p.add_argument(
|
108 |
+
"--images-to-alt",
|
109 |
+
dest="images_to_alt",
|
110 |
+
action="store_true",
|
111 |
+
default=config.IMAGES_TO_ALT,
|
112 |
+
help="Discard image data, only keep alt text",
|
113 |
+
)
|
114 |
+
p.add_argument(
|
115 |
+
"--images-with-size",
|
116 |
+
dest="images_with_size",
|
117 |
+
action="store_true",
|
118 |
+
default=config.IMAGES_WITH_SIZE,
|
119 |
+
help=(
|
120 |
+
"Write image tags with height and width attrs as raw html to retain "
|
121 |
+
"dimensions"
|
122 |
+
),
|
123 |
+
)
|
124 |
+
p.add_argument(
|
125 |
+
"-g",
|
126 |
+
"--google-doc",
|
127 |
+
action="store_true",
|
128 |
+
dest="google_doc",
|
129 |
+
default=False,
|
130 |
+
help="convert an html-exported Google Document",
|
131 |
+
)
|
132 |
+
p.add_argument(
|
133 |
+
"-d",
|
134 |
+
"--dash-unordered-list",
|
135 |
+
action="store_true",
|
136 |
+
dest="ul_style_dash",
|
137 |
+
default=False,
|
138 |
+
help="use a dash rather than a star for unordered list items",
|
139 |
+
)
|
140 |
+
p.add_argument(
|
141 |
+
"-e",
|
142 |
+
"--asterisk-emphasis",
|
143 |
+
action="store_true",
|
144 |
+
dest="em_style_asterisk",
|
145 |
+
default=False,
|
146 |
+
help="use an asterisk rather than an underscore for emphasized text",
|
147 |
+
)
|
148 |
+
p.add_argument(
|
149 |
+
"-b",
|
150 |
+
"--body-width",
|
151 |
+
dest="body_width",
|
152 |
+
type=int,
|
153 |
+
default=config.BODY_WIDTH,
|
154 |
+
help="number of characters per output line, 0 for no wrap",
|
155 |
+
)
|
156 |
+
p.add_argument(
|
157 |
+
"-i",
|
158 |
+
"--google-list-indent",
|
159 |
+
dest="list_indent",
|
160 |
+
type=int,
|
161 |
+
default=config.GOOGLE_LIST_INDENT,
|
162 |
+
help="number of pixels Google indents nested lists",
|
163 |
+
)
|
164 |
+
p.add_argument(
|
165 |
+
"-s",
|
166 |
+
"--hide-strikethrough",
|
167 |
+
action="store_true",
|
168 |
+
dest="hide_strikethrough",
|
169 |
+
default=False,
|
170 |
+
help="hide strike-through text. only relevant when -g is " "specified as well",
|
171 |
+
)
|
172 |
+
p.add_argument(
|
173 |
+
"--escape-all",
|
174 |
+
action="store_true",
|
175 |
+
dest="escape_snob",
|
176 |
+
default=False,
|
177 |
+
help=(
|
178 |
+
"Escape all special characters. Output is less readable, but avoids "
|
179 |
+
"corner case formatting issues."
|
180 |
+
),
|
181 |
+
)
|
182 |
+
p.add_argument(
|
183 |
+
"--bypass-tables",
|
184 |
+
action="store_true",
|
185 |
+
dest="bypass_tables",
|
186 |
+
default=config.BYPASS_TABLES,
|
187 |
+
help="Format tables in HTML rather than Markdown syntax.",
|
188 |
+
)
|
189 |
+
p.add_argument(
|
190 |
+
"--ignore-tables",
|
191 |
+
action="store_true",
|
192 |
+
dest="ignore_tables",
|
193 |
+
default=config.IGNORE_TABLES,
|
194 |
+
help="Ignore table-related tags (table, th, td, tr) " "while keeping rows.",
|
195 |
+
)
|
196 |
+
p.add_argument(
|
197 |
+
"--single-line-break",
|
198 |
+
action="store_true",
|
199 |
+
dest="single_line_break",
|
200 |
+
default=config.SINGLE_LINE_BREAK,
|
201 |
+
help=(
|
202 |
+
"Use a single line break after a block element rather than two line "
|
203 |
+
"breaks. NOTE: Requires --body-width=0"
|
204 |
+
),
|
205 |
+
)
|
206 |
+
p.add_argument(
|
207 |
+
"--unicode-snob",
|
208 |
+
action="store_true",
|
209 |
+
dest="unicode_snob",
|
210 |
+
default=config.UNICODE_SNOB,
|
211 |
+
help="Use unicode throughout document",
|
212 |
+
)
|
213 |
+
p.add_argument(
|
214 |
+
"--no-automatic-links",
|
215 |
+
action="store_false",
|
216 |
+
dest="use_automatic_links",
|
217 |
+
default=config.USE_AUTOMATIC_LINKS,
|
218 |
+
help="Do not use automatic links wherever applicable",
|
219 |
+
)
|
220 |
+
p.add_argument(
|
221 |
+
"--no-skip-internal-links",
|
222 |
+
action="store_false",
|
223 |
+
dest="skip_internal_links",
|
224 |
+
default=config.SKIP_INTERNAL_LINKS,
|
225 |
+
help="Do not skip internal links",
|
226 |
+
)
|
227 |
+
p.add_argument(
|
228 |
+
"--links-after-para",
|
229 |
+
action="store_true",
|
230 |
+
dest="links_each_paragraph",
|
231 |
+
default=config.LINKS_EACH_PARAGRAPH,
|
232 |
+
help="Put links after each paragraph instead of document",
|
233 |
+
)
|
234 |
+
p.add_argument(
|
235 |
+
"--mark-code",
|
236 |
+
action="store_true",
|
237 |
+
dest="mark_code",
|
238 |
+
default=config.MARK_CODE,
|
239 |
+
help="Mark program code blocks with [code]...[/code]",
|
240 |
+
)
|
241 |
+
p.add_argument(
|
242 |
+
"--decode-errors",
|
243 |
+
dest="decode_errors",
|
244 |
+
default=config.DECODE_ERRORS,
|
245 |
+
help=(
|
246 |
+
"What to do in case of decode errors.'ignore', 'strict' and 'replace' are "
|
247 |
+
"acceptable values"
|
248 |
+
),
|
249 |
+
)
|
250 |
+
p.add_argument(
|
251 |
+
"--open-quote",
|
252 |
+
dest="open_quote",
|
253 |
+
default=config.OPEN_QUOTE,
|
254 |
+
help="The character used to open quotes",
|
255 |
+
)
|
256 |
+
p.add_argument(
|
257 |
+
"--close-quote",
|
258 |
+
dest="close_quote",
|
259 |
+
default=config.CLOSE_QUOTE,
|
260 |
+
help="The character used to close quotes",
|
261 |
+
)
|
262 |
+
p.add_argument(
|
263 |
+
"--version", action="version", version=".".join(map(str, __version__))
|
264 |
+
)
|
265 |
+
p.add_argument("filename", nargs="?")
|
266 |
+
p.add_argument("encoding", nargs="?", default="utf-8")
|
267 |
+
p.add_argument(
|
268 |
+
"--include-sup-sub",
|
269 |
+
dest="include_sup_sub",
|
270 |
+
action="store_true",
|
271 |
+
default=config.INCLUDE_SUP_SUB,
|
272 |
+
help="Include the sup and sub tags",
|
273 |
+
)
|
274 |
+
args = p.parse_args()
|
275 |
+
|
276 |
+
if args.filename and args.filename != "-":
|
277 |
+
with open(args.filename, "rb") as fp:
|
278 |
+
data = fp.read()
|
279 |
+
else:
|
280 |
+
data = sys.stdin.buffer.read()
|
281 |
+
|
282 |
+
try:
|
283 |
+
html = data.decode(args.encoding, args.decode_errors)
|
284 |
+
except UnicodeDecodeError as err:
|
285 |
+
warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
|
286 |
+
warning += " Use the " + bcolors.OKGREEN
|
287 |
+
warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
|
288 |
+
print(warning)
|
289 |
+
raise err
|
290 |
+
|
291 |
+
h = HTML2Text(baseurl=baseurl)
|
292 |
+
# handle options
|
293 |
+
if args.ul_style_dash:
|
294 |
+
h.ul_item_mark = "-"
|
295 |
+
if args.em_style_asterisk:
|
296 |
+
h.emphasis_mark = "*"
|
297 |
+
h.strong_mark = "__"
|
298 |
+
|
299 |
+
h.body_width = args.body_width
|
300 |
+
h.google_list_indent = args.list_indent
|
301 |
+
h.ignore_emphasis = args.ignore_emphasis
|
302 |
+
h.ignore_links = args.ignore_links
|
303 |
+
h.ignore_mailto_links = args.ignore_mailto_links
|
304 |
+
h.protect_links = args.protect_links
|
305 |
+
h.ignore_images = args.ignore_images
|
306 |
+
h.images_as_html = args.images_as_html
|
307 |
+
h.images_to_alt = args.images_to_alt
|
308 |
+
h.images_with_size = args.images_with_size
|
309 |
+
h.google_doc = args.google_doc
|
310 |
+
h.hide_strikethrough = args.hide_strikethrough
|
311 |
+
h.escape_snob = args.escape_snob
|
312 |
+
h.bypass_tables = args.bypass_tables
|
313 |
+
h.ignore_tables = args.ignore_tables
|
314 |
+
h.single_line_break = args.single_line_break
|
315 |
+
h.inline_links = args.inline_links
|
316 |
+
h.unicode_snob = args.unicode_snob
|
317 |
+
h.use_automatic_links = args.use_automatic_links
|
318 |
+
h.skip_internal_links = args.skip_internal_links
|
319 |
+
h.links_each_paragraph = args.links_each_paragraph
|
320 |
+
h.mark_code = args.mark_code
|
321 |
+
h.wrap_links = args.wrap_links
|
322 |
+
h.wrap_list_items = args.wrap_list_items
|
323 |
+
h.wrap_tables = args.wrap_tables
|
324 |
+
h.pad_tables = args.pad_tables
|
325 |
+
h.default_image_alt = args.default_image_alt
|
326 |
+
h.open_quote = args.open_quote
|
327 |
+
h.close_quote = args.close_quote
|
328 |
+
h.include_sup_sub = args.include_sup_sub
|
329 |
+
|
330 |
+
sys.stdout.write(h.handle(html))
|
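Editor's note: for readers who want the same behavior without the CLI, here is a minimal sketch of the programmatic equivalent of the flags above. It assumes the vendored package re-exports HTML2Text with upstream html2text's constructor and attributes; it is not part of this commit.

    # Sketch only: assumes crawl4ai.html2text exposes HTML2Text like upstream html2text.
    from crawl4ai.html2text import HTML2Text

    h = HTML2Text(baseurl="https://example.com")
    h.body_width = 0              # same effect as --body-width=0 (no wrapping)
    h.mark_code = True            # same effect as --mark-code
    h.skip_internal_links = True  # the default; see --no-skip-internal-links above
    print(h.handle("<h1>Title</h1><p>See <a href='/docs'>the docs</a>.</p>"))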
crawl4ai/html2text/config.py
ADDED
@@ -0,0 +1,172 @@
import re

# Use Unicode characters instead of their ascii pseudo-replacements
UNICODE_SNOB = False

# Marker to use for marking tables for padding post processing
TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"
# Escape all special characters. Output is less readable, but avoids
# corner case formatting issues.
ESCAPE_SNOB = False
ESCAPE_BACKSLASH = False
ESCAPE_DOT = False
ESCAPE_PLUS = False
ESCAPE_DASH = False

# Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = False

# Wrap long lines at position. 0 for no wrapping.
BODY_WIDTH = 78

# Don't show internal links (href="#local-anchor") -- corresponding link
# targets won't be visible in the plain text file anyway.
SKIP_INTERNAL_LINKS = True

# Use inline, rather than reference, formatting for images and links
INLINE_LINKS = True

# Protect links from line breaks surrounding them with angle brackets (in
# addition to their square brackets)
PROTECT_LINKS = False
# WRAP_LINKS = True
WRAP_LINKS = True

# Wrap list items.
WRAP_LIST_ITEMS = False

# Wrap tables
WRAP_TABLES = False

# Number of pixels Google indents nested lists
GOOGLE_LIST_INDENT = 36

# Values Google and others may use to indicate bold text
BOLD_TEXT_STYLE_VALUES = ("bold", "700", "800", "900")

IGNORE_ANCHORS = False
IGNORE_MAILTO_LINKS = False
IGNORE_IMAGES = False
IMAGES_AS_HTML = False
IMAGES_TO_ALT = False
IMAGES_WITH_SIZE = False
IGNORE_EMPHASIS = False
MARK_CODE = False
DECODE_ERRORS = "strict"
DEFAULT_IMAGE_ALT = ""
PAD_TABLES = False

# Convert links with same href and text to <href> format
# if they are absolute links
USE_AUTOMATIC_LINKS = True

# For checking space-only lines on line 771
RE_SPACE = re.compile(r"\s\+")

RE_ORDERED_LIST_MATCHER = re.compile(r"\d+\.\s")
RE_UNORDERED_LIST_MATCHER = re.compile(r"[-\*\+]\s")
RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])")
RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")

# to find links in the text
RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")

# to find table separators
RE_TABLE = re.compile(r" \| ")

RE_MD_DOT_MATCHER = re.compile(
    r"""
    ^             # start of line
    (\s*\d+)      # optional whitespace and a number
    (\.)          # dot
    (?=\s)        # lookahead assert whitespace
    """,
    re.MULTILINE | re.VERBOSE,
)
RE_MD_PLUS_MATCHER = re.compile(
    r"""
    ^
    (\s*)
    (\+)
    (?=\s)
    """,
    flags=re.MULTILINE | re.VERBOSE,
)
RE_MD_DASH_MATCHER = re.compile(
    r"""
    ^
    (\s*)
    (-)
    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
                  # or another dash (header or hr)
    """,
    flags=re.MULTILINE | re.VERBOSE,
)
RE_SLASH_CHARS = r"\`*_{}[]()#+-.!"
RE_MD_BACKSLASH_MATCHER = re.compile(
    r"""
    (\\)          # match one slash
    (?=[%s])      # followed by a char that requires escaping
    """
    % re.escape(RE_SLASH_CHARS),
    flags=re.VERBOSE,
)

UNIFIABLE = {
    "rsquo": "'",
    "lsquo": "'",
    "rdquo": '"',
    "ldquo": '"',
    "copy": "(C)",
    "mdash": "--",
    "nbsp": " ",
    "rarr": "->",
    "larr": "<-",
    "middot": "*",
    "ndash": "-",
    "oelig": "oe",
    "aelig": "ae",
    "agrave": "a",
    "aacute": "a",
    "acirc": "a",
    "atilde": "a",
    "auml": "a",
    "aring": "a",
    "egrave": "e",
    "eacute": "e",
    "ecirc": "e",
    "euml": "e",
    "igrave": "i",
    "iacute": "i",
    "icirc": "i",
    "iuml": "i",
    "ograve": "o",
    "oacute": "o",
    "ocirc": "o",
    "otilde": "o",
    "ouml": "o",
    "ugrave": "u",
    "uacute": "u",
    "ucirc": "u",
    "uuml": "u",
    "lrm": "",
    "rlm": "",
}

# Format tables in HTML rather than Markdown syntax
BYPASS_TABLES = False
# Ignore table-related tags (table, th, td, tr) while keeping rows
IGNORE_TABLES = False


# Use a single line break after a block element rather than two line breaks.
# NOTE: Requires body width setting to be 0.
SINGLE_LINE_BREAK = False


# Use double quotation marks when converting the <q> tag.
OPEN_QUOTE = '"'
CLOSE_QUOTE = '"'

# Include the <sup> and <sub> tags
INCLUDE_SUP_SUB = False
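Editor's note: these module-level constants are the defaults that the CLI and the converter read at startup. A small illustrative sketch of a per-instance override versus the module-wide default, assuming the vendored copy keeps upstream html2text's constructor defaults (not part of this commit):

    # Illustrative only: per-instance setting vs. the module default above.
    from crawl4ai.html2text import HTML2Text, config

    h = HTML2Text()
    assert h.body_width == config.BODY_WIDTH   # instances start from BODY_WIDTH (78)
    h.unicode_snob = True                      # overrides UNICODE_SNOB for this instance only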
crawl4ai/html2text/elements.py
ADDED
@@ -0,0 +1,18 @@
from typing import Dict, Optional


class AnchorElement:
    __slots__ = ["attrs", "count", "outcount"]

    def __init__(self, attrs: Dict[str, Optional[str]], count: int, outcount: int):
        self.attrs = attrs
        self.count = count
        self.outcount = outcount


class ListElement:
    __slots__ = ["name", "num"]

    def __init__(self, name: str, num: int):
        self.name = name
        self.num = num
crawl4ai/html2text/utils.py
ADDED
@@ -0,0 +1,303 @@
import html.entities
from typing import Dict, List, Optional

from . import config

unifiable_n = {
    html.entities.name2codepoint[k]: v
    for k, v in config.UNIFIABLE.items()
    if k != "nbsp"
}


def hn(tag: str) -> int:
    if tag[0] == "h" and len(tag) == 2:
        n = tag[1]
        if "0" < n <= "9":
            return int(n)
    return 0


def dumb_property_dict(style: str) -> Dict[str, str]:
    """
    :returns: A hash of css attributes
    """
    return {
        x.strip().lower(): y.strip().lower()
        for x, y in [z.split(":", 1) for z in style.split(";") if ":" in z]
    }


def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
    """
    :type data: str

    :returns: A hash of css selectors, each of which contains a hash of
    css attributes.
    :rtype: dict
    """
    # remove @import sentences
    data += ";"
    importIndex = data.find("@import")
    while importIndex != -1:
        data = data[0:importIndex] + data[data.find(";", importIndex) + 1 :]
        importIndex = data.find("@import")

    # parse the css. reverted from dictionary comprehension in order to
    # support older pythons
    pairs = [x.split("{") for x in data.split("}") if "{" in x.strip()]
    try:
        elements = {a.strip(): dumb_property_dict(b) for a, b in pairs}
    except ValueError:
        elements = {}  # not that important

    return elements


def element_style(
    attrs: Dict[str, Optional[str]],
    style_def: Dict[str, Dict[str, str]],
    parent_style: Dict[str, str],
) -> Dict[str, str]:
    """
    :type attrs: dict
    :type style_def: dict
    :type style_def: dict

    :returns: A hash of the 'final' style attributes of the element
    :rtype: dict
    """
    style = parent_style.copy()
    if "class" in attrs:
        assert attrs["class"] is not None
        for css_class in attrs["class"].split():
            css_style = style_def.get("." + css_class, {})
            style.update(css_style)
    if "style" in attrs:
        assert attrs["style"] is not None
        immediate_style = dumb_property_dict(attrs["style"])
        style.update(immediate_style)

    return style


def google_list_style(style: Dict[str, str]) -> str:
    """
    Finds out whether this is an ordered or unordered list

    :type style: dict

    :rtype: str
    """
    if "list-style-type" in style:
        list_style = style["list-style-type"]
        if list_style in ["disc", "circle", "square", "none"]:
            return "ul"

    return "ol"


def google_has_height(style: Dict[str, str]) -> bool:
    """
    Check if the style of the element has the 'height' attribute
    explicitly defined

    :type style: dict

    :rtype: bool
    """
    return "height" in style


def google_text_emphasis(style: Dict[str, str]) -> List[str]:
    """
    :type style: dict

    :returns: A list of all emphasis modifiers of the element
    :rtype: list
    """
    emphasis = []
    if "text-decoration" in style:
        emphasis.append(style["text-decoration"])
    if "font-style" in style:
        emphasis.append(style["font-style"])
    if "font-weight" in style:
        emphasis.append(style["font-weight"])

    return emphasis


def google_fixed_width_font(style: Dict[str, str]) -> bool:
    """
    Check if the css of the current element defines a fixed width font

    :type style: dict

    :rtype: bool
    """
    font_family = ""
    if "font-family" in style:
        font_family = style["font-family"]
    return "courier new" == font_family or "consolas" == font_family


def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
    """
    Extract numbering from list element attributes

    :type attrs: dict

    :rtype: int or None
    """
    if "start" in attrs:
        assert attrs["start"] is not None
        try:
            return int(attrs["start"]) - 1
        except ValueError:
            pass

    return 0


def skipwrap(
    para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
) -> bool:
    # If it appears to contain a link
    # don't wrap
    if not wrap_links and config.RE_LINK.search(para):
        return True
    # If the text begins with four spaces or one tab, it's a code block;
    # don't wrap
    if para[0:4] == "    " or para[0] == "\t":
        return True

    # If the text begins with only two "--", possibly preceded by
    # whitespace, that's an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False

    # I'm not sure what this is for; I thought it was to detect lists,
    # but there's a <br>-inside-<span> case in one of the tests that
    # also depends upon it.
    if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**":
        return not wrap_list_items

    # If text contains a pipe character it is likely a table
    if not wrap_tables and config.RE_TABLE.search(para):
        return True

    # If the text begins with a single -, *, or +, followed by a space,
    # or an integer, followed by a ., followed by a space (in either
    # case optionally proceeded by whitespace), it's a list; don't wrap.
    return bool(
        config.RE_ORDERED_LIST_MATCHER.match(stripped)
        or config.RE_UNORDERED_LIST_MATCHER.match(stripped)
    )


def escape_md(text: str) -> str:
    """
    Escapes markdown-sensitive characters within other markdown
    constructs.
    """
    return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)


def escape_md_section(
    text: str,
    escape_backslash: bool = True,
    snob: bool = False,
    escape_dot: bool = True,
    escape_plus: bool = True,
    escape_dash: bool = True
) -> str:
    """
    Escapes markdown-sensitive characters across whole document sections.
    Each escaping operation can be controlled individually.
    """
    if escape_backslash:
        text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)

    if snob:
        text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text)

    if escape_dot:
        text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text)

    if escape_plus:
        text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text)

    if escape_dash:
        text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)

    return text


def reformat_table(lines: List[str], right_margin: int) -> List[str]:
    """
    Given the lines of a table
    padds the cells and returns the new lines
    """
    # find the maximum width of the columns
    max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")]
    max_cols = len(max_width)
    for line in lines:
        cols = [x.rstrip() for x in line.split("|")]
        num_cols = len(cols)

        # don't drop any data if colspan attributes result in unequal lengths
        if num_cols < max_cols:
            cols += [""] * (max_cols - num_cols)
        elif max_cols < num_cols:
            max_width += [len(x) + right_margin for x in cols[-(num_cols - max_cols) :]]
            max_cols = num_cols

        max_width = [
            max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)
        ]

    # reformat
    new_lines = []
    for line in lines:
        cols = [x.rstrip() for x in line.split("|")]
        if set(line.strip()) == set("-|"):
            filler = "-"
            new_cols = [
                x.rstrip() + (filler * (M - len(x.rstrip())))
                for x, M in zip(cols, max_width)
            ]
            new_lines.append("|-" + "|".join(new_cols) + "|")
        else:
            filler = " "
            new_cols = [
                x.rstrip() + (filler * (M - len(x.rstrip())))
                for x, M in zip(cols, max_width)
            ]
            new_lines.append("| " + "|".join(new_cols) + "|")
    return new_lines


def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
    """
    Provide padding for tables in the text
    """
    lines = text.split("\n")
    table_buffer = []  # type: List[str]
    table_started = False
    new_lines = []
    for line in lines:
        # Toggle table started
        if config.TABLE_MARKER_FOR_PAD in line:
            table_started = not table_started
            if not table_started:
                table = reformat_table(table_buffer, right_margin)
                new_lines.extend(table)
                table_buffer = []
                new_lines.append("")
            continue
        # Process lines
        if table_started:
            table_buffer.append(line)
        else:
            new_lines.append(line)
    return "\n".join(new_lines)
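Editor's note: a quick illustration of two helpers above, assuming the package layout introduced in this commit (the inputs are hypothetical; not part of the committed file):

    # Hypothetical usage of hn() and escape_md() from the module above.
    from crawl4ai.html2text.utils import hn, escape_md

    print(hn("h2"))                              # 2 (heading level); 0 for non-heading tags
    print(escape_md("a [link] (with parens)"))   # a \[link\] \(with parens\)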
crawl4ai/install.py
ADDED
@@ -0,0 +1,83 @@
import subprocess
import sys
import asyncio
from .async_logger import AsyncLogger, LogLevel

# Initialize logger
logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)

def post_install():
    """Run all post-installation tasks"""
    logger.info("Running post-installation setup...", tag="INIT")
    install_playwright()
    run_migration()
    logger.success("Post-installation setup completed!", tag="COMPLETE")

def install_playwright():
    logger.info("Installing Playwright browsers...", tag="INIT")
    try:
        # subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chrome"])
        subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chromium"])
        logger.success("Playwright installation completed successfully.", tag="COMPLETE")
    except subprocess.CalledProcessError as e:
        # logger.error(f"Error during Playwright installation: {e}", tag="ERROR")
        logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.")
    except Exception as e:
        # logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR")
        logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.")

def run_migration():
    """Initialize database during installation"""
    try:
        logger.info("Starting database initialization...", tag="INIT")
        from crawl4ai.async_database import async_db_manager

        asyncio.run(async_db_manager.initialize())
        logger.success("Database initialization completed successfully.", tag="COMPLETE")
    except ImportError:
        logger.warning("Database module not found. Will initialize on first use.")
    except Exception as e:
        logger.warning(f"Database initialization failed: {e}")
        logger.warning("Database will be initialized on first use")

async def run_doctor():
    """Test if Crawl4AI is working properly"""
    logger.info("Running Crawl4AI health check...", tag="INIT")
    try:
        from .async_webcrawler import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

        browser_config = BrowserConfig(
            headless=True,
            browser_type="chromium",
            ignore_https_errors=True,
            light_mode=True,
            viewport_width=1280,
            viewport_height=720
        )

        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            screenshot=True,
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
            logger.info("Testing crawling capabilities...", tag="TEST")
            result = await crawler.arun(
                url="https://crawl4ai.com",
                config=run_config
            )

            if result and result.markdown:
                logger.success("✅ Crawling test passed!", tag="COMPLETE")
                return True
            else:
                raise Exception("Failed to get content")

    except Exception as e:
        logger.error(f"❌ Test failed: {e}", tag="ERROR")
        return False

def doctor():
    """Entry point for the doctor command"""
    import asyncio
    return asyncio.run(run_doctor())
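Editor's note: a minimal sketch of invoking these hooks directly from Python; whether the packaging also exposes them as console entry points is an assumption not shown in this file.

    # Sketch: calling the post-install and health-check helpers defined above.
    from crawl4ai.install import post_install, doctor

    post_install()   # installs Playwright's chromium and initializes the database
    ok = doctor()    # True if a test crawl of https://crawl4ai.com returns markdown
    print("healthy" if ok else "check your setup")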
crawl4ai/js_snippet/__init__.py
ADDED
@@ -0,0 +1,15 @@
import os, sys

# Load a JS snippet by name from the folder containing this module and return its content as a string.
def load_js_script(script_name):
    # Get the path of the current script
    current_script_path = os.path.dirname(os.path.realpath(__file__))
    # Get the path of the script to load
    script_path = os.path.join(current_script_path, script_name + '.js')
    # Check if the script exists
    if not os.path.exists(script_path):
        raise ValueError(f"Script {script_name} not found in the folder {current_script_path}")
    # Load the content of the script
    with open(script_path, 'r') as f:
        script_content = f.read()
    return script_content
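Editor's note: for example, the snippets added below can be loaded by file name (without the .js extension):

    # Loads crawl4ai/js_snippet/remove_overlay_elements.js as a string.
    from crawl4ai.js_snippet import load_js_script

    script = load_js_script("remove_overlay_elements")
    print(script[:60])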
crawl4ai/js_snippet/navigator_overrider.js
ADDED
@@ -0,0 +1,25 @@
// Pass the Permissions Test.
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.query = (parameters) =>
  parameters.name === "notifications"
    ? Promise.resolve({ state: Notification.permission })
    : originalQuery(parameters);
Object.defineProperty(navigator, "webdriver", {
  get: () => undefined,
});
window.navigator.chrome = {
  runtime: {},
  // Add other properties if necessary
};
Object.defineProperty(navigator, "plugins", {
  get: () => [1, 2, 3, 4, 5],
});
Object.defineProperty(navigator, "languages", {
  get: () => ["en-US", "en"],
});
Object.defineProperty(document, "hidden", {
  get: () => false,
});
Object.defineProperty(document, "visibilityState", {
  get: () => "visible",
});
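Editor's note: one plausible way this snippet is consumed is as a Playwright init script, so the overrides run before any page script. This is a sketch under that assumption, not necessarily how the crawler strategy in this commit wires it in.

    # Sketch: injecting the navigator override before page scripts run (Playwright sync API).
    from playwright.sync_api import sync_playwright
    from crawl4ai.js_snippet import load_js_script

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.add_init_script(load_js_script("navigator_overrider"))
        page.goto("https://example.com")
        print(page.evaluate("navigator.webdriver"))  # None, thanks to the override
        browser.close()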
crawl4ai/js_snippet/remove_overlay_elements.js
ADDED
@@ -0,0 +1,119 @@
async () => {
  // Function to check if element is visible
  const isVisible = (elem) => {
    const style = window.getComputedStyle(elem);
    return style.display !== "none" && style.visibility !== "hidden" && style.opacity !== "0";
  };

  // Common selectors for popups and overlays
  const commonSelectors = [
    // Close buttons first
    'button[class*="close" i]',
    'button[class*="dismiss" i]',
    'button[aria-label*="close" i]',
    'button[title*="close" i]',
    'a[class*="close" i]',
    'span[class*="close" i]',

    // Cookie notices
    '[class*="cookie-banner" i]',
    '[id*="cookie-banner" i]',
    '[class*="cookie-consent" i]',
    '[id*="cookie-consent" i]',

    // Newsletter/subscription dialogs
    '[class*="newsletter" i]',
    '[class*="subscribe" i]',

    // Generic popups/modals
    '[class*="popup" i]',
    '[class*="modal" i]',
    '[class*="overlay" i]',
    '[class*="dialog" i]',
    '[role="dialog"]',
    '[role="alertdialog"]',
  ];

  // Try to click close buttons first
  for (const selector of commonSelectors.slice(0, 6)) {
    const closeButtons = document.querySelectorAll(selector);
    for (const button of closeButtons) {
      if (isVisible(button)) {
        try {
          button.click();
          await new Promise((resolve) => setTimeout(resolve, 100));
        } catch (e) {
          console.log("Error clicking button:", e);
        }
      }
    }
  }

  // Remove remaining overlay elements
  const removeOverlays = () => {
    // Find elements with high z-index
    const allElements = document.querySelectorAll("*");
    for (const elem of allElements) {
      const style = window.getComputedStyle(elem);
      const zIndex = parseInt(style.zIndex);
      const position = style.position;

      if (
        isVisible(elem) &&
        (zIndex > 999 || position === "fixed" || position === "absolute") &&
        (elem.offsetWidth > window.innerWidth * 0.5 ||
          elem.offsetHeight > window.innerHeight * 0.5 ||
          style.backgroundColor.includes("rgba") ||
          parseFloat(style.opacity) < 1)
      ) {
        elem.remove();
      }
    }

    // Remove elements matching common selectors
    for (const selector of commonSelectors) {
      const elements = document.querySelectorAll(selector);
      elements.forEach((elem) => {
        if (isVisible(elem)) {
          elem.remove();
        }
      });
    }
  };

  // Remove overlay elements
  removeOverlays();

  // Remove any fixed/sticky position elements at the top/bottom
  const removeFixedElements = () => {
    const elements = document.querySelectorAll("*");
    elements.forEach((elem) => {
      const style = window.getComputedStyle(elem);
      if ((style.position === "fixed" || style.position === "sticky") && isVisible(elem)) {
        elem.remove();
      }
    });
  };

  removeFixedElements();

  // Remove empty block elements as: div, p, span, etc.
  const removeEmptyBlockElements = () => {
    const blockElements = document.querySelectorAll(
      "div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6"
    );
    blockElements.forEach((elem) => {
      if (elem.innerText.trim() === "") {
        elem.remove();
      }
    });
  };

  // Remove margin-right and padding-right from body (often added by modal scripts)
  document.body.style.marginRight = "0px";
  document.body.style.paddingRight = "0px";
  document.body.style.overflow = "auto";

  // Wait a bit for any animations to complete
  await new Promise((resolve) => setTimeout(resolve, 100));
};
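Editor's note: the snippet above is an async arrow function, so it can be run on a live page via evaluate(). A sketch under that assumption (the target URL is illustrative):

    # Sketch: running the overlay-removal snippet on a page (Playwright async API).
    import asyncio
    from playwright.async_api import async_playwright
    from crawl4ai.js_snippet import load_js_script

    async def main():
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            await page.goto("https://example.com")
            # evaluate() detects the function expression and invokes it.
            await page.evaluate(load_js_script("remove_overlay_elements"))
            await browser.close()

    asyncio.run(main())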
crawl4ai/js_snippet/update_image_dimensions.js
ADDED
@@ -0,0 +1,54 @@
() => {
  return new Promise((resolve) => {
    const filterImage = (img) => {
      // Filter out images that are too small
      if (img.width < 100 && img.height < 100) return false;

      // Filter out images that are not visible
      const rect = img.getBoundingClientRect();
      if (rect.width === 0 || rect.height === 0) return false;

      // Filter out images with certain class names (e.g., icons, thumbnails)
      if (img.classList.contains("icon") || img.classList.contains("thumbnail")) return false;

      // Filter out images with certain patterns in their src (e.g., placeholder images)
      if (img.src.includes("placeholder") || img.src.includes("icon")) return false;

      return true;
    };

    const images = Array.from(document.querySelectorAll("img")).filter(filterImage);
    let imagesLeft = images.length;

    if (imagesLeft === 0) {
      resolve();
      return;
    }

    const checkImage = (img) => {
      if (img.complete && img.naturalWidth !== 0) {
        img.setAttribute("width", img.naturalWidth);
        img.setAttribute("height", img.naturalHeight);
        imagesLeft--;
        if (imagesLeft === 0) resolve();
      }
    };

    images.forEach((img) => {
      checkImage(img);
      if (!img.complete) {
        img.onload = () => {
          checkImage(img);
        };
        img.onerror = () => {
          imagesLeft--;
          if (imagesLeft === 0) resolve();
        };
      }
    });

    // Fallback timeout of 5 seconds
    // setTimeout(() => resolve(), 5000);
    resolve();
  });
};
crawl4ai/llmtxt.py
ADDED
@@ -0,0 +1,498 @@
import os
from pathlib import Path
import re
from typing import Dict, List, Tuple, Optional, Any
import json
from tqdm import tqdm
import time
import psutil
import numpy as np
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from litellm import completion, batch_completion
from .async_logger import AsyncLogger
import litellm
import pickle
import hashlib  # <--- ADDED for file-hash
from fnmatch import fnmatch
import glob

litellm.set_verbose = False

def _compute_file_hash(file_path: Path) -> str:
    """Compute MD5 hash for the file's entire content."""
    hash_md5 = hashlib.md5()
    with file_path.open("rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

class AsyncLLMTextManager:
    def __init__(
        self,
        docs_dir: Path,
        logger: Optional[AsyncLogger] = None,
        max_concurrent_calls: int = 5,
        batch_size: int = 3
    ) -> None:
        self.docs_dir = docs_dir
        self.logger = logger
        self.max_concurrent_calls = max_concurrent_calls
        self.batch_size = batch_size
        self.bm25_index = None
        self.document_map: Dict[str, Any] = {}
        self.tokenized_facts: List[str] = []
        self.bm25_index_file = self.docs_dir / "bm25_index.pkl"

    async def _process_document_batch(self, doc_batch: List[Path]) -> None:
        """Process a batch of documents in parallel"""
        contents = []
        for file_path in doc_batch:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    contents.append(f.read())
            except Exception as e:
                self.logger.error(f"Error reading {file_path}: {str(e)}")
                contents.append("")  # Add empty content to maintain batch alignment

        prompt = """Given a documentation file, generate a list of atomic facts where each fact:
1. Represents a single piece of knowledge
2. Contains variations in terminology for the same concept
3. References relevant code patterns if they exist
4. Is written in a way that would match natural language queries

Each fact should follow this format:
<main_concept>: <fact_statement> | <related_terms> | <code_reference>

Example Facts:
browser_config: Configure headless mode and browser type for AsyncWebCrawler | headless, browser_type, chromium, firefox | BrowserConfig(browser_type="chromium", headless=True)
redis_connection: Redis client connection requires host and port configuration | redis setup, redis client, connection params | Redis(host='localhost', port=6379, db=0)
pandas_filtering: Filter DataFrame rows using boolean conditions | dataframe filter, query, boolean indexing | df[df['column'] > 5]

Wrap your response in <index>...</index> tags.
"""

        # Prepare messages for batch processing
        messages_list = [
            [
                {"role": "user", "content": f"{prompt}\n\nGenerate index for this documentation:\n\n{content}"}
            ]
            for content in contents if content
        ]

        try:
            responses = batch_completion(
                model="anthropic/claude-3-5-sonnet-latest",
                messages=messages_list,
                logger_fn=None
            )

            # Process responses and save index files
            for response, file_path in zip(responses, doc_batch):
                try:
                    index_content_match = re.search(
                        r'<index>(.*?)</index>',
                        response.choices[0].message.content,
                        re.DOTALL
                    )
                    if not index_content_match:
                        self.logger.warning(f"No <index>...</index> content found for {file_path}")
                        continue

                    index_content = re.sub(
                        r"\n\s*\n", "\n", index_content_match.group(1)
                    ).strip()
                    if index_content:
                        index_file = file_path.with_suffix('.q.md')
                        with open(index_file, 'w', encoding='utf-8') as f:
                            f.write(index_content)
                        self.logger.info(f"Created index file: {index_file}")
                    else:
                        self.logger.warning(f"No index content found in response for {file_path}")

                except Exception as e:
                    self.logger.error(f"Error processing response for {file_path}: {str(e)}")

        except Exception as e:
            self.logger.error(f"Error in batch completion: {str(e)}")

    def _validate_fact_line(self, line: str) -> Tuple[bool, Optional[str]]:
        if "|" not in line:
            return False, "Missing separator '|'"

        parts = [p.strip() for p in line.split("|")]
        if len(parts) != 3:
            return False, f"Expected 3 parts, got {len(parts)}"

        concept_part = parts[0]
        if ":" not in concept_part:
            return False, "Missing ':' in concept definition"

        return True, None

    def _load_or_create_token_cache(self, fact_file: Path) -> Dict:
        """
        Load token cache from .q.tokens if present and matching file hash.
        Otherwise return a new structure with updated file-hash.
        """
        cache_file = fact_file.with_suffix(".q.tokens")
        current_hash = _compute_file_hash(fact_file)

        if cache_file.exists():
            try:
                with open(cache_file, "r") as f:
                    cache = json.load(f)
                # If the hash matches, return it directly
                if cache.get("content_hash") == current_hash:
                    return cache
                # Otherwise, we signal that it's changed
                self.logger.info(f"Hash changed for {fact_file}, reindex needed.")
            except json.JSONDecodeError:
                self.logger.warning(f"Corrupt token cache for {fact_file}, rebuilding.")
            except Exception as e:
                self.logger.warning(f"Error reading cache for {fact_file}: {str(e)}")

        # Return a fresh cache
        return {"facts": {}, "content_hash": current_hash}

    def _save_token_cache(self, fact_file: Path, cache: Dict) -> None:
        cache_file = fact_file.with_suffix(".q.tokens")
        # Always ensure we're saving the correct file-hash
        cache["content_hash"] = _compute_file_hash(fact_file)
        with open(cache_file, "w") as f:
            json.dump(cache, f)

    def preprocess_text(self, text: str) -> List[str]:
        parts = [x.strip() for x in text.split("|")] if "|" in text else [text]
        # Remove : after the first word of parts[0]
        parts[0] = re.sub(r"^(.*?):", r"\1", parts[0])

        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words("english")) - {
            "how", "what", "when", "where", "why", "which",
        }

        tokens = []
        for part in parts:
            if "(" in part and ")" in part:
                code_tokens = re.findall(
                    r'[\w_]+(?=\()|[\w_]+(?==[\'"]{1}[\w_]+[\'"]{1})', part
                )
                tokens.extend(code_tokens)

            words = word_tokenize(part.lower())
            tokens.extend(
                [
                    lemmatizer.lemmatize(token)
                    for token in words
                    if token not in stop_words
                ]
            )

        return tokens

    def maybe_load_bm25_index(self, clear_cache=False) -> bool:
        """
        Load existing BM25 index from disk, if present and clear_cache=False.
        """
        if not clear_cache and os.path.exists(self.bm25_index_file):
            self.logger.info("Loading existing BM25 index from disk.")
            with open(self.bm25_index_file, "rb") as f:
                data = pickle.load(f)
                self.tokenized_facts = data["tokenized_facts"]
                self.bm25_index = data["bm25_index"]
            return True
        return False

    def build_search_index(self, clear_cache=False) -> None:
        """
        Checks for new or modified .q.md files by comparing file-hash.
        If none need reindexing and clear_cache is False, loads existing index if available.
        Otherwise, reindexes only changed/new files and merges or creates a new index.
        """
        # If clear_cache is True, we skip partial logic: rebuild everything from scratch
        if clear_cache:
            self.logger.info("Clearing cache and rebuilding full search index.")
            if self.bm25_index_file.exists():
                self.bm25_index_file.unlink()

        process = psutil.Process()
        self.logger.info("Checking which .q.md files need (re)indexing...")

        # Gather all .q.md files
        q_files = [self.docs_dir / f for f in os.listdir(self.docs_dir) if f.endswith(".q.md")]

        # We'll store known (unchanged) facts in these lists
        existing_facts: List[str] = []
        existing_tokens: List[List[str]] = []

        # Keep track of invalid lines for logging
        invalid_lines = []
        needSet = []  # files that must be (re)indexed

        for qf in q_files:
            token_cache_file = qf.with_suffix(".q.tokens")

            # If no .q.tokens or clear_cache is True → definitely reindex
            if clear_cache or not token_cache_file.exists():
                needSet.append(qf)
                continue

            # Otherwise, load the existing cache and compare hash
            cache = self._load_or_create_token_cache(qf)
            # If the .q.tokens was out of date (i.e. changed hash), we reindex
            if len(cache["facts"]) == 0 or cache.get("content_hash") != _compute_file_hash(qf):
                needSet.append(qf)
            else:
                # File is unchanged → retrieve cached token data
                for line, cache_data in cache["facts"].items():
                    existing_facts.append(line)
                    existing_tokens.append(cache_data["tokens"])
                    self.document_map[line] = qf  # track the doc for that fact

        if not needSet and not clear_cache:
            # If no file needs reindexing, try loading existing index
            if self.maybe_load_bm25_index(clear_cache=False):
                self.logger.info("No new/changed .q.md files found. Using existing BM25 index.")
                return
            else:
                # If there's no existing index, we must build a fresh index from the old caches
                self.logger.info("No existing BM25 index found. Building from cached facts.")
                if existing_facts:
                    self.logger.info(f"Building BM25 index with {len(existing_facts)} cached facts.")
                    self.bm25_index = BM25Okapi(existing_tokens)
                    self.tokenized_facts = existing_facts
                    with open(self.bm25_index_file, "wb") as f:
                        pickle.dump({
                            "bm25_index": self.bm25_index,
                            "tokenized_facts": self.tokenized_facts
                        }, f)
                else:
                    self.logger.warning("No facts found at all. Index remains empty.")
                return

        # ----------------------------------------------------- /Users/unclecode/.crawl4ai/docs/14_proxy_security.q.q.tokens '/Users/unclecode/.crawl4ai/docs/14_proxy_security.q.md'
        # If we reach here, we have new or changed .q.md files
        # We'll parse them, reindex them, and then combine with existing_facts
        # -----------------------------------------------------

        self.logger.info(f"{len(needSet)} file(s) need reindexing. Parsing now...")

        # 1) Parse the new or changed .q.md files
        new_facts = []
        new_tokens = []
        with tqdm(total=len(needSet), desc="Indexing changed files") as file_pbar:
            for file in needSet:
                # We'll build up a fresh cache
                fresh_cache = {"facts": {}, "content_hash": _compute_file_hash(file)}
                try:
                    with open(file, "r", encoding="utf-8") as f_obj:
                        content = f_obj.read().strip()
                        lines = [l.strip() for l in content.split("\n") if l.strip()]

                    for line in lines:
                        is_valid, error = self._validate_fact_line(line)
                        if not is_valid:
                            invalid_lines.append((file, line, error))
                            continue

                        tokens = self.preprocess_text(line)
                        fresh_cache["facts"][line] = {
                            "tokens": tokens,
                            "added": time.time(),
                        }
                        new_facts.append(line)
                        new_tokens.append(tokens)
                        self.document_map[line] = file

                    # Save the new .q.tokens with updated hash
                    self._save_token_cache(file, fresh_cache)

                    mem_usage = process.memory_info().rss / 1024 / 1024
                    self.logger.debug(f"Memory usage after {file.name}: {mem_usage:.2f}MB")

                except Exception as e:
                    self.logger.error(f"Error processing {file}: {str(e)}")

                file_pbar.update(1)

        if invalid_lines:
            self.logger.warning(f"Found {len(invalid_lines)} invalid fact lines:")
            for file, line, error in invalid_lines:
                self.logger.warning(f"{file}: {error} in line: {line[:50]}...")

        # 2) Merge newly tokenized facts with the existing ones
        all_facts = existing_facts + new_facts
        all_tokens = existing_tokens + new_tokens

        # 3) Build BM25 index from combined facts
        self.logger.info(f"Building BM25 index with {len(all_facts)} total facts (old + new).")
        self.bm25_index = BM25Okapi(all_tokens)
        self.tokenized_facts = all_facts

        # 4) Save the updated BM25 index to disk
        with open(self.bm25_index_file, "wb") as f:
            pickle.dump({
                "bm25_index": self.bm25_index,
                "tokenized_facts": self.tokenized_facts
            }, f)

        final_mem = process.memory_info().rss / 1024 / 1024
        self.logger.info(f"Search index updated. Final memory usage: {final_mem:.2f}MB")

    async def generate_index_files(self, force_generate_facts: bool = False, clear_bm25_cache: bool = False) -> None:
        """
        Generate index files for all documents in parallel batches

        Args:
            force_generate_facts (bool): If True, regenerate indexes even if they exist
            clear_bm25_cache (bool): If True, clear existing BM25 index cache
        """
        self.logger.info("Starting index generation for documentation files.")

        md_files = [
            self.docs_dir / f for f in os.listdir(self.docs_dir)
            if f.endswith('.md') and not any(f.endswith(x) for x in ['.q.md', '.xs.md'])
        ]

        # Filter out files that already have .q files unless force=True
        if not force_generate_facts:
            md_files = [
                f for f in md_files
                if not (self.docs_dir / f.name.replace('.md', '.q.md')).exists()
            ]

        if not md_files:
            self.logger.info("All index files exist. Use force=True to regenerate.")
        else:
            # Process documents in batches
            for i in range(0, len(md_files), self.batch_size):
                batch = md_files[i:i + self.batch_size]
                self.logger.info(f"Processing batch {i//self.batch_size + 1}/{(len(md_files)//self.batch_size) + 1}")
                await self._process_document_batch(batch)

        self.logger.info("Index generation complete, building/updating search index.")
        self.build_search_index(clear_cache=clear_bm25_cache)

    def generate(self, sections: List[str], mode: str = "extended") -> str:
        # Get all markdown files
        all_files = glob.glob(str(self.docs_dir / "[0-9]*.md")) + \
                    glob.glob(str(self.docs_dir / "[0-9]*.xs.md"))

        # Extract base names without extensions
        base_docs = {Path(f).name.split('.')[0] for f in all_files
                     if not Path(f).name.endswith('.q.md')}

        # Filter by sections if provided
        if sections:
            base_docs = {doc for doc in base_docs
                         if any(section.lower() in doc.lower() for section in sections)}

        # Get file paths based on mode
        files = []
        for doc in sorted(base_docs, key=lambda x: int(x.split('_')[0]) if x.split('_')[0].isdigit() else 999999):
            if mode == "condensed":
                xs_file = self.docs_dir / f"{doc}.xs.md"
                regular_file = self.docs_dir / f"{doc}.md"
                files.append(str(xs_file if xs_file.exists() else regular_file))
            else:
                files.append(str(self.docs_dir / f"{doc}.md"))

        # Read and format content
        content = []
        for file in files:
            try:
                with open(file, 'r', encoding='utf-8') as f:
                    fname = Path(file).name
                    content.append(f"{'#'*20}\n# {fname}\n{'#'*20}\n\n{f.read()}")
            except Exception as e:
                self.logger.error(f"Error reading {file}: {str(e)}")

        return "\n\n---\n\n".join(content) if content else ""

    def search(self, query: str, top_k: int = 5) -> str:
        if not self.bm25_index:
            return "No search index available. Call build_search_index() first."

        query_tokens = self.preprocess_text(query)
        doc_scores = self.bm25_index.get_scores(query_tokens)

        mean_score = np.mean(doc_scores)
        std_score = np.std(doc_scores)
        score_threshold = mean_score + (0.25 * std_score)

        file_data = self._aggregate_search_scores(
            doc_scores=doc_scores,
            score_threshold=score_threshold,
            query_tokens=query_tokens,
        )

        ranked_files = sorted(
            file_data.items(),
            key=lambda x: (
                x[1]["code_match_score"] * 2.0
                + x[1]["match_count"] * 1.5
                + x[1]["total_score"]
            ),
            reverse=True,
        )[:top_k]

        results = []
        for file, _ in ranked_files:
            main_doc = str(file).replace(".q.md", ".md")
            if os.path.exists(self.docs_dir / main_doc):
                with open(self.docs_dir / main_doc, "r", encoding='utf-8') as f:
                    only_file_name = main_doc.split("/")[-1]
                    content = [
                        "#" * 20,
                        f"# {only_file_name}",
                        "#" * 20,
                        "",
                        f.read()
                    ]
                    results.append("\n".join(content))

        return "\n\n---\n\n".join(results)

    def _aggregate_search_scores(
        self, doc_scores: List[float], score_threshold: float, query_tokens: List[str]
    ) -> Dict:
        file_data = {}

        for idx, score in enumerate(doc_scores):
            if score <= score_threshold:
                continue

            fact = self.tokenized_facts[idx]
            file_path = self.document_map[fact]

            if file_path not in file_data:
                file_data[file_path] = {
                    "total_score": 0,
                    "match_count": 0,
                    "code_match_score": 0,
                    "matched_facts": [],
                }

            components = fact.split("|") if "|" in fact else [fact]

            code_match_score = 0
            if len(components) == 3:
                code_ref = components[2].strip()
                code_tokens = self.preprocess_text(code_ref)
                code_match_score = len(set(query_tokens) & set(code_tokens)) / len(query_tokens)

            file_data[file_path]["total_score"] += score
            file_data[file_path]["match_count"] += 1
            file_data[file_path]["code_match_score"] = max(
                file_data[file_path]["code_match_score"], code_match_score
            )
            file_data[file_path]["matched_facts"].append(fact)

        return file_data

    def refresh_index(self) -> None:
        """Convenience method for a full rebuild."""
        self.build_search_index(clear_cache=True)
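Editor's note: a rough usage sketch for the manager above. It assumes NLTK data is installed, an ANTHROPIC_API_KEY is set for the batch fact generation, the docs directory contains numbered .md files, and that AsyncLogger can be constructed with defaults; all of these are assumptions, not shown in this commit.

    # Rough sketch of driving AsyncLLMTextManager end to end.
    import asyncio
    from pathlib import Path
    from crawl4ai.llmtxt import AsyncLLMTextManager
    from crawl4ai.async_logger import AsyncLogger

    async def main():
        manager = AsyncLLMTextManager(Path.home() / ".crawl4ai" / "docs", logger=AsyncLogger())
        await manager.generate_index_files()   # create .q.md fact files and the BM25 index
        print(manager.search("how do I configure a headless browser?", top_k=3))

    asyncio.run(main())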
crawl4ai/markdown_generation_strategy.py
ADDED
@@ -0,0 +1,225 @@
from abc import ABC, abstractmethod
from typing import Optional, Dict, Any, Tuple
from .models import MarkdownGenerationResult
from .html2text import CustomHTML2Text
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter
import re
from urllib.parse import urljoin

# Pre-compile the regex pattern
LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')

def fast_urljoin(base: str, url: str) -> str:
    """Fast URL joining for common cases."""
    if url.startswith(('http://', 'https://', 'mailto:', '//')):
        return url
    if url.startswith('/'):
        # Handle absolute paths
        if base.endswith('/'):
            return base[:-1] + url
        return base + url
    return urljoin(base, url)

class MarkdownGenerationStrategy(ABC):
    """Abstract base class for markdown generation strategies."""
    def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
        self.content_filter = content_filter
        self.options = options or {}

    @abstractmethod
    def generate_markdown(self,
                          cleaned_html: str,
                          base_url: str = "",
                          html2text_options: Optional[Dict[str, Any]] = None,
                          content_filter: Optional[RelevantContentFilter] = None,
                          citations: bool = True,
                          **kwargs) -> MarkdownGenerationResult:
        """Generate markdown from cleaned HTML."""
        pass

class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
    """
    Default implementation of markdown generation strategy.

    How it works:
    1. Generate raw markdown from cleaned HTML.
    2. Convert links to citations.
    3. Generate fit markdown if content filter is provided.
    4. Return MarkdownGenerationResult.

    Args:
        content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
        options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.

    Returns:
        MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
    """
    def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
        super().__init__(content_filter, options)

    def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
        """
        Convert links in markdown to citations.

        How it works:
        1. Find all links in the markdown.
        2. Convert links to citations.
        3. Return converted markdown and references markdown.

        Note:
        This function uses a regex pattern to find links in markdown.

        Args:
            markdown (str): Markdown text.
            base_url (str): Base URL for URL joins.

        Returns:
            Tuple[str, str]: Converted markdown and references markdown.
        """
        link_map = {}
        url_cache = {}  # Cache for URL joins
        parts = []
        last_end = 0
        counter = 1

        for match in LINK_PATTERN.finditer(markdown):
            parts.append(markdown[last_end:match.start()])
            text, url, title = match.groups()

            # Use cached URL if available, otherwise compute and cache
            if base_url and not url.startswith(('http://', 'https://', 'mailto:')):
                if url not in url_cache:
                    url_cache[url] = fast_urljoin(base_url, url)
                url = url_cache[url]

            if url not in link_map:
                desc = []
                if title: desc.append(title)
                if text and text != title: desc.append(text)
                link_map[url] = (counter, ": " + " - ".join(desc) if desc else "")
                counter += 1

            num = link_map[url][0]
            parts.append(f"{text}⟨{num}⟩" if not match.group(0).startswith('!') else f"![{text}⟨{num}⟩]")
            last_end = match.end()

        parts.append(markdown[last_end:])
        converted_text = ''.join(parts)

        # Pre-build reference strings
        references = ["\n\n## References\n\n"]
        references.extend(
            f"⟨{num}⟩ {url}{desc}\n"
            for url, (num, desc) in sorted(link_map.items(), key=lambda x: x[1][0])
        )

        return converted_text, ''.join(references)

    def generate_markdown(self,
                          cleaned_html: str,
                          base_url: str = "",
                          html2text_options: Optional[Dict[str, Any]] = None,
                          options: Optional[Dict[str, Any]] = None,
                          content_filter: Optional[RelevantContentFilter] = None,
                          citations: bool = True,
                          **kwargs) -> MarkdownGenerationResult:
        """
        Generate markdown with citations from cleaned HTML.

        How it works:
        1. Generate raw markdown from cleaned HTML.
        2. Convert links to citations.
        3. Generate fit markdown if content filter is provided.
        4. Return MarkdownGenerationResult.

        Args:
            cleaned_html (str): Cleaned HTML content.
            base_url (str): Base URL for URL joins.
            html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
            options (Optional[Dict[str, Any]]): Additional options for markdown generation.
            content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
            citations (bool): Whether to generate citations.

        Returns:
            MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
        """
        try:
            # Initialize HTML2Text with default options for better conversion
            h = CustomHTML2Text(baseurl=base_url)
            default_options = {
                'body_width': 0,  # Disable text wrapping
                'ignore_emphasis': False,
                'ignore_links': False,
                'ignore_images': False,
                'protect_links': True,
                'single_line_break': True,
                'mark_code': True,
                'escape_snob': False
            }

            # Update with custom options if provided
            if html2text_options:
                default_options.update(html2text_options)
            elif options:
                default_options.update(options)
            elif self.options:
                default_options.update(self.options)

            h.update_params(**default_options)

            # Ensure we have valid input
            if not cleaned_html:
                cleaned_html = ""
            elif not isinstance(cleaned_html, str):
                cleaned_html = str(cleaned_html)

            # Generate raw markdown
            try:
                raw_markdown = h.handle(cleaned_html)
            except Exception as e:
                raw_markdown = f"Error converting HTML to markdown: {str(e)}"

            raw_markdown = raw_markdown.replace('    ```', '```')

            # Convert links to citations
            markdown_with_citations: str = raw_markdown
            references_markdown: str = ""
            if citations:
                try:
                    markdown_with_citations, references_markdown = self.convert_links_to_citations(
                        raw_markdown, base_url
                    )
                except Exception as e:
                    markdown_with_citations = raw_markdown
                    references_markdown = f"Error generating citations: {str(e)}"

            # Generate fit markdown if content filter is provided
            fit_markdown: Optional[str] = ""
            filtered_html: Optional[str] = ""
            if content_filter or self.content_filter:
                try:
                    content_filter = content_filter or self.content_filter
                    filtered_html = content_filter.filter_content(cleaned_html)
                    filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
                    fit_markdown = h.handle(filtered_html)
                except Exception as e:
                    fit_markdown = f"Error generating fit markdown: {str(e)}"
                    filtered_html = ""

            return MarkdownGenerationResult(
                raw_markdown=raw_markdown or "",
                markdown_with_citations=markdown_with_citations or "",
                references_markdown=references_markdown or "",
                fit_markdown=fit_markdown or "",
                fit_html=filtered_html or "",
            )
        except Exception as e:
            # If anything fails, return empty strings with error message
            error_msg = f"Error in markdown generation: {str(e)}"
            return MarkdownGenerationResult(
                raw_markdown=error_msg,
                markdown_with_citations=error_msg,
                references_markdown="",
                fit_markdown="",
                fit_html="",
            )
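For orientation, a rough usage sketch of the generator above; the HTML snippet and base URL are invented for illustration, and fit_markdown is only populated when a content filter is supplied:

from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

generator = DefaultMarkdownGenerator()
result = generator.generate_markdown(
    cleaned_html='<p>See the <a href="/docs">docs</a> for details.</p>',  # hypothetical input
    base_url="https://example.com",
    citations=True,
)
print(result.raw_markdown)             # plain markdown conversion
print(result.markdown_with_citations)  # links replaced with ⟨n⟩ markers
print(result.references_markdown)      # "## References" section listing each URL once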
crawl4ai/migrations.py
ADDED
@@ -0,0 +1,168 @@
import os
import asyncio
import logging
from pathlib import Path
import aiosqlite
from typing import Optional
import xxhash
import aiofiles
import shutil
import time
from datetime import datetime
from .async_logger import AsyncLogger, LogLevel

# Initialize logger
logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)

# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

class DatabaseMigration:
    def __init__(self, db_path: str):
        self.db_path = db_path
        self.content_paths = self._ensure_content_dirs(os.path.dirname(db_path))

    def _ensure_content_dirs(self, base_path: str) -> dict:
        dirs = {
            'html': 'html_content',
            'cleaned': 'cleaned_html',
            'markdown': 'markdown_content',
            'extracted': 'extracted_content',
            'screenshots': 'screenshots'
        }
        content_paths = {}
        for key, dirname in dirs.items():
            path = os.path.join(base_path, dirname)
            os.makedirs(path, exist_ok=True)
            content_paths[key] = path
        return content_paths

    def _generate_content_hash(self, content: str) -> str:
        x = xxhash.xxh64()
        x.update(content.encode())
        content_hash = x.hexdigest()
        return content_hash
        # return hashlib.sha256(content.encode()).hexdigest()

    async def _store_content(self, content: str, content_type: str) -> str:
        if not content:
            return ""

        content_hash = self._generate_content_hash(content)
        file_path = os.path.join(self.content_paths[content_type], content_hash)

        if not os.path.exists(file_path):
            async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
                await f.write(content)

        return content_hash

    async def migrate_database(self):
        """Migrate existing database to file-based storage"""
        # logger.info("Starting database migration...")
        logger.info("Starting database migration...", tag="INIT")

        try:
            async with aiosqlite.connect(self.db_path) as db:
                # Get all rows
                async with db.execute(
                    '''SELECT url, html, cleaned_html, markdown,
                       extracted_content, screenshot FROM crawled_data'''
                ) as cursor:
                    rows = await cursor.fetchall()

                migrated_count = 0
                for row in rows:
                    url, html, cleaned_html, markdown, extracted_content, screenshot = row

                    # Store content in files and get hashes
                    html_hash = await self._store_content(html, 'html')
                    cleaned_hash = await self._store_content(cleaned_html, 'cleaned')
                    markdown_hash = await self._store_content(markdown, 'markdown')
                    extracted_hash = await self._store_content(extracted_content, 'extracted')
                    screenshot_hash = await self._store_content(screenshot, 'screenshots')

                    # Update database with hashes
                    await db.execute('''
                        UPDATE crawled_data
                        SET html = ?,
                            cleaned_html = ?,
                            markdown = ?,
                            extracted_content = ?,
                            screenshot = ?
                        WHERE url = ?
                    ''', (html_hash, cleaned_hash, markdown_hash,
                          extracted_hash, screenshot_hash, url))

                    migrated_count += 1
                    if migrated_count % 100 == 0:
                        logger.info(f"Migrated {migrated_count} records...", tag="INIT")

                await db.commit()
                logger.success(f"Migration completed. {migrated_count} records processed.", tag="COMPLETE")

        except Exception as e:
            # logger.error(f"Migration failed: {e}")
            logger.error(
                message="Migration failed: {error}",
                tag="ERROR",
                params={"error": str(e)}
            )
            raise e

async def backup_database(db_path: str) -> str:
    """Create backup of existing database"""
    if not os.path.exists(db_path):
        logger.info("No existing database found. Skipping backup.", tag="INIT")
        return None

    # Create backup with timestamp
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_path = f"{db_path}.backup_{timestamp}"

    try:
        # Wait for any potential write operations to finish
        await asyncio.sleep(1)

        # Create backup
        shutil.copy2(db_path, backup_path)
        logger.info(f"Database backup created at: {backup_path}", tag="COMPLETE")
        return backup_path
    except Exception as e:
        # logger.error(f"Backup failed: {e}")
        logger.error(
            message="Backup failed: {error}",
            tag="ERROR",
            params={"error": str(e)}
        )
        raise e

async def run_migration(db_path: Optional[str] = None):
    """Run database migration"""
    if db_path is None:
        db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db")

    if not os.path.exists(db_path):
        logger.info("No existing database found. Skipping migration.", tag="INIT")
        return

    # Create backup first
    backup_path = await backup_database(db_path)
    if not backup_path:
        return

    migration = DatabaseMigration(db_path)
    await migration.migrate_database()

def main():
    """CLI entry point for migration"""
    import argparse
    parser = argparse.ArgumentParser(description='Migrate Crawl4AI database to file-based storage')
    parser.add_argument('--db-path', help='Custom database path')
    args = parser.parse_args()

    asyncio.run(run_migration(args.db_path))

if __name__ == "__main__":
    main()
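As a usage sketch (paths are illustrative), the migration can be driven either through the CLI entry point above or directly from asyncio:

import asyncio
from crawl4ai.migrations import run_migration

# Uses ~/.crawl4ai/crawl4ai.db when no path is given; a timestamped backup
# is created first, then the html/markdown/screenshot columns are replaced
# by content hashes pointing at files on disk.
asyncio.run(run_migration())

# Or against a custom database path (hypothetical location):
# asyncio.run(run_migration("/data/crawl4ai/crawl4ai.db"))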
crawl4ai/model_loader.py
ADDED
@@ -0,0 +1,256 @@
from functools import lru_cache
from pathlib import Path
import subprocess, os
import shutil
import tarfile
from .model_loader import *
import argparse
import urllib.request
from crawl4ai.config import MODEL_REPO_BRANCH
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

@lru_cache()
def get_available_memory(device):
    import torch
    if device.type == 'cuda':
        return torch.cuda.get_device_properties(device).total_memory
    elif device.type == 'mps':
        return 48 * 1024 ** 3  # Assume up to 48GB of unified memory for MPS devices
    else:
        return 0

@lru_cache()
def calculate_batch_size(device):
    available_memory = get_available_memory(device)

    if device.type == 'cpu':
        return 16
    elif device.type in ['cuda', 'mps']:
        # Adjust these thresholds based on your model size and available memory
        if available_memory >= 31 * 1024 ** 3:  # > 32GB
            return 256
        elif available_memory >= 15 * 1024 ** 3:  # 16GB to 32GB
            return 128
        elif available_memory >= 8 * 1024 ** 3:  # 8GB to 16GB
            return 64
        else:
            return 32
    else:
        return 16  # Default batch size

@lru_cache()
def get_device():
    import torch
    if torch.cuda.is_available():
        device = torch.device('cuda')
    elif torch.backends.mps.is_available():
        device = torch.device('mps')
    else:
        device = torch.device('cpu')
    return device

def set_model_device(model):
    device = get_device()
    model.to(device)
    return model, device

@lru_cache()
def get_home_folder():
    home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
    os.makedirs(home_folder, exist_ok=True)
    os.makedirs(f"{home_folder}/cache", exist_ok=True)
    os.makedirs(f"{home_folder}/models", exist_ok=True)
    return home_folder

@lru_cache()
def load_bert_base_uncased():
    from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=None)
    model = BertModel.from_pretrained('bert-base-uncased', resume_download=None)
    model.eval()
    model, device = set_model_device(model)
    return tokenizer, model

@lru_cache()
def load_HF_embedding_model(model_name="BAAI/bge-small-en-v1.5") -> tuple:
    """Load the Hugging Face model for embedding.

    Args:
        model_name (str, optional): The model name to load. Defaults to "BAAI/bge-small-en-v1.5".

    Returns:
        tuple: The tokenizer and model.
    """
    from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
    tokenizer = AutoTokenizer.from_pretrained(model_name, resume_download=None)
    model = AutoModel.from_pretrained(model_name, resume_download=None)
    model.eval()
    model, device = set_model_device(model)
    return tokenizer, model

@lru_cache()
def load_text_classifier():
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    from transformers import pipeline
    import torch

    tokenizer = AutoTokenizer.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
    model = AutoModelForSequenceClassification.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
    model.eval()
    model, device = set_model_device(model)
    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
    return pipe

@lru_cache()
def load_text_multilabel_classifier():
    from transformers import AutoModelForSequenceClassification, AutoTokenizer
    import numpy as np
    from scipy.special import expit
    import torch

    # # Check for available device: CUDA, MPS (for Apple Silicon), or CPU
    # if torch.cuda.is_available():
    #     device = torch.device("cuda")
    # elif torch.backends.mps.is_available():
    #     device = torch.device("mps")
    # else:
    #     device = torch.device("cpu")
    # # return load_spacy_model(), torch.device("cpu")

    MODEL = "cardiffnlp/tweet-topic-21-multi"
    tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
    model.eval()
    model, device = set_model_device(model)
    class_mapping = model.config.id2label

    def _classifier(texts, threshold=0.5, max_length=64):
        tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        tokens = {key: val.to(device) for key, val in tokens.items()}  # Move tokens to the selected device

        with torch.no_grad():
            output = model(**tokens)

        scores = output.logits.detach().cpu().numpy()
        scores = expit(scores)
        predictions = (scores >= threshold) * 1

        batch_labels = []
        for prediction in predictions:
            labels = [class_mapping[i] for i, value in enumerate(prediction) if value == 1]
            batch_labels.append(labels)

        return batch_labels

    return _classifier, device

@lru_cache()
def load_nltk_punkt():
    import nltk
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    return nltk.data.find('tokenizers/punkt')

@lru_cache()
def load_spacy_model():
    import spacy
    name = "models/reuters"
    home_folder = get_home_folder()
    model_folder = Path(home_folder) / name

    # Check if the model directory already exists
    if not (model_folder.exists() and any(model_folder.iterdir())):
        repo_url = "https://github.com/unclecode/crawl4ai.git"
        branch = MODEL_REPO_BRANCH
        repo_folder = Path(home_folder) / "crawl4ai"

        print("[LOG] ⏬ Downloading Spacy model for the first time...")

        # Remove existing repo folder if it exists
        if repo_folder.exists():
            try:
                shutil.rmtree(repo_folder)
                if model_folder.exists():
                    shutil.rmtree(model_folder)
            except PermissionError:
                print("[WARNING] Unable to remove existing folders. Please manually delete the following folders and try again:")
                print(f"- {repo_folder}")
                print(f"- {model_folder}")
                return None

        try:
            # Clone the repository
            subprocess.run(
                ["git", "clone", "-b", branch, repo_url, str(repo_folder)],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True
            )

            # Create the models directory if it doesn't exist
            models_folder = Path(home_folder) / "models"
            models_folder.mkdir(parents=True, exist_ok=True)

            # Copy the reuters model folder to the models directory
            source_folder = repo_folder / "models" / "reuters"
            shutil.copytree(source_folder, model_folder)

            # Remove the cloned repository
            shutil.rmtree(repo_folder)

            print("[LOG] ✅ Spacy Model downloaded successfully")
        except subprocess.CalledProcessError as e:
            print(f"An error occurred while cloning the repository: {e}")
            return None
        except Exception as e:
            print(f"An error occurred: {e}")
            return None

    try:
        return spacy.load(str(model_folder))
    except Exception as e:
        print(f"Error loading spacy model: {e}")
        return None

def download_all_models(remove_existing=False):
    """Download all models required for Crawl4AI."""
    if remove_existing:
        print("[LOG] Removing existing models...")
        home_folder = get_home_folder()
        model_folders = [
            os.path.join(home_folder, "models/reuters"),
            os.path.join(home_folder, "models"),
        ]
        for folder in model_folders:
            if Path(folder).exists():
                shutil.rmtree(folder)
        print("[LOG] Existing models removed.")

    # Load each model to trigger download
    # print("[LOG] Downloading BERT Base Uncased...")
    # load_bert_base_uncased()
    # print("[LOG] Downloading BGE Small EN v1.5...")
    # load_bge_small_en_v1_5()
    # print("[LOG] Downloading ONNX model...")
    # load_onnx_all_MiniLM_l6_v2()
    print("[LOG] Downloading text classifier...")
    _, device = load_text_multilabel_classifier()
    print(f"[LOG] Text classifier loaded on {device}")
    print("[LOG] Downloading custom NLTK Punkt model...")
    load_nltk_punkt()
    print("[LOG] ✅ All models downloaded successfully.")

def main():
    print("[LOG] Welcome to the Crawl4AI Model Downloader!")
    print("[LOG] This script will download all the models required for Crawl4AI.")
    parser = argparse.ArgumentParser(description="Crawl4AI Model Downloader")
    parser.add_argument('--remove-existing', action='store_true', help="Remove existing models before downloading")
    args = parser.parse_args()

    download_all_models(remove_existing=args.remove_existing)

if __name__ == "__main__":
    main()
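A short sketch of how these cached loaders compose. The model name is the default shown above; the embedding call itself (mean pooling over the last hidden state) is an assumption about typical transformers usage, not something this file defines:

import torch
from crawl4ai.model_loader import load_HF_embedding_model, calculate_batch_size, get_device

tokenizer, model = load_HF_embedding_model("BAAI/bge-small-en-v1.5")
device = get_device()
batch_size = calculate_batch_size(device)  # scales with detected GPU/MPS memory

texts = ["crawl4ai turns web pages into markdown"]
tokens = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
with torch.no_grad():
    embeddings = model(**tokens).last_hidden_state.mean(dim=1)  # simple mean pooling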
crawl4ai/models.py
ADDED
@@ -0,0 +1,61 @@
from pydantic import BaseModel, HttpUrl
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
from dataclasses import dataclass
from .ssl_certificate import SSLCertificate

@dataclass
class TokenUsage:
    completion_tokens: int = 0
    prompt_tokens: int = 0
    total_tokens: int = 0
    completion_tokens_details: Optional[dict] = None
    prompt_tokens_details: Optional[dict] = None


class UrlModel(BaseModel):
    url: HttpUrl
    forced: bool = False

class MarkdownGenerationResult(BaseModel):
    raw_markdown: str
    markdown_with_citations: str
    references_markdown: str
    fit_markdown: Optional[str] = None
    fit_html: Optional[str] = None

class CrawlResult(BaseModel):
    url: str
    html: str
    success: bool
    cleaned_html: Optional[str] = None
    media: Dict[str, List[Dict]] = {}
    links: Dict[str, List[Dict]] = {}
    downloaded_files: Optional[List[str]] = None
    screenshot: Optional[str] = None
    pdf: Optional[bytes] = None
    markdown: Optional[Union[str, MarkdownGenerationResult]] = None
    markdown_v2: Optional[MarkdownGenerationResult] = None
    fit_markdown: Optional[str] = None
    fit_html: Optional[str] = None
    extracted_content: Optional[str] = None
    metadata: Optional[dict] = None
    error_message: Optional[str] = None
    session_id: Optional[str] = None
    response_headers: Optional[dict] = None
    status_code: Optional[int] = None
    ssl_certificate: Optional[SSLCertificate] = None

    class Config:
        arbitrary_types_allowed = True

class AsyncCrawlResponse(BaseModel):
    html: str
    response_headers: Dict[str, str]
    status_code: int
    screenshot: Optional[str] = None
    pdf_data: Optional[bytes] = None
    get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
    downloaded_files: Optional[List[str]] = None
    ssl_certificate: Optional[SSLCertificate] = None

    class Config:
        arbitrary_types_allowed = True
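These Pydantic models are plain data carriers; a minimal construction example, with field values invented for illustration:

from crawl4ai.models import CrawlResult, MarkdownGenerationResult

md = MarkdownGenerationResult(
    raw_markdown="# Example",
    markdown_with_citations="# Example",
    references_markdown="",
)
result = CrawlResult(
    url="https://example.com",
    html="<html>...</html>",
    success=True,
    markdown_v2=md,
    status_code=200,
)
print(result.model_dump(exclude_none=True))  # pydantic v2; use .dict() on v1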
crawl4ai/prompts.py
ADDED
@@ -0,0 +1,204 @@
PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
<url>{URL}</url>

And here is the cleaned HTML content of that webpage:
<html>
{HTML}
</html>

Your task is to break down this HTML content into semantically relevant blocks, and for each block, generate a JSON object with the following keys:

- index: an integer representing the index of the block in the content
- tags: a list of semantic tags that are relevant to the content of the block
- content: a list of strings containing the text content of the block
- questions: a list of 3 questions that a user may ask about the content in this block

To generate the JSON objects:

1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks.

2. For each block:
   a. Assign it an index based on its order in the content.
   b. Analyze the content and generate a list of relevant semantic tags that describe what the block is about.
   c. Extract the text content, clean it up if needed, and store it as a list of strings in the "content" field.
   d. Come up with 3 questions that a user might ask about this specific block of content, based on the tags and content. The questions should be relevant and answerable by the content in the block.

3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.

4. Double-check that each JSON object includes all required keys (index, tags, content, questions) and that the values are in the expected format (integer, list of strings, etc.).

5. Make sure the generated JSON is complete and parsable, with no errors or omissions.

6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.

Please provide your output within <blocks> tags, like this:

<blocks>
[{
  "index": 0,
  "tags": ["introduction", "overview"],
  "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."],
  "questions": [
    "What is the main topic of this article?",
    "What can I expect to learn from reading this article?",
    "Is this article suitable for beginners or experts in the field?"
  ]
},
{
  "index": 1,
  "tags": ["history", "background"],
  "content": ["This is the second paragraph, which delves into the history and background of the topic.",
              "It provides context and sets the stage for the rest of the article."],
  "questions": [
    "What historical events led to the development of this topic?",
    "How has the understanding of this topic evolved over time?",
    "What are some key milestones in the history of this topic?"
  ]
}]
</blocks>

Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""

PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
<url>{URL}</url>

And here is the cleaned HTML content of that webpage:
<html>
{HTML}
</html>

Your task is to break down this HTML content into semantically relevant blocks, and for each block, generate a JSON object with the following keys:

- index: an integer representing the index of the block in the content
- content: a list of strings containing the text content of the block

To generate the JSON objects:

1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks.

2. For each block:
   a. Assign it an index based on its order in the content.
   b. Analyze the content and generate ONE semantic tag that describes what the block is about.
   c. Extract the text content, EXACTLY THE SAME AS THE GIVEN DATA, clean it up if needed, and store it as a list of strings in the "content" field.

3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.

4. Double-check that each JSON object includes all required keys (index, tag, content) and that the values are in the expected format (integer, list of strings, etc.).

5. Make sure the generated JSON is complete and parsable, with no errors or omissions.

6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.

7. Never alter the extracted content, just copy and paste it as it is.

Please provide your output within <blocks> tags, like this:

<blocks>
[{
  "index": 0,
  "tags": ["introduction"],
  "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."]
},
{
  "index": 1,
  "tags": ["background"],
  "content": ["This is the second paragraph, which delves into the history and background of the topic.",
              "It provides context and sets the stage for the rest of the article."]
}]
</blocks>

Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""

PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION = """Here is the URL of the webpage:
<url>{URL}</url>

And here is the cleaned HTML content of that webpage:
<html>
{HTML}
</html>

Your task is to break down this HTML content into semantically relevant blocks, following the provided user's REQUEST, and for each block, generate a JSON object with the following keys:

- index: an integer representing the index of the block in the content
- content: a list of strings containing the text content of the block

This is the user's REQUEST, pay attention to it:
<request>
{REQUEST}
</request>

To generate the JSON objects:

1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks.

2. For each block:
   a. Assign it an index based on its order in the content.
   b. Analyze the content and generate ONE semantic tag that describes what the block is about.
   c. Extract the text content, EXACTLY THE SAME AS THE GIVEN DATA, clean it up if needed, and store it as a list of strings in the "content" field.

3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.

4. Double-check that each JSON object includes all required keys (index, tag, content) and that the values are in the expected format (integer, list of strings, etc.).

5. Make sure the generated JSON is complete and parsable, with no errors or omissions.

6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.

7. Never alter the extracted content, just copy and paste it as it is.

Please provide your output within <blocks> tags, like this:

<blocks>
[{
  "index": 0,
  "tags": ["introduction"],
  "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."]
},
{
  "index": 1,
  "tags": ["background"],
  "content": ["This is the second paragraph, which delves into the history and background of the topic.",
              "It provides context and sets the stage for the rest of the article."]
}]
</blocks>

**Make sure to follow the user instruction to extract blocks that align with the instruction.**

Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""

PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION = """Here is the content from the URL:
<url>{URL}</url>

<url_content>
{HTML}
</url_content>

The user has made the following request for what information to extract from the above content:

<user_request>
{REQUEST}
</user_request>

<schema_block>
{SCHEMA}
</schema_block>

Please carefully read the URL content and the user's request. If the user provided a desired JSON schema in the <schema_block> above, extract the requested information from the URL content according to that schema. If no schema was provided, infer an appropriate JSON schema based on the user's request that will best capture the key information they are looking for.

Extraction instructions:
Return the extracted information as a list of JSON objects, with each object in the list corresponding to a block of content from the URL, in the same order as it appears on the page. Wrap the entire JSON list in <blocks>...</blocks> XML tags.

Quality Reflection:
Before outputting your final answer, double check that the JSON you are returning is complete, containing all the information requested by the user, and is valid JSON that could be parsed by json.loads() with no errors or omissions. The outputted JSON objects should fully match the schema, either provided or inferred.

Quality Score:
After reflecting, score the quality and completeness of the JSON data you are about to return on a scale of 1 to 5. Write the score inside <score> tags.

Avoid Common Mistakes:
- Do NOT add any comments using "//" or "#" in the JSON output. It causes parsing errors.
- Make sure the JSON is properly formatted with curly braces, square brackets, and commas in the right places.
- Do not miss the closing </blocks> tag at the end of the JSON output.
- Do not generate Python code to show how to do the task; it is your task to extract the information and return it in JSON format.

Result
Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""
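Note that these templates contain literal curly braces in their JSON examples, so filling them with str.format would fail; a safe way to substitute the placeholders (a sketch only, not necessarily how the extraction strategy does it, and the values are invented) is plain replacement:

from crawl4ai.prompts import PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION

prompt = (
    PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
    .replace("{URL}", "https://example.com/article")
    .replace("{HTML}", "<p>cleaned html goes here</p>")
    .replace("{REQUEST}", "Extract the author and publication date")
)
# `prompt` is now ready to send to an LLM; the model is expected to answer
# with a JSON list wrapped in <blocks>...</blocks> tags.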
crawl4ai/ssl_certificate.py
ADDED
@@ -0,0 +1,181 @@
"""SSL Certificate class for handling certificate operations."""

import ssl
import socket
import base64
import json
from typing import Dict, Any, Optional
from urllib.parse import urlparse
import OpenSSL.crypto
from pathlib import Path


class SSLCertificate:
    """
    A class representing an SSL certificate with methods to export in various formats.

    Attributes:
        cert_info (Dict[str, Any]): The certificate information.

    Methods:
        from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: Create an SSLCertificate instance from a URL.
        to_json(filepath: Optional[str] = None) -> Optional[str]: Export the certificate as JSON (optionally to a file).
        to_pem(filepath: Optional[str] = None) -> Optional[str]: Export the certificate as PEM (optionally to a file).
        to_der(filepath: Optional[str] = None) -> Optional[bytes]: Export the certificate as DER (optionally to a file).
    """
    def __init__(self, cert_info: Dict[str, Any]):
        self._cert_info = self._decode_cert_data(cert_info)

    @staticmethod
    def from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']:
        """
        Create SSLCertificate instance from a URL.

        Args:
            url (str): URL of the website.
            timeout (int): Timeout for the connection (default: 10).

        Returns:
            Optional[SSLCertificate]: SSLCertificate instance if successful, None otherwise.
        """
        try:
            hostname = urlparse(url).netloc
            if ':' in hostname:
                hostname = hostname.split(':')[0]

            context = ssl.create_default_context()
            with socket.create_connection((hostname, 443), timeout=timeout) as sock:
                with context.wrap_socket(sock, server_hostname=hostname) as ssock:
                    cert_binary = ssock.getpeercert(binary_form=True)
                    x509 = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_ASN1, cert_binary)

                    cert_info = {
                        "subject": dict(x509.get_subject().get_components()),
                        "issuer": dict(x509.get_issuer().get_components()),
                        "version": x509.get_version(),
                        "serial_number": hex(x509.get_serial_number()),
                        "not_before": x509.get_notBefore(),
                        "not_after": x509.get_notAfter(),
                        "fingerprint": x509.digest("sha256").hex(),
                        "signature_algorithm": x509.get_signature_algorithm(),
                        "raw_cert": base64.b64encode(cert_binary)
                    }

                    # Add extensions
                    extensions = []
                    for i in range(x509.get_extension_count()):
                        ext = x509.get_extension(i)
                        extensions.append({
                            "name": ext.get_short_name(),
                            "value": str(ext)
                        })
                    cert_info["extensions"] = extensions

                    return SSLCertificate(cert_info)

        except Exception as e:
            return None

    @staticmethod
    def _decode_cert_data(data: Any) -> Any:
        """Helper method to decode bytes in certificate data."""
        if isinstance(data, bytes):
            return data.decode('utf-8')
        elif isinstance(data, dict):
            return {
                (k.decode('utf-8') if isinstance(k, bytes) else k): SSLCertificate._decode_cert_data(v)
                for k, v in data.items()
            }
        elif isinstance(data, list):
            return [SSLCertificate._decode_cert_data(item) for item in data]
        return data

    def to_json(self, filepath: Optional[str] = None) -> Optional[str]:
        """
        Export certificate as JSON.

        Args:
            filepath (Optional[str]): Path to save the JSON file (default: None).

        Returns:
            Optional[str]: JSON string if successful, None otherwise.
        """
        json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False)
        if filepath:
            Path(filepath).write_text(json_str, encoding='utf-8')
            return None
        return json_str

    def to_pem(self, filepath: Optional[str] = None) -> Optional[str]:
        """
        Export certificate as PEM.

        Args:
            filepath (Optional[str]): Path to save the PEM file (default: None).

        Returns:
            Optional[str]: PEM string if successful, None otherwise.
        """
        try:
            x509 = OpenSSL.crypto.load_certificate(
                OpenSSL.crypto.FILETYPE_ASN1,
                base64.b64decode(self._cert_info['raw_cert'])
            )
            pem_data = OpenSSL.crypto.dump_certificate(
                OpenSSL.crypto.FILETYPE_PEM,
                x509
            ).decode('utf-8')

            if filepath:
                Path(filepath).write_text(pem_data, encoding='utf-8')
                return None
            return pem_data
        except Exception as e:
            return None

    def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]:
        """
        Export certificate as DER.

        Args:
            filepath (Optional[str]): Path to save the DER file (default: None).

        Returns:
            Optional[bytes]: DER bytes if successful, None otherwise.
        """
        try:
            der_data = base64.b64decode(self._cert_info['raw_cert'])
            if filepath:
                Path(filepath).write_bytes(der_data)
                return None
            return der_data
        except Exception:
            return None

    @property
    def issuer(self) -> Dict[str, str]:
        """Get certificate issuer information."""
        return self._cert_info.get('issuer', {})

    @property
    def subject(self) -> Dict[str, str]:
        """Get certificate subject information."""
        return self._cert_info.get('subject', {})

    @property
    def valid_from(self) -> str:
        """Get certificate validity start date."""
        return self._cert_info.get('not_before', '')

    @property
    def valid_until(self) -> str:
        """Get certificate validity end date."""
        return self._cert_info.get('not_after', '')

    @property
    def fingerprint(self) -> str:
        """Get certificate fingerprint."""
        return self._cert_info.get('fingerprint', '')
crawl4ai/user_agent_generator.py
ADDED
@@ -0,0 +1,305 @@
import random
from typing import Optional, Literal, List, Dict, Tuple
import re


class UserAgentGenerator:
    """
    Generate random user agents with specified constraints.

    Attributes:
        desktop_platforms (dict): A dictionary of possible desktop platforms and their corresponding user agent strings.
        mobile_platforms (dict): A dictionary of possible mobile platforms and their corresponding user agent strings.
        browser_combinations (dict): A dictionary of possible browser combinations and their corresponding user agent strings.
        rendering_engines (dict): A dictionary of possible rendering engines and their corresponding user agent strings.
        chrome_versions (list): A list of possible Chrome browser versions.
        firefox_versions (list): A list of possible Firefox browser versions.
        edge_versions (list): A list of possible Edge browser versions.
        safari_versions (list): A list of possible Safari browser versions.
        ios_versions (list): A list of possible iOS browser versions.
        android_versions (list): A list of possible Android browser versions.

    Methods:
        generate_user_agent(
            platform: Literal["desktop", "mobile"] = "desktop",
            browser: str = "chrome",
            rendering_engine: str = "chrome_webkit",
            chrome_version: Optional[str] = None,
            firefox_version: Optional[str] = None,
            edge_version: Optional[str] = None,
            safari_version: Optional[str] = None,
            ios_version: Optional[str] = None,
            android_version: Optional[str] = None
        ): Generates a random user agent string based on the specified parameters.
    """
    def __init__(self):
        # Previous platform definitions remain the same...
        self.desktop_platforms = {
            "windows": {
                "10_64": "(Windows NT 10.0; Win64; x64)",
                "10_32": "(Windows NT 10.0; WOW64)",
            },
            "macos": {
                "intel": "(Macintosh; Intel Mac OS X 10_15_7)",
                "newer": "(Macintosh; Intel Mac OS X 10.15; rv:109.0)",
            },
            "linux": {
                "generic": "(X11; Linux x86_64)",
                "ubuntu": "(X11; Ubuntu; Linux x86_64)",
                "chrome_os": "(X11; CrOS x86_64 14541.0.0)",
            }
        }

        self.mobile_platforms = {
            "android": {
                "samsung": "(Linux; Android 13; SM-S901B)",
                "pixel": "(Linux; Android 12; Pixel 6)",
                "oneplus": "(Linux; Android 13; OnePlus 9 Pro)",
                "xiaomi": "(Linux; Android 12; M2102J20SG)",
            },
            "ios": {
                "iphone": "(iPhone; CPU iPhone OS 16_5 like Mac OS X)",
                "ipad": "(iPad; CPU OS 16_5 like Mac OS X)",
            }
        }

        # Browser Combinations
        self.browser_combinations = {
            1: [
                ["chrome"],
                ["firefox"],
                ["safari"],
                ["edge"]
            ],
            2: [
                ["gecko", "firefox"],
                ["chrome", "safari"],
                ["webkit", "safari"]
            ],
            3: [
                ["chrome", "safari", "edge"],
                ["webkit", "chrome", "safari"]
            ]
        }

        # Rendering Engines with versions
        self.rendering_engines = {
            "chrome_webkit": "AppleWebKit/537.36",
            "safari_webkit": "AppleWebKit/605.1.15",
            "gecko": [  # Added Gecko versions
                "Gecko/20100101",
                "Gecko/20100101",  # Firefox usually uses this constant version
                "Gecko/2010010",
            ]
        }

        # Browser Versions
        self.chrome_versions = [
            "Chrome/119.0.6045.199",
            "Chrome/118.0.5993.117",
            "Chrome/117.0.5938.149",
            "Chrome/116.0.5845.187",
            "Chrome/115.0.5790.171",
        ]

        self.edge_versions = [
            "Edg/119.0.2151.97",
            "Edg/118.0.2088.76",
            "Edg/117.0.2045.47",
            "Edg/116.0.1938.81",
            "Edg/115.0.1901.203",
        ]

        self.safari_versions = [
            "Safari/537.36",  # For Chrome-based
            "Safari/605.1.15",
            "Safari/604.1",
            "Safari/602.1",
            "Safari/601.5.17",
        ]

        # Added Firefox versions
        self.firefox_versions = [
            "Firefox/119.0",
            "Firefox/118.0.2",
            "Firefox/117.0.1",
            "Firefox/116.0",
            "Firefox/115.0.3",
            "Firefox/114.0.2",
            "Firefox/113.0.1",
            "Firefox/112.0",
            "Firefox/111.0.1",
            "Firefox/110.0",
        ]

    def get_browser_stack(self, num_browsers: int = 1) -> List[str]:
        """
        Get a valid combination of browser versions.

        How it works:
        1. Check if the number of browsers is supported.
        2. Randomly choose a combination of browsers.
        3. Iterate through the combination and add browser versions.
        4. Return the browser stack.

        Args:
            num_browsers: Number of browser specifications (1-3)

        Returns:
            List[str]: A list of browser versions.
        """
        if num_browsers not in self.browser_combinations:
            raise ValueError(f"Unsupported number of browsers: {num_browsers}")

        combination = random.choice(self.browser_combinations[num_browsers])
        browser_stack = []

        for browser in combination:
            if browser == "chrome":
                browser_stack.append(random.choice(self.chrome_versions))
            elif browser == "firefox":
                browser_stack.append(random.choice(self.firefox_versions))
            elif browser == "safari":
                browser_stack.append(random.choice(self.safari_versions))
            elif browser == "edge":
                browser_stack.append(random.choice(self.edge_versions))
            elif browser == "gecko":
                browser_stack.append(random.choice(self.rendering_engines["gecko"]))
            elif browser == "webkit":
                browser_stack.append(self.rendering_engines["chrome_webkit"])

        return browser_stack

    def generate(self,
                 device_type: Optional[Literal['desktop', 'mobile']] = None,
                 os_type: Optional[str] = None,
                 device_brand: Optional[str] = None,
                 browser_type: Optional[Literal['chrome', 'edge', 'safari', 'firefox']] = None,
                 num_browsers: int = 3) -> str:
        """
        Generate a random user agent with specified constraints.

        Args:
            device_type: 'desktop' or 'mobile'
            os_type: 'windows', 'macos', 'linux', 'android', 'ios'
            device_brand: Specific device brand
            browser_type: 'chrome', 'edge', 'safari', or 'firefox'
            num_browsers: Number of browser specifications (1-3)
        """
        # Get platform string
        platform = self.get_random_platform(device_type, os_type, device_brand)

        # Start with Mozilla
        components = ["Mozilla/5.0", platform]

        # Add browser stack
        browser_stack = self.get_browser_stack(num_browsers)

        # Add appropriate legacy token based on browser stack
        if "Firefox" in str(browser_stack):
            components.append(random.choice(self.rendering_engines["gecko"]))
        elif "Chrome" in str(browser_stack) or "Safari" in str(browser_stack):
            components.append(self.rendering_engines["chrome_webkit"])
            components.append("(KHTML, like Gecko)")

        # Add browser versions
        components.extend(browser_stack)

        return " ".join(components)

    def generate_with_client_hints(self, **kwargs) -> Tuple[str, str]:
        """Generate both user agent and matching client hints"""
        user_agent = self.generate(**kwargs)
        client_hints = self.generate_client_hints(user_agent)
        return user_agent, client_hints

    def get_random_platform(self, device_type, os_type, device_brand):
        """Helper method to get random platform based on constraints"""
        platforms = self.desktop_platforms if device_type == 'desktop' else \
                    self.mobile_platforms if device_type == 'mobile' else \
                    {**self.desktop_platforms, **self.mobile_platforms}

        if os_type:
            for platform_group in [self.desktop_platforms, self.mobile_platforms]:
                if os_type in platform_group:
                    platforms = {os_type: platform_group[os_type]}
                    break

        os_key = random.choice(list(platforms.keys()))
        if device_brand and device_brand in platforms[os_key]:
            return platforms[os_key][device_brand]
        return random.choice(list(platforms[os_key].values()))

    def parse_user_agent(self, user_agent: str) -> Dict[str, str]:
        """Parse a user agent string to extract browser and version information"""
        browsers = {
            'chrome': r'Chrome/(\d+)',
            'edge': r'Edg/(\d+)',
            'safari': r'Version/(\d+)',
            'firefox': r'Firefox/(\d+)'
        }

        result = {}
        for browser, pattern in browsers.items():
            match = re.search(pattern, user_agent)
            if match:
                result[browser] = match.group(1)

        return result

    def generate_client_hints(self, user_agent: str) -> str:
        """Generate Sec-CH-UA header value based on user agent string"""
        browsers = self.parse_user_agent(user_agent)

        # Client hints components
        hints = []

        # Handle different browser combinations
        if 'chrome' in browsers:
            hints.append(f'"Chromium";v="{browsers["chrome"]}"')
            hints.append('"Not_A Brand";v="8"')

            if 'edge' in browsers:
                hints.append(f'"Microsoft Edge";v="{browsers["edge"]}"')
            else:
                hints.append(f'"Google Chrome";v="{browsers["chrome"]}"')

        elif 'firefox' in browsers:
            # Firefox doesn't typically send Sec-CH-UA
            return '""'

        elif 'safari' in browsers:
            # Safari's format for client hints
            hints.append(f'"Safari";v="{browsers["safari"]}"')
            hints.append('"Not_A Brand";v="8"')

        return ', '.join(hints)

# Example usage:
if __name__ == "__main__":
    generator = UserAgentGenerator()
    print(generator.generate())

    print("\nSingle browser (Chrome):")
    print(generator.generate(num_browsers=1, browser_type='chrome'))

    print("\nTwo browsers (Gecko/Firefox):")
    print(generator.generate(num_browsers=2))

    print("\nThree browsers (Chrome/Safari/Edge):")
    print(generator.generate(num_browsers=3))

    print("\nFirefox on Linux:")
    print(generator.generate(
        device_type='desktop',
        os_type='linux',
296 |
+
browser_type='firefox',
|
297 |
+
num_browsers=2
|
298 |
+
))
|
299 |
+
|
300 |
+
print("\nChrome/Safari/Edge on Windows:")
|
301 |
+
print(generator.generate(
|
302 |
+
device_type='desktop',
|
303 |
+
os_type='windows',
|
304 |
+
num_browsers=3
|
305 |
+
))
|
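
For reference, a minimal sketch of how the class added above might be used to attach matching User-Agent and Sec-CH-UA headers to an outgoing request. This is illustrative only and not part of the committed file: the import path is inferred from the file location in this diff, and the `requests` call and target URL are placeholders, not crawl4ai APIs.

# Illustrative only: pair a generated User-Agent with its Sec-CH-UA hints.
import requests  # any HTTP client would do; assumed installed

from crawl4ai.user_agent_generator import UserAgentGenerator

generator = UserAgentGenerator()

# generate_with_client_hints() forwards its kwargs to generate(),
# so the same device/OS constraints apply to both values.
user_agent, client_hints = generator.generate_with_client_hints(
    device_type="desktop",
    os_type="windows",
    num_browsers=3,
)

headers = {
    "User-Agent": user_agent,
    "Sec-CH-UA": client_hints,
}
response = requests.get("https://example.com", headers=headers)

# parse_user_agent() recovers the browser/version tokens that were emitted.
print(response.status_code, generator.parse_user_agent(user_agent))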