update

pages_helpers.py (+5 -5)
@@ -17,15 +17,14 @@ from text_utils import *
 
 from llm import *
 
-from mode_llm import llm_html_to_md, md_to_text, get_html_body_with_soup
 
-from crawl4ai import WebCrawler # pip install "crawl4ai @ git+https://github.com/unclecode/crawl4ai.git"
+# from crawl4ai import WebCrawler # pip install "crawl4ai @ git+https://github.com/unclecode/crawl4ai.git"
 
 # Create an instance of WebCrawler
-crawler = WebCrawler()
+# crawler = WebCrawler()
 
 # Warm up the crawler (load necessary models)
-crawler.warmup()
+# crawler.warmup()
 
 ## How to get cookies and headers using https://curlconverter.com
 cookies = {
@@ -193,6 +192,7 @@ Your connection is not private
 
 
 meta = None
+'''
 if html is None or len(html) < 500:
     # Attempt 2: use CRAWL4AI
     print("GET HTML CRAWL4AI", filename, flush=True)
@@ -223,7 +223,7 @@ Your connection is not private
         html = None
         meta = {}
         break
-
+'''
 
 if html is None or len(html) < 500:
     # Attempt 3: use the reader api
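Note on the disabled import: WebCrawler, warmup(), and the pip install pin above belong to the synchronous API of early crawl4ai releases. A minimal sketch of how that path is typically driven, assuming the old run()/result.markdown interface from the project's README of that era; the URL is a placeholder, not from this file:

from crawl4ai import WebCrawler  # early, pre-async crawl4ai API

# Create and warm up the crawler once at module level, as the
# now-commented-out code above did.
crawler = WebCrawler()
crawler.warmup()  # loads the models the crawler depends on

# Assumption: a fetch calls run() and reads the extracted markdown.
result = crawler.run(url="https://example.com/article")
print(result.markdown)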
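The cookies = { ... } block the first hunk leaves in place is the kind of snippet https://curlconverter.com generates: copy a request from the browser devtools as cURL, paste it in, and it emits ready-made requests code. A sketch of that output shape with placeholder values; the real pairs live in pages_helpers.py:

import requests

# Placeholder values; curlconverter.com fills in the real cookie and
# header pairs captured from the browser request.
cookies = {
    'session_id': 'PLACEHOLDER',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)',
    'Accept': 'text/html,application/xhtml+xml',
}

response = requests.get('https://example.com', cookies=cookies, headers=headers)
html = response.text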