tiendung committed
Commit 8d68b9b
Parent(s): db91fa3
Files changed (1)
  1. pages_helpers.py +5 -5
pages_helpers.py CHANGED
@@ -17,15 +17,14 @@ from text_utils import *
 
 from llm import *
 
-from mode_llm import llm_html_to_md, md_to_text, get_html_body_with_soup
 
-from crawl4ai import WebCrawler # pip install "crawl4ai @ git+https://github.com/unclecode/crawl4ai.git"
+# from crawl4ai import WebCrawler # pip install "crawl4ai @ git+https://github.com/unclecode/crawl4ai.git"
 
 # Create an instance of WebCrawler
-crawler = WebCrawler()
+# crawler = WebCrawler()
 
 # Warm up the crawler (load necessary models)
-crawler.warmup()
+# crawler.warmup()
 
 ## How to get cookies and headers: use https://curlconverter.com
 cookies = {
@@ -193,6 +192,7 @@ Your connection is not private
 
 
     meta = None
+    '''
     if html is None or len(html) < 500:
         # Attempt 2 using CRAWL4AI
        print("GET HTML CRAWL4AI", filename, flush=True)
@@ -223,7 +223,7 @@ Your connection is not private
             html = None
             meta = {}
             break
-
+    '''
 
     if html is None or len(html) < 500:
         # Attempt 3 using the reader api
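For reference, the lines commented out in the first hunk follow crawl4ai's original synchronous interface, the one pulled in by the pip-install hint in the removed import. Below is a minimal sketch of how that flow is normally driven, assuming the early `WebCrawler` API whose `run()` returns a result object exposing `success`, `html`, and `markdown`; the URL is illustrative, not taken from pages_helpers.py.

```python
# Sketch of the legacy crawl4ai flow this commit disables (early, pre-async API).
from crawl4ai import WebCrawler

crawler = WebCrawler()   # create an instance of WebCrawler
crawler.warmup()         # load the models it needs before the first crawl

result = crawler.run(url="https://example.com/article")  # illustrative URL
if result.success and result.html:
    html = result.html          # raw page HTML
    markdown = result.markdown  # cleaned markdown extraction
```

Commenting these lines out means those models are no longer loaded at import time; the `'''` wrappers added in the later hunks switch off the matching runtime fallback.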
 
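The unchanged context around the first hunk builds `cookies` and `headers` dictionaries captured via https://curlconverter.com (paste a browser "Copy as cURL" command and it emits ready-made Python dicts). A hedged sketch of how such dicts are typically passed to `requests` for the first fetch attempt; the URL, keys, and timeout below are placeholders, not values from pages_helpers.py.

```python
import requests

# Placeholder dicts; in practice they are pasted from https://curlconverter.com
# after copying the request as cURL from the browser's network tab.
cookies = {
    "session_id": "xxx",
}
headers = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "en-US,en;q=0.9",
}

# First attempt: a plain requests fetch with the captured cookies/headers.
resp = requests.get("https://example.com/page", cookies=cookies, headers=headers, timeout=30)
html = resp.text if resp.ok else None
```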
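The second and third hunks put a bare `'''` before and after the whole crawl4ai retry. In Python, a triple-quoted string used as a statement is simply evaluated and discarded, so the block stays in the file for reference but never executes, and control drops straight to the third attempt (the reader api). A standalone illustration of the pattern; the variable values and the final print are made up for the example, not copied from pages_helpers.py.

```python
html = None                  # pretend the first fetch attempt returned nothing
filename = "example.html"    # hypothetical filename, used only for the log line

'''
# Attempt 2 using CRAWL4AI -- after this commit the block is wrapped in a
# string literal: Python builds the string and throws it away, nothing runs.
if html is None or len(html) < 500:
    print("GET HTML CRAWL4AI", filename, flush=True)
'''

# Execution falls through directly to the next fallback (attempt 3).
if html is None or len(html) < 500:
    print("no usable HTML yet, moving on to the reader api", filename, flush=True)
```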