eljanmahammadli committed
Commit d904dd4 · 1 Parent(s): 8b9c9ff

#perf: quality improvements to website scrape + PDF detect logic

Files changed (1)
  1. google_search.py +32 -31
google_search.py CHANGED
@@ -3,12 +3,10 @@ import time
 from googleapiclient.discovery import build
 import asyncio
 import httpx
-from bs4 import BeautifulSoup
 from dotenv import load_dotenv
-import html2text
 import requests
-import unicodedata
 import fitz
+from trafilatura import extract
 
 load_dotenv()
 
@@ -18,26 +16,6 @@ CSE_KEY = os.environ.get("GOOGLE_SEARCH_CSE_ID")
 # Number of pages to scrape
 NUM_PAGES = 10
 
-# load html2text and set up configs
-h2t = html2text.HTML2Text()
-h2t.bodywidth = 0 # No wrapping
-h2t.ignore_links = True # Ignore hyperlinks
-h2t.ignore_images = True # Ignore images
-h2t.ignore_emphasis = True # Ignore emphasis
-h2t.ignore_tables = False # Include tables
-h2t.skip_internal_links = True # Skip internal links
-h2t.skip_external_links = True # Skip external links
-h2t.single_line_break = True # Use single line breaks
-h2t.protect_links = True # Protect links from being split
-h2t.default_image_alt = "[image]" # Default alt text for images
-
-
-def clean_html(text):
-    text = h2t.handle(text)
-    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("ASCII") # Remove non-ASCII characters
-    return text
-
-
 def build_results_beautifulsoup(url_list):
     print("Starting to scrape URLs...")
     start_time = time.perf_counter()
@@ -59,8 +37,17 @@ def build_results_beautifulsoup(url_list):
 
         if soup:
             print(f"Processing URL: {url}")
-            text = clean_html(soup.text)
-            if len(text) > 500:
+
+            text = extract(
+                soup,
+                include_tables=False,
+                include_comments=False,
+                output_format="txt",
+            )
+            # If text is None or empty, log a warning and skip
+            if text is None:
+                print(f"Warning: Extraction returned None for URL: {url}")
+            elif len(text) > 500:
                 print(f"Adding content from URL: {url}, content length: {len(text)}")
                 result_content[url] = text
                 count += 1
@@ -126,13 +113,18 @@ async def get_url_data(url, client):
 
         if r.status_code == 200:
             content_type = r.headers.get("Content-Type", "").lower()
-            # detect if pdf
+            # Improved PDF detection using Content-Type and file extension
             if "application/pdf" in content_type or url.lower().endswith(".pdf"):
+                print(f"Detected PDF content via Content-Type or file extension at URL: {url}")
                 pdf_content = await extract_pdf_text(r.content)
-                return BeautifulSoup(pdf_content, "html.parser")
+                return pdf_content
             else:
-                return BeautifulSoup(r.content, "html.parser")
-    except Exception:
+                return r.content
+        else:
+            print(f"Non-200 response for URL: {url}, status code: {r.status_code}")
+            return None
+    except Exception as e:
+        print(f"Error fetching URL: {url}, Error: {str(e)}")
         return None
 
 
@@ -142,10 +134,19 @@ async def extract_pdf_text(content):
         text = ""
         for page in doc:
             text += page.get_text()
-        return f"<div>{text}</div>" # Wrap in a div to make it valid HTML
+        html_content = f"""
+        <!DOCTYPE html>
+        <html>
+            <body>
+                <p>{text}</p>
+            </body>
+        </html>
+        """
+        html_bytes = html_content.encode('utf-8')
+        return html_bytes # Return in such a format that is parsable by trafilatura
     except Exception as e:
         print(f"Error extracting PDF text: {str(e)}")
-        return "<div>Error extracting PDF text</div>"
+        return None
 
 
 async def parallel_scrap(urls):
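
For reviewers who want to try the new extraction path in isolation: below is a minimal sketch of exercising trafilatura's extract against a fetched page, mirroring the options used in the diff (include_tables=False, include_comments=False, output_format="txt"). The helper name, the synchronous httpx call, and the min_length parameter are illustrative assumptions, not part of the commit.

from typing import Optional

import httpx
from trafilatura import extract  # same extractor the diff switches to


def scrape_main_text(url: str, min_length: int = 500) -> Optional[str]:
    """Fetch a page and return its main text, or None if extraction fails or is too short.

    Illustrative helper, not part of the commit.
    """
    resp = httpx.get(url, follow_redirects=True, timeout=10)
    if resp.status_code != 200:
        print(f"Non-200 response for URL: {url}, status code: {resp.status_code}")
        return None
    # trafilatura.extract takes raw HTML and returns plain text, or None on failure
    text = extract(
        resp.text,
        include_tables=False,
        include_comments=False,
        output_format="txt",
    )
    if text is None or len(text) <= min_length:
        return None
    return text


if __name__ == "__main__":
    print(scrape_main_text("https://example.com"))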
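
The PDF side of the change detects PDFs via the Content-Type header or a .pdf extension, pulls the text out with PyMuPDF (fitz), and wraps it in minimal HTML bytes so the same trafilatura call can parse it. A rough standalone sketch of that idea follows; the function names are placeholders, and the fitz.open(stream=..., filetype="pdf") call is an assumption about how the unchanged part of extract_pdf_text opens the document.

from typing import Optional

import fitz  # PyMuPDF, already imported by google_search.py


def looks_like_pdf(content_type: str, url: str) -> bool:
    """Mirror the diff's check: Content-Type header or a .pdf file extension."""
    return "application/pdf" in content_type.lower() or url.lower().endswith(".pdf")


def pdf_bytes_to_html(content: bytes) -> Optional[bytes]:
    """Extract text from PDF bytes and wrap it in minimal HTML so trafilatura can consume it."""
    try:
        # Open the in-memory PDF and concatenate the text of all pages
        with fitz.open(stream=content, filetype="pdf") as doc:
            text = "".join(page.get_text() for page in doc)
        html_content = f"<!DOCTYPE html><html><body><p>{text}</p></body></html>"
        return html_content.encode("utf-8")
    except Exception as exc:
        print(f"Error extracting PDF text: {exc}")
        return None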