eljanmahammadli committed
Commit · d904dd4
Parent(s): 8b9c9ff
#perf: quality improvements to website scrape + PDF detect logic
Files changed: google_search.py (+32 -31)

google_search.py CHANGED
@@ -3,12 +3,10 @@ import time
 from googleapiclient.discovery import build
 import asyncio
 import httpx
-from bs4 import BeautifulSoup
 from dotenv import load_dotenv
-import html2text
 import requests
-import unicodedata
 import fitz
+from trafilatura import extract

 load_dotenv()

@@ -18,26 +16,6 @@ CSE_KEY = os.environ.get("GOOGLE_SEARCH_CSE_ID")
 # Number of pages to scrape
 NUM_PAGES = 10

-# load html2text and set up configs
-h2t = html2text.HTML2Text()
-h2t.bodywidth = 0  # No wrapping
-h2t.ignore_links = True  # Ignore hyperlinks
-h2t.ignore_images = True  # Ignore images
-h2t.ignore_emphasis = True  # Ignore emphasis
-h2t.ignore_tables = False  # Include tables
-h2t.skip_internal_links = True  # Skip internal links
-h2t.skip_external_links = True  # Skip external links
-h2t.single_line_break = True  # Use single line breaks
-h2t.protect_links = True  # Protect links from being split
-h2t.default_image_alt = "[image]"  # Default alt text for images
-
-
-def clean_html(text):
-    text = h2t.handle(text)
-    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("ASCII")  # Remove non-ASCII characters
-    return text
-
-
 def build_results_beautifulsoup(url_list):
     print("Starting to scrape URLs...")
     start_time = time.perf_counter()
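The hunk above drops the whole html2text configuration block and the unicodedata-based clean_html() helper; the commit calls trafilatura's extract() inline instead (next hunk). A minimal sketch of what an equivalent standalone helper would look like on top of trafilatura, where the helper name is hypothetical and not part of this commit:

    # Hypothetical drop-in replacement for the removed clean_html(), for illustration only.
    from trafilatura import extract

    def clean_html_with_trafilatura(html):
        # trafilatura pulls the main text and drops boilerplate, so no manual
        # html2text configuration or ASCII normalization is needed.
        text = extract(
            html,
            include_tables=False,
            include_comments=False,
            output_format="txt",
        )
        # extract() returns None when no usable content is found.
        return text or ""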
@@ -59,8 +37,17 @@ def build_results_beautifulsoup(url_list):

         if soup:
             print(f"Processing URL: {url}")
-
-
+
+            text = extract(
+                soup,
+                include_tables=False,
+                include_comments=False,
+                output_format="txt",
+            )
+            # If text is None or empty, log a warning and skip
+            if text is None:
+                print(f"Warning: Extraction returned None for URL: {url}")
+            elif len(text) > 500:
                 print(f"Adding content from URL: {url}, content length: {len(text)}")
                 result_content[url] = text
                 count += 1
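The new loop treats a None result from extract() as a failed extraction and only keeps pages whose extracted text is longer than 500 characters. A standalone sketch of that per-URL flow, assuming extract() is fed the raw HTML of the page (the diff passes a variable still named soup) and using an example URL:

    # Sketch only: exercises the same extract-then-filter logic outside the scraper.
    import requests
    from trafilatura import extract

    html = requests.get("https://example.com", timeout=10).text
    text = extract(html, include_tables=False, include_comments=False, output_format="txt")

    if text is None:
        print("Warning: Extraction returned None")
    elif len(text) > 500:
        print(f"Keeping content, length: {len(text)}")  # same length filter as the commit
    else:
        print(f"Skipping short extraction, length: {len(text)}")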
@@ -126,13 +113,18 @@ async def get_url_data(url, client):

         if r.status_code == 200:
             content_type = r.headers.get("Content-Type", "").lower()
-            #
+            # Improved PDF detection using Content-Type and file extension
             if "application/pdf" in content_type or url.lower().endswith(".pdf"):
+                print(f"Detected PDF content via Content-Type or file extension at URL: {url}")
                 pdf_content = await extract_pdf_text(r.content)
-                return
+                return pdf_content
             else:
-                return
-
+                return r.content
+        else:
+            print(f"Non-200 response for URL: {url}, status code: {r.status_code}")
+            return None
+    except Exception as e:
+        print(f"Error fetching URL: {url}, Error: {str(e)}")
         return None


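The detection rule above treats a response as a PDF when either the Content-Type header or the URL extension says so. The same condition, factored into a small predicate for illustration (the helper name is not part of the commit):

    # Hypothetical helper mirroring the commit's PDF-detection condition.
    def looks_like_pdf(url, content_type):
        content_type = content_type.lower()
        return "application/pdf" in content_type or url.lower().endswith(".pdf")

    print(looks_like_pdf("https://example.com/paper.pdf", "text/html"))                   # True: extension
    print(looks_like_pdf("https://example.com/doc", "application/pdf; charset=binary"))   # True: header
    print(looks_like_pdf("https://example.com/page", "text/html"))                        # False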
@@ -142,10 +134,19 @@ async def extract_pdf_text(content):
         text = ""
         for page in doc:
             text += page.get_text()
-
+        html_content = f"""
+        <!DOCTYPE html>
+        <html>
+        <body>
+        <p>{text}</p>
+        </body>
+        </html>
+        """
+        html_bytes = html_content.encode('utf-8')
+        return html_bytes  # Return in such a format that is parsable by trafilatura
     except Exception as e:
         print(f"Error extracting PDF text: {str(e)}")
-        return
+        return None


 async def parallel_scrap(urls):
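The hunk above changes extract_pdf_text() to return the PDF's text wrapped in minimal HTML bytes, so the result can flow through the same trafilatura extraction path as scraped web pages. A self-contained sketch of that round trip; how the real function opens the document is not shown in this diff, so fitz.open(stream=..., filetype="pdf") is an assumption:

    # Sketch: PyMuPDF text extraction -> minimal HTML wrapper -> trafilatura.
    import fitz
    from trafilatura import extract

    def pdf_bytes_to_text(pdf_bytes):
        # Assumption: the PDF is already in memory as bytes.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            text = "".join(page.get_text() for page in doc)
        html_content = f"<!DOCTYPE html><html><body><p>{text}</p></body></html>"
        # extract() may still return None if the wrapped text is very short.
        return extract(html_content, output_format="txt")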