Spaces:
Runtime error
Runtime error
eljanmahammadli
committed on
Commit
·
bf1e0a0
1
Parent(s):
6f4a113
#fix added headers that bypass response code 418
Browse files
- google_search.py +6 -1
google_search.py
CHANGED
@@ -122,6 +122,8 @@ def build_date(year=2024, month="March", day=1):
|
|
122 |
async def get_url_data(url, client):
|
123 |
try:
|
124 |
r = await client.get(url, follow_redirects=True)
|
|
|
|
|
125 |
if r.status_code == 200:
|
126 |
content_type = r.headers.get("Content-Type", "").lower()
|
127 |
# detect if pdf
|
@@ -147,7 +149,10 @@ async def extract_pdf_text(content):
|
|
147 |
|
148 |
|
149 |
async def parallel_scrap(urls):
|
150 |
-
|
|
|
|
|
|
|
151 |
tasks = []
|
152 |
for url in urls:
|
153 |
tasks.append(get_url_data(url=url, client=client))
|
|
|
122 |
async def get_url_data(url, client):
|
123 |
try:
|
124 |
r = await client.get(url, follow_redirects=True)
|
125 |
+
print(f"URL: {url}, Response Code: {r.status_code}")
|
126 |
+
|
127 |
if r.status_code == 200:
|
128 |
content_type = r.headers.get("Content-Type", "").lower()
|
129 |
# detect if pdf
|
|
|
149 |
|
150 |
|
151 |
async def parallel_scrap(urls):
|
152 |
+
headers = {
|
153 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
|
154 |
+
}
|
155 |
+
async with httpx.AsyncClient(timeout=30, headers=headers) as client:
|
156 |
tasks = []
|
157 |
for url in urls:
|
158 |
tasks.append(get_url_data(url=url, client=client))
|