eljanmahammadli committed on
Commit
bf1e0a0
·
1 Parent(s): 6f4a113

#fix: added headers that bypass response code 418

Browse files
Files changed (1) hide show
  1. google_search.py +6 -1
google_search.py CHANGED
@@ -122,6 +122,8 @@ def build_date(year=2024, month="March", day=1):
122
  async def get_url_data(url, client):
123
  try:
124
  r = await client.get(url, follow_redirects=True)
 
 
125
  if r.status_code == 200:
126
  content_type = r.headers.get("Content-Type", "").lower()
127
  # detect if pdf
@@ -147,7 +149,10 @@ async def extract_pdf_text(content):
147
 
148
 
149
  async def parallel_scrap(urls):
150
- async with httpx.AsyncClient(timeout=30) as client:
 
 
 
151
  tasks = []
152
  for url in urls:
153
  tasks.append(get_url_data(url=url, client=client))
 
122
  async def get_url_data(url, client):
123
  try:
124
  r = await client.get(url, follow_redirects=True)
125
+ print(f"URL: {url}, Response Code: {r.status_code}")
126
+
127
  if r.status_code == 200:
128
  content_type = r.headers.get("Content-Type", "").lower()
129
  # detect if pdf
 
149
 
150
 
151
  async def parallel_scrap(urls):
152
+ headers = {
153
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
154
+ }
155
+ async with httpx.AsyncClient(timeout=30, headers=headers) as client:
156
  tasks = []
157
  for url in urls:
158
  tasks.append(get_url_data(url=url, client=client))