article_writer / google_search.py
eljanmahammadli's picture
chore: increase of numbers to scrape; disabled PDF check in scholar model
a6fbfb6
raw
history blame
7.66 kB
import asyncio
import html
import os
import time
from urllib.parse import urlsplit

import fitz
import httpx
import requests
from dotenv import load_dotenv
from googleapiclient.discovery import build
from trafilatura import extract
# Populate the process environment from a .env file (python-dotenv) before any
# key below is read.
load_dotenv()
# Google Custom Search credentials; each is None when its variable is unset.
# NOTE(review): google_search() below re-reads the same variables itself and
# does not reference these two constants.
API_KEY = os.environ.get("GOOGLE_SEARCH_API_KEY")
CSE_KEY = os.environ.get("GOOGLE_SEARCH_CSE_ID")
# Number of pages to scrape: the build_results_* functions stop collecting
# once this many pages have been kept.
NUM_PAGES = 20
def build_results_beautifulsoup(url_list):
    """Download every URL and return ``{url: extracted_text}``.

    All URLs are fetched concurrently through ``parallel_scrap``; each
    downloaded page is then passed to trafilatura's ``extract`` (despite the
    function name, no BeautifulSoup object is involved). A page is kept only
    when its extracted text is longer than 500 characters, and collection
    stops once ``NUM_PAGES`` pages have been kept.

    Uses ``asyncio.run``, so it must not be called from inside a running
    event loop.
    """
    print("Starting to scrape URLs...")
    started_at = time.perf_counter()
    # Results come back in the same order as url_list, so they can be zipped.
    pages = asyncio.run(parallel_scrap(url_list))
    elapsed = time.perf_counter() - started_at
    print(f"Scraping processing time: {elapsed:.2f} seconds")

    collected = {}
    kept = 0
    print("Starting to process each URL...")
    for url, page in zip(url_list, pages):
        if kept >= NUM_PAGES:
            print(f"Reached the limit of {NUM_PAGES} pages. Stopping processing.")
            break

        # Failed downloads arrive as None (or empty bytes).
        if not page:
            print(f"Skipped URL: {url}, no soup content available.")
            continue

        print(f"Processing URL: {url}")
        text = extract(
            page,
            include_tables=False,
            include_comments=False,
            output_format="txt",
        )

        if text is None:
            print(f"Warning: Extraction returned None for URL: {url}")
            continue

        if len(text) <= 500:
            print(f"Skipped URL: {url}, content too short (length: {len(text)})")
            continue

        print(f"Adding content from URL: {url}, content length: {len(text)}")
        collected[url] = text
        kept += 1

    print("Finished processing URLs.")
    return collected
def build_results_extractor(url_list):
    """Return ``{url: text}`` using the extractorapi.com service.

    Each URL is sent to the Extractor API (key taken from the
    ``EXTRACTOR_API_KEY`` environment variable). A page is kept when the API
    answers 200 and the returned text is longer than 500 characters;
    collection stops after ``NUM_PAGES`` kept pages.

    Any failure -- a 403 from the API, a network error, a timeout, or an
    unexpected response body -- is printed and the whole list is re-processed
    with ``build_results_beautifulsoup`` instead.
    """
    try:
        endpoint = "https://extractorapi.com/api/v1/extractor"
        # The key does not change between iterations, so look it up once.
        api_key = os.environ.get("EXTRACTOR_API_KEY")
        result_content = {}
        count = 0
        for url in url_list:
            if count >= NUM_PAGES:
                break
            params = {"apikey": api_key, "url": url}
            # Without a timeout requests waits indefinitely, so a stalled API
            # would block forever and the fallback below would never run.
            # 30 s matches the timeout used by the httpx scraper in this file.
            r = requests.get(endpoint, params=params, timeout=30)
            if r.status_code == 403:
                raise Exception("Error with API; using default implementaion instead")
            if r.status_code == 200:
                text = r.json()["text"]
                if len(text) > 500:
                    result_content[url] = text
                    count += 1
        return result_content
    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to the
        # local scraper rather than failing the caller.
        print(e)
        return build_results_beautifulsoup(url_list)
# English month name -> zero-padded two-digit month number, used by build_date.
months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}

# Domain suffixes passed as ``domains_to_include`` by the __main__ smoke test.
domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]

# Hosts presumably meant to be excluded from search results.
# NOTE(review): google_search() in this file does not pass this list on to
# google_search_urls(), so it currently has no effect there.
skip_urls = ["youtube.com", "twitter.com", "facebook.com", "instagram.com", "x.com"]


def build_date(year=2024, month="March", day=1):
    """Return the given date as a compact ``YYYYMMDD`` string.

    Args:
        year: Year, inserted as-is.
        month: English month name; must be a key of ``months``.
        day: Day of month as an int or a numeric string.

    Returns:
        For example ``"20240301"`` for the defaults.

    Raises:
        KeyError: if ``month`` is not a key of ``months``.
    """
    # The day must be zero-padded like the month; otherwise 1 March 2024
    # renders as "2024031", which is not a valid eight-digit date.
    return f"{year}{months[month]}{int(day):02d}"
async def get_url_data(url, client):
    """Fetch ``url`` with ``client`` and return its body, or None on failure.

    ``client`` must provide an awaitable ``get(url, follow_redirects=True)``.
    For a 200 response the raw body is returned, except when the response
    looks like a PDF (by ``Content-Type`` header or a ``.pdf`` URL suffix):
    then the body is converted through ``extract_pdf_text``. Any other status
    code, and any exception raised while fetching, yields None.
    """
    try:
        response = await client.get(url, follow_redirects=True)
        print(f"URL: {url}, Response Code: {response.status_code}")

        if response.status_code != 200:
            print(f"Non-200 response for URL: {url}, status code: {response.status_code}")
            return None

        content_type = response.headers.get("Content-Type", "").lower()
        # PDF detection uses the Content-Type header first, then the extension.
        is_pdf = "application/pdf" in content_type or url.lower().endswith(".pdf")
        if not is_pdf:
            return response.content

        print(f"Detected PDF content via Content-Type or file extension at URL: {url}")
        return await extract_pdf_text(response.content)
    except Exception as e:
        print(f"Error fetching URL: {url}, Error: {str(e)}")
        return None
async def extract_pdf_text(content):
    """Convert raw PDF bytes into a minimal UTF-8 HTML document.

    The text of every page is concatenated and wrapped in a single ``<p>``
    element so the result can go through the same HTML extraction step as
    ordinary pages.

    Args:
        content: The PDF file as bytes.

    Returns:
        The HTML document encoded as UTF-8 bytes, or None when the PDF cannot
        be opened or read (the error is printed).
    """
    try:
        with fitz.open(stream=content, filetype="pdf") as doc:
            text = "".join(page.get_text() for page in doc)
        # PDF text is arbitrary: a literal '<', '>' or '&' (formulas, code,
        # citations) would otherwise be parsed as markup and silently
        # corrupt or drop content during HTML extraction.
        body = html.escape(text, quote=False)
        html_content = (
            "\n<!DOCTYPE html>\n<html>\n<body>\n"
            f"<p>{body}</p>\n"
            "</body>\n</html>\n"
        )
        # Return in such a format that is parsable by trafilatura
        return html_content.encode("utf-8")
    except Exception as e:
        print(f"Error extracting PDF text: {str(e)}")
        return None
async def parallel_scrap(urls):
    """Fetch all ``urls`` concurrently and return their results in input order.

    One shared ``httpx.AsyncClient`` (30 second timeout, browser-like
    User-Agent header) is used for every request. Each entry of the returned
    list is whatever ``get_url_data`` produced for the URL at the same
    position; because ``gather`` runs with ``return_exceptions=True``, an
    exception escaping a task is returned in its slot instead of being raised.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    async with httpx.AsyncClient(timeout=30, headers=browser_headers) as client:
        pending = [get_url_data(url=url, client=client) for url in urls]
        return await asyncio.gather(*pending, return_exceptions=True)
def scrap(urls):
    """Fetch ``urls`` one after another and return their contents in order.

    Sequential counterpart of ``parallel_scrap``: each entry of the returned
    list is the result of ``get_url_data`` for the URL at the same position
    (body bytes, or None on failure).

    ``get_url_data`` is a coroutine function that awaits ``client.get``, so it
    must be awaited and given an async client. Calling it from plain
    synchronous code with ``httpx.Client`` only produced a list of un-awaited
    coroutine objects and left the client open.

    Uses ``asyncio.run``, so it must not be called from inside a running
    event loop.
    """
    if not urls:
        return []

    async def _fetch_all():
        # The context manager guarantees the client is closed afterwards.
        async with httpx.AsyncClient() as client:
            return [await get_url_data(url=url, client=client) for url in urls]

    return asyncio.run(_fetch_all())
# Limits of the Custom Search JSON API: at most 10 results per request, and
# the documented rule that start + num must not exceed 100.
_CSE_MAX_PAGE_SIZE = 10
_CSE_MAX_WINDOW = 100


def _hostname(url):
    """Return the lower-case host of ``url``, or "" when it has none."""
    try:
        return urlsplit(url).hostname or ""
    except ValueError:
        return ""


def _is_skipped(url, skip_urls):
    """Return True when ``url`` is listed verbatim or hosted on a listed domain."""
    if url in skip_urls:
        return True
    host = _hostname(url)
    if not host:
        return False
    for entry in skip_urls:
        domain = entry.lower()
        # The leading dot keeps "x.com" from matching "netflix.com".
        if domain and (host == domain or host.endswith("." + domain)):
            return True
    return False


def google_search_urls(
    text,
    sorted_date,
    domains_to_include,
    api_key,
    cse_id,
    num_results=10,  # Number of results to fetch per page
    total_results=30,  # Total number of results to fetch
    skip_urls=None,  # List of URLs to skip
    **kwargs,
):
    """Collect up to ``total_results`` unique result URLs from Google CSE.

    Args:
        text: Query string.
        sorted_date: Value for the API's ``sort`` parameter.
        domains_to_include: Domain suffixes without the leading dot; a URL is
            kept when ``"." + suffix`` occurs anywhere in it. None disables
            the filter.
        api_key: Google API developer key.
        cse_id: Custom Search Engine id.
        num_results: Results requested per page; clamped to the API's range
            of 1..10.
        total_results: Maximum number of URLs to return.
        skip_urls: URLs to drop. An entry may be an exact URL or a bare
            domain such as ``"youtube.com"``; a bare domain also drops its
            subdomains.
        **kwargs: Extra parameters forwarded to ``cse().list``.

    Returns:
        URLs in result order, without duplicates, at most ``total_results``.
    """
    if total_results <= 0:
        # Nothing requested: avoid building a client and spending API quota.
        return []
    if skip_urls is None:
        skip_urls = []

    page_size = max(1, min(num_results, _CSE_MAX_PAGE_SIZE))
    service = build("customsearch", "v1", developerKey=api_key)
    url_list = []
    seen = set()
    start_index = 1  # The API's result index is 1-based.

    while len(url_list) < total_results:
        # Stay inside the API's result window. Without this bound a query
        # whose results are mostly filtered out keeps paging until the API
        # rejects the request with an HTTP error.
        requested = min(
            page_size,
            total_results - len(url_list),
            _CSE_MAX_WINDOW - start_index,
        )
        if requested < 1:
            break

        results = service.cse().list(
            q=text,
            cx=cse_id,
            sort=sorted_date,
            start=start_index,
            num=requested,
            **kwargs,
        ).execute()

        items = results.get("items")
        if not items:
            # No more results
            break

        for item in items:
            url = item["link"]
            if url in seen or _is_skipped(url, skip_urls):
                continue
            if (domains_to_include is None) or any(
                ("." + domain) in url for domain in domains_to_include
            ):
                seen.add(url)
                url_list.append(url)

        # Advance by what was actually requested. Advancing by a full page
        # after a shorter request would silently skip results.
        start_index += requested

    return url_list[:total_results]
def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
    """Search Google for ``topic`` and return ``{url: extracted_text}``.

    Credentials are read from the ``GOOGLE_SEARCH_API_KEY`` and
    ``GOOGLE_SEARCH_CSE_ID`` environment variables. The URLs returned by
    ``google_search_urls`` are then scraped with
    ``build_results_beautifulsoup``.

    ``scholar_mode_check`` is accepted but currently unused: the
    ``-filetype:pdf`` query suffix it used to add is disabled.
    """
    key = os.environ.get("GOOGLE_SEARCH_API_KEY")
    engine_id = os.environ.get("GOOGLE_SEARCH_CSE_ID")
    search_started = time.perf_counter()

    found_urls = google_search_urls(
        topic,
        sorted_date,
        domains_to_include,
        key,
        engine_id,
    )

    # Diagnostic dump of the search results.
    print("---")
    print(len(found_urls))
    print(found_urls)
    print("---")
    print("Google Search processing time: ", time.perf_counter() - search_started)

    return build_results_beautifulsoup(found_urls)
if __name__ == "__main__":
    # Manual smoke test: search 2024 results on the default domain list and
    # report which pages produced usable text.
    scraped = google_search(
        "Low Resource ",
        "date:r:20240101:20241231",
        domain_list,
        False,
    )
    print(scraped.keys())
    print(len(scraped))