import asyncio
import time

import httpx
import justext
import newspaper
from bs4 import BeautifulSoup
from googleapiclient.discovery import build


def clean_html(text):
    """Extract the title and paragraph text from raw HTML."""
    result = ""
    # newspaper requires a URL at construction time; a placeholder is enough
    # because the HTML is supplied directly via set_html().
    article = newspaper.Article(url=" ")
    article.set_html(text)
    article.parse()
    result += article.title + "\n"
    paragraphs = justext.justext(text, justext.get_stoplist("English"))
    for paragraph in paragraphs:
        # Separate paragraphs with newlines so extracted text does not run together.
        result += paragraph.text + "\n"
    return result


months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}

domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]


def build_date(year=2024, month="March", day=1):
    # Zero-pad the day so the result is always an 8-digit YYYYMMDD string.
    return f"{year}{months[month]}{day:02d}"


async def get_url_data(url, client):
    """Fetch a URL and return a parsed BeautifulSoup object, or None on failure."""
    try:
        r = await client.get(url)
        if r.status_code == 200:
            return BeautifulSoup(r.content, "html.parser")
    except Exception:
        pass
    return None


async def parallel_scrap(urls):
    """Fetch all URLs concurrently with a shared async HTTP client."""
    async with httpx.AsyncClient(timeout=30) as client:
        tasks = [get_url_data(url=url, client=client) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return results


def scrap(urls):
    """Synchronous fallback scraper; returns a soup (or None) per URL."""
    soups = []
    with httpx.Client(timeout=30) as client:
        for url in urls:
            try:
                r = client.get(url)
                soups.append(
                    BeautifulSoup(r.content, "html.parser") if r.status_code == 200 else None
                )
            except Exception:
                soups.append(None)
    return soups


def google_search_urls(
    text,
    sorted_date,
    domains_to_include,
    api_key,
    cse_id,
    **kwargs,
):
    service = build("customsearch", "v1", developerKey=api_key)
    results = (
        service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
    )
    url_list = []
    if "items" in results and len(results["items"]) > 0:
        for item in results["items"]:
            # Keep only URLs whose domain suffix is in the user-selected list;
            # if no list is given, keep everything.
            if domains_to_include is not None and not any(
                ("." + domain) in item["link"] for domain in domains_to_include
            ):
                continue
            url = item["link"]
            if url not in url_list:
                url_list.append(url)
    return url_list


def google_search(
    topic,
    sorted_date,
    domains_to_include,
):
    api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
    cse_id = "851813e81162b4ed4"

    # Get the list of URLs to check.
    start_time = time.perf_counter()
    url_list = google_search_urls(
        topic,
        sorted_date,
        domains_to_include,
        api_key,
        cse_id,
    )
    print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)

    # Scrape the URLs concurrently.
    start_time = time.perf_counter()
    soups = asyncio.run(parallel_scrap(url_list))
    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)

    # Keep the cleaned text of the first num_pages successfully scraped pages.
    result_content = {}
    num_pages = 3
    count = 0
    for url, soup in zip(url_list, soups):
        if count >= num_pages:
            break
        if soup:
            # clean_html expects HTML markup, so pass the full document
            # rather than the tag-stripped soup.text.
            result_content[url] = clean_html(str(soup))
            count += 1
    return result_content
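

# Minimal usage sketch, assuming the credentials above are valid, that
# `sorted_date` uses the Custom Search "date:r:YYYYMMDD:YYYYMMDD" sort syntax
# built from build_date(), and that the query and domain list below are
# purely illustrative.
if __name__ == "__main__":
    # Restrict results to pages indexed during March 2024.
    date_restrict = (
        f"date:r:{build_date(2024, 'March', 1)}:{build_date(2024, 'March', 31)}"
    )
    pages = google_search(
        topic="large language model detection",  # hypothetical query
        sorted_date=date_restrict,
        domains_to_include=["com", "org", "edu"],  # subset of domain_list
    )
    for url, text in pages.items():
        print(url, "->", len(text), "characters of cleaned text")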