# NOTE(review): the lines previously here were Hugging Face Spaces file-viewer
# chrome (commit hashes, a line-number gutter, "Runtime error" banners)
# captured during extraction — not source code. Removed so the file parses.
import time
from googleapiclient.discovery import build
import asyncio
import httpx
from bs4 import BeautifulSoup
import justext
import newspaper
def clean_html(text):
    """Extract a readable title plus main body text from raw HTML.

    Uses newspaper3k for title extraction and jusText for boilerplate
    (navigation/ads) removal.

    Args:
        text: Raw HTML markup as a string.

    Returns:
        The article title (when one was found) followed by the
        non-boilerplate paragraph texts, newline-separated.
    """
    # newspaper requires a URL at construction time; a blank placeholder
    # lets us parse pre-fetched HTML via set_html() instead of downloading.
    article = newspaper.Article(url=" ")
    article.set_html(text)
    article.parse()

    parts = []
    # BUG FIX: title can be None/"" for pages without one — the original
    # `article.title + "\n"` raised TypeError on None.
    if article.title:
        parts.append(article.title)

    paragraphs = justext.justext(text, justext.get_stoplist("English"))
    # Keep only content paragraphs; jusText flags chrome as boilerplate.
    parts.extend(p.text for p in paragraphs if not p.is_boilerplate)

    # Join with newlines — the original concatenated paragraphs with no
    # separator, so adjacent paragraphs ran together.
    return "\n".join(parts)
# Full English month name -> zero-padded two-digit month number,
# used when building Google CSE date-restriction strings.
months = {
    name: f"{number:02d}"
    for number, name in enumerate(
        (
            "January", "February", "March", "April", "May", "June",
            "July", "August", "September", "October", "November", "December",
        ),
        start=1,
    )
}

# Common top-level domains recognized when filtering search results.
domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
def build_date(year=2024, month="March", day=1):
    """Format a calendar date as the "YYYYMMDD" string Google CSE's
    ``sort=date:r:`` restriction expects.

    Args:
        year: Four-digit year.
        month: Full English month name, e.g. "March".
        day: Day of the month (1-31).

    Returns:
        Zero-padded "YYYYMMDD" string, e.g. "20240301".

    Raises:
        ValueError: If the month name or the date is invalid.
    """
    from datetime import datetime

    # BUG FIX: the original f-string left single-digit days unpadded,
    # producing 7-character strings like "202431" that Google's date-range
    # sort rejects. strftime zero-pads both month and day; %B parses the
    # English month name (assumes the default C locale).
    month_number = datetime.strptime(month, "%B").month
    return datetime(year, month_number, day).strftime("%Y%m%d")
async def get_url_data(url, client):
    """Fetch *url* with the given async httpx client and parse the body.

    Best-effort by design: any request/parse failure, or a non-200
    response, yields None instead of raising.

    Args:
        url: Absolute URL to fetch.
        client: An httpx.AsyncClient to issue the request with.

    Returns:
        A BeautifulSoup document on HTTP 200, otherwise None.
    """
    try:
        response = await client.get(url)
        if response.status_code == 200:
            return BeautifulSoup(response.content, "html.parser")
    except Exception:
        # Swallow network/parse errors; the caller treats None as "skip".
        return None
    return None
async def parallel_scrap(urls):
    """Fetch all *urls* concurrently with a shared async client.

    Args:
        urls: Iterable of URL strings.

    Returns:
        List parallel to *urls*: BeautifulSoup documents or None per URL
        (and, because of return_exceptions=True, possibly Exception
        instances for failures raised outside get_url_data's handler).
    """
    async with httpx.AsyncClient(timeout=30) as client:
        fetches = [get_url_data(url=target, client=client) for target in urls]
        return await asyncio.gather(*fetches, return_exceptions=True)
def scrap(urls):
    """Synchronous fallback scraper: fetch each URL in turn.

    BUG FIX: the original appended un-awaited coroutines from the async
    get_url_data() to the result list — callers received coroutine objects
    and no request was ever made — and the httpx.Client was never closed.
    This version performs real synchronous requests and closes the client
    via a context manager.

    Args:
        urls: Iterable of URL strings.

    Returns:
        List parallel to *urls*: a BeautifulSoup document for each page
        fetched with HTTP 200, None for non-200 responses or errors.
    """
    soups = []
    # timeout matches parallel_scrap for consistent behavior.
    with httpx.Client(timeout=30) as client:
        for url in urls:
            try:
                response = client.get(url)
                soups.append(
                    BeautifulSoup(response.content, "html.parser")
                    if response.status_code == 200
                    else None
                )
            except Exception:
                # Best-effort: any network/parse error yields None.
                soups.append(None)
    return soups
def google_search_urls(
text,
sorted_date,
domains_to_include,
api_key,
cse_id,
**kwargs,
):
service = build("customsearch", "v1", developerKey=api_key)
results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
url_list = []
if "items" in results and len(results["items"]) > 0:
for count, link in enumerate(results["items"]):
# skip user selected domains
if (domains_to_include is None) or not any(
("." + domain) in link["link"] for domain in domains_to_include
):
continue
url = link["link"]
if url not in url_list:
url_list.append(url)
return url_list
def google_search(
    topic,
    sorted_date,
    domains_to_include,
):
    """Search Google for *topic*, scrape the top results, and return their text.

    Args:
        topic: Search query string.
        sorted_date: CSE ``sort`` parameter (e.g. a date-range restriction).
        domains_to_include: Optional list of TLDs to keep; None keeps all.

    Returns:
        Dict mapping URL -> cleaned page text for up to the first 3
        successfully scraped results.
    """
    import os

    # SECURITY: credentials were hard-coded here (with several spare keys
    # left in comments — now removed). Prefer environment variables; the
    # literal fallback keeps existing deployments working, but the exposed
    # key should be rotated and the fallback dropped. TODO: rotate key.
    api_key = os.environ.get("GOOGLE_API_KEY", "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g")
    cse_id = os.environ.get("GOOGLE_CSE_ID", "851813e81162b4ed4")

    # Get the list of URLs to check.
    start_time = time.perf_counter()
    url_list = google_search_urls(
        topic,
        sorted_date,
        domains_to_include,
        api_key,
        cse_id,
    )
    print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)

    # Scrape the URLs concurrently.
    start_time = time.perf_counter()
    soups = asyncio.run(parallel_scrap(url_list))
    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)

    result_content = {}
    num_pages = 3  # cap on how many pages we keep
    count = 0
    for url, soup in zip(url_list, soups):
        if count >= num_pages:
            break
        # BUG FIX: parallel_scrap uses gather(return_exceptions=True), so an
        # entry may be an Exception instance — which is truthy, so the
        # original `if soup:` let it through and crashed on `soup.text`.
        if soup and not isinstance(soup, Exception):
            result_content[url] = clean_html(soup.text)
            count += 1
    return result_content