import asyncio
import time

import httpx
import justext
import newspaper
from bs4 import BeautifulSoup
from googleapiclient.discovery import build


def clean_html(text):
    """Extract the title and main body text from raw HTML, dropping
    paragraphs that justext classifies as boilerplate."""
    result = ""
    # newspaper needs an Article instance; the URL is unused because the HTML
    # is supplied directly via set_html().
    article = newspaper.Article(url=" ")
    article.set_html(text)
    article.parse()
    result += article.title + "\n"
    paragraphs = justext.justext(text, justext.get_stoplist("English"))
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            result += paragraph.text
    return result


months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}

domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]


def build_date(year=2024, month="March", day=1):
    # Format as YYYYMMDD; zero-pad the day so e.g. 1 becomes "01".
    return f"{year}{months[month]}{int(day):02d}"
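

# Illustrative usage (assumption: the "date:r:YYYYMMDD:YYYYMMDD" form shown
# here is the Custom Search date-range sort expression expected by callers;
# the dates themselves are placeholders):
#   build_date(2024, "January", 1)  -> "20240101"
#   f"date:r:{build_date(2024, 'January', 1)}:{build_date(2024, 'March', 5)}"
#                                   -> "date:r:20240101:20240305"
# The resulting string can be passed as the sorted_date argument of
# google_search_urls() below.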


async def get_url_data(url, client):
    """Fetch one URL and return a parsed BeautifulSoup document, or None on a
    non-200 response or any request error."""
    try:
        r = await client.get(url)
        if r.status_code == 200:
            return BeautifulSoup(r.content, "html.parser")
    except Exception:
        pass
    return None


async def parallel_scrap(urls):
    """Fetch all URLs concurrently; the result list is aligned with `urls` and
    holds a BeautifulSoup document or None for each entry."""
    async with httpx.AsyncClient(timeout=30) as client:
        tasks = [get_url_data(url=url, client=client) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
    return results


def google_search_urls(
    text,
    sorted_date,
    domains_to_skip,
    api_key,
    cse_id,
    **kwargs,
):
    """Run a Custom Search query and return the result URLs, skipping
    user-selected domains and dropping duplicates."""
    service = build("customsearch", "v1", developerKey=api_key)
    results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
    url_list = []
    for item in results.get("items", []):
        url = item["link"]
        # skip user-selected domains
        if domains_to_skip and any(("." + domain) in url for domain in domains_to_skip):
            continue
        if url not in url_list:
            url_list.append(url)
    return url_list


def google_search(
    input,
    sorted_date,
    domains_to_skip,
):
    """Search Google for the input text, scrape the top results concurrently,
    and return a {url: cleaned_text} dict for the first few usable pages."""
    # NOTE: credentials are hard-coded; in a real deployment they should be
    # loaded from environment variables or a secrets store instead.
    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
    cse_id = "851813e81162b4ed4"

    # Get the list of URLs to check.
    start_time = time.perf_counter()
    url_list = google_search_urls(
        input,
        sorted_date,
        domains_to_skip,
        api_key,
        cse_id,
    )
    print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
    # Scrape the URLs in parallel.
    start_time = time.perf_counter()
    soups = asyncio.run(parallel_scrap(url_list))
    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
    # Keep the first num_pages pages that were fetched successfully.
    result_content = {}
    num_pages = 3
    count = 0
    for url, soup in zip(url_list, soups):
        if count >= num_pages:
            break
        if soup:
            # clean_html() expects raw HTML, so pass the serialized document
            # rather than soup.text (which would strip the markup first).
            text = clean_html(str(soup))
            result_content[url] = text
            count += 1
    # for key, value in result_content.items():
    #     print("-------------------URL: ", key)
    #     print(value[:30])
    return result_content
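

# Minimal usage sketch (assumptions: the query string, date range, and skipped
# domains below are illustrative placeholders, not values used by this project).
if __name__ == "__main__":
    date_range = f"date:r:{build_date(2024, 'January', 1)}:{build_date(2024, 'March', 1)}"
    pages = google_search(
        "large language models",
        date_range,
        domains_to_skip=["gov", "mil"],
    )
    for url, text in pages.items():
        print(url, "->", len(text), "characters of cleaned text")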