import asyncio
import time

import httpx
import justext
import newspaper
from bs4 import BeautifulSoup
from googleapiclient.discovery import build


def clean_html(text):
    """Extract the title and main body text from raw HTML, dropping
    paragraphs that justext classifies as boilerplate."""
    result = ""
    # newspaper needs an Article instance; the URL is unused because the HTML
    # is supplied directly via set_html().
    article = newspaper.Article(url=" ")
    article.set_html(text)
    article.parse()
    result += article.title + "\n"
    paragraphs = justext.justext(text, justext.get_stoplist("English"))
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            result += paragraph.text
    return result


months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}

domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]


def build_date(year=2024, month="March", day=1):
    # Format as YYYYMMDD; zero-pad the day so e.g. 1 becomes "01".
    return f"{year}{months[month]}{int(day):02d}"
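

# Illustrative usage (assumption: the "date:r:YYYYMMDD:YYYYMMDD" form shown
# here is the Custom Search date-range sort expression expected by callers;
# the dates themselves are placeholders):
#   build_date(2024, "January", 1)  -> "20240101"
#   f"date:r:{build_date(2024, 'January', 1)}:{build_date(2024, 'March', 5)}"
#                                   -> "date:r:20240101:20240305"
# The resulting string can be passed as the sorted_date argument of
# google_search_urls() below.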


async def get_url_data(url, client):
    """Fetch one URL and return a parsed BeautifulSoup document, or None on a
    non-200 response or any request error."""
    try:
        r = await client.get(url)
        if r.status_code == 200:
            return BeautifulSoup(r.content, "html.parser")
    except Exception:
        pass
    return None


async def parallel_scrap(urls):
    """Fetch all URLs concurrently; the result list is aligned with `urls` and
    holds a BeautifulSoup document or None for each entry."""
    async with httpx.AsyncClient(timeout=30) as client:
        tasks = [get_url_data(url=url, client=client) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
    return results


def google_search_urls(
    text,
    sorted_date,
    domains_to_skip,
    api_key,
    cse_id,
    **kwargs,
):
    """Run a Custom Search query and return the result URLs, skipping
    user-selected domains and dropping duplicates."""
    service = build("customsearch", "v1", developerKey=api_key)
    results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
    url_list = []
    for item in results.get("items", []):
        url = item["link"]
        # skip user-selected domains
        if domains_to_skip and any(("." + domain) in url for domain in domains_to_skip):
            continue
        if url not in url_list:
            url_list.append(url)
    return url_list


def google_search(
    input,
    sorted_date,
    domains_to_skip,
):
    """Search Google for the input text, scrape the top results concurrently,
    and return a {url: cleaned_text} dict for the first few usable pages."""
    # NOTE: credentials are hard-coded; in a real deployment they should be
    # loaded from environment variables or a secrets store instead.
    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
    cse_id = "851813e81162b4ed4"

    # Get the list of URLs to check.
    start_time = time.perf_counter()
    url_list = google_search_urls(
        input,
        sorted_date,
        domains_to_skip,
        api_key,
        cse_id,
    )
    print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
    # Scrape the URLs in parallel.
    start_time = time.perf_counter()
    soups = asyncio.run(parallel_scrap(url_list))
    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
    # Keep the first num_pages pages that were fetched successfully.
    result_content = {}
    num_pages = 3
    count = 0
    for url, soup in zip(url_list, soups):
        if count >= num_pages:
            break
        if soup:
            # clean_html() expects raw HTML, so pass the serialized document
            # rather than soup.text (which would strip the markup first).
            text = clean_html(str(soup))
            result_content[url] = text
            count += 1
    # for key, value in result_content.items():
    #     print("-------------------URL: ", key)
    #     print(value[:30])
    return result_content
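

# Minimal usage sketch (assumptions: the query string, date range, and skipped
# domains below are illustrative placeholders, not values used by this project).
if __name__ == "__main__":
    date_range = f"date:r:{build_date(2024, 'January', 1)}:{build_date(2024, 'March', 1)}"
    pages = google_search(
        "large language models",
        date_range,
        domains_to_skip=["gov", "mil"],
    )
    for url, text in pages.items():
        print(url, "->", len(text), "characters of cleaned text")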