import asyncio
import html
import os
import time

import fitz  # PyMuPDF
import httpx
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from googleapiclient.discovery import build
from trafilatura import extract

load_dotenv()

API_KEY = os.environ.get("GOOGLE_SEARCH_API_KEY")
CSE_KEY = os.environ.get("GOOGLE_SEARCH_CSE_ID")

# Maximum number of scraped pages to keep in the results
NUM_PAGES = 20


def build_results_beautifulsoup(url_list, scholar_abstracts: dict[str, str] = None):
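    """Scrape every URL in url_list concurrently and build a dict mapping
    URL -> extracted main text. Pages whose extracted text is 500 characters
    or shorter are skipped, at most NUM_PAGES scraped pages are kept, and a
    Semantic Scholar abstract from scholar_abstracts is used as a fallback
    when a page could not be fetched."""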
    print("Starting to scrape URLs...")
    start_time = time.perf_counter()

    # scrape URLs in list
    soups = asyncio.run(parallel_scrap(url_list))

    scraping_time = time.perf_counter() - start_time
    print(f"Scraping processing time: {scraping_time:.2f} seconds")

    result_content = {}
    count = 0

    print("Starting to process each URL...")
    for url, soup in zip(url_list, soups):
        if count >= NUM_PAGES:
            print(f"Reached the limit of {NUM_PAGES} pages. Stopping processing.")
            break

        if soup and not isinstance(soup, Exception):  # gather(return_exceptions=True) may yield exceptions
            print(f"Processing URL: {url}")

            text = extract(
                soup,
                include_tables=False,
                include_comments=False,
                output_format="txt",
            )
            # If text is None or empty, log a warning and skip
            if text is None:
                print(f"Warning: Extraction returned None for URL: {url}")
            elif len(text) > 500:
                print(f"Adding content from URL: {url}, content length: {len(text)}")
                result_content[url] = text
                count += 1
            else:
                print(f"Skipped URL: {url}, content too short (length: {len(text)})")
        elif scholar_abstracts and scholar_abstracts.get(url):
            print(f"Skipped URL: {url}, no soup content available. Returning scholar abstract instead.")
            result_content[url] = scholar_abstracts.get(url)
        else:
            print(f"Skipped URL: {url}, no soup content available.")

    print("Finished processing URLs.")
    return result_content


def build_results_extractor(url_list):
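    """Build the URL -> text dict via the ExtractorAPI service; on any
    failure (including a 403 from the API) fall back to
    build_results_beautifulsoup."""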
    try:
        endpoint = "https://extractorapi.com/api/v1/extractor"
        result_content = {}
        count = 0
        for url in url_list:
            if count >= NUM_PAGES:
                break
            params = {"apikey": os.environ.get("EXTRACTOR_API_KEY"), "url": url}
            r = requests.get(endpoint, params=params, timeout=30)
            if r.status_code == 200:
                text = r.json()["text"]
                if len(text) > 500:
                    result_content[url] = text
                    count += 1
            elif r.status_code == 403:
                raise Exception("Error with Extractor API; falling back to the default implementation")
        return result_content

    except Exception as e:
        print(e)
        return build_results_beautifulsoup(url_list)


months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}

domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
skip_urls = ["youtube.com", "twitter.com", "facebook.com", "instagram.com", "x.com"]


def build_date(year=2024, month="March", day=1):
    # Zero-pad the day so the result matches Google's date:r:YYYYMMDD format.
    return f"{year}{months[month]}{int(day):02d}"


async def get_url_data(url, client):
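    """Fetch a single URL and return its raw content (bytes). PDF responses
    are converted to a minimal HTML document so trafilatura can parse them;
    non-200 responses and request errors return None."""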
    try:
        r = await client.get(url, follow_redirects=True)
        print(f"URL: {url}, Response Code: {r.status_code}")

        if r.status_code == 200:
            content_type = r.headers.get("Content-Type", "").lower()
            # Improved PDF detection using Content-Type and file extension
            if "application/pdf" in content_type or url.lower().endswith(".pdf"):
                print(f"Detected PDF content via Content-Type or file extension at URL: {url}")
                pdf_content = await extract_pdf_text(r.content)
                return pdf_content
            else:
                return r.content
        else:
            print(f"Non-200 response for URL: {url}, status code: {r.status_code}")
            return None
    except Exception as e:
        print(f"Error fetching URL: {url}, Error: {str(e)}")
        return None


async def extract_pdf_text(content):
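    """Extract the text of a PDF (given as raw bytes) with PyMuPDF and wrap it
    in a minimal HTML document, returned as UTF-8 bytes for trafilatura."""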
    try:
        with fitz.open(stream=content, filetype="pdf") as doc:
            text = ""
            for page in doc:
                text += page.get_text()
        html_content = f"""
        <!DOCTYPE html>
        <html>
        <body>
            <p>{html.escape(text)}</p>
        </body>
        </html>
        """
        html_bytes = html_content.encode("utf-8")
        return html_bytes  # Return in such a format that is parsable by trafilatura
    except Exception as e:
        print(f"Error extracting PDF text: {str(e)}")
        return None


async def parallel_scrap(urls):
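    """Fetch all URLs concurrently with a shared httpx.AsyncClient and return
    the results (raw content, None, or an exception) in input order."""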
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    async with httpx.AsyncClient(timeout=30, headers=headers) as client:
        tasks = []
        for url in urls:
            tasks.append(get_url_data(url=url, client=client))
        results = await asyncio.gather(*tasks, return_exceptions=True)
    return results


def scrap(urls):
    # Synchronous fallback: fetch each URL sequentially and collect raw page content.
    contents = []
    with httpx.Client(timeout=30, follow_redirects=True) as client:
        for url in urls:
            try:
                r = client.get(url)
                contents.append(r.content if r.status_code == 200 else None)
            except Exception as e:
                print(f"Error fetching URL: {url}, Error: {str(e)}")
                contents.append(None)
    return contents


def google_search_urls(
    text,
    sorted_date,
    domains_to_include,
    api_key,
    cse_id,
    num_results=10,  # Number of results to fetch per page
    total_results=30,  # Total number of results to fetch
    skip_urls=None,  # List of URLs to skip
    **kwargs,
):
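    """Query the Google Custom Search JSON API page by page and collect up to
    total_results URLs, skipping any URL in skip_urls and, when
    domains_to_include is given, keeping only URLs that contain one of those
    top-level domains."""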
    if skip_urls is None:
        skip_urls = []  # Initialize as empty list if not provided

    service = build("customsearch", "v1", developerKey=api_key)
    url_list = []
    start_index = 1  # Initial index for the search results
    while len(url_list) < total_results:
        # Fetch a page of results
        results = (
            service.cse()
            .list(
                q=text,
                cx=cse_id,
                sort=sorted_date,
                start=start_index,
                num=min(num_results, total_results - len(url_list)),
                **kwargs,
            )
            .execute()
        )

        if "items" in results and len(results["items"]) > 0:
            for count, link in enumerate(results["items"]):
                url = link["link"]
                # Skip if the URL is in the skip_urls list or doesn't match the domain filter
                if url in skip_urls:
                    continue
                if (domains_to_include is None) or any(("." + domain) in url for domain in domains_to_include):
                    if url not in url_list:
                        url_list.append(url)
        else:
            # No more results
            break

        # Move to the next page of results
        start_index += num_results

    return url_list[:total_results]


def scrape_abstract(url, title):
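    """Fetch a Semantic Scholar paper page and return the title followed by
    its TLDR/abstract text, or None if no abstract section is found."""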
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")
    abstract_section = soup.find("div", class_="tldr-abstract-replacement paper-detail-page__tldr-abstract")
    abstract = abstract_section.get_text().strip() if abstract_section else ""
    return title + "\n" + abstract if abstract != "" else None


def semantic_scholar_urls(
    text,
    sorted_date,
    total_results=30,  # Total number of results to fetch
    skip_urls=None,  # List of URLs to skip
    **kwargs,
):
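    """Search the Semantic Scholar Graph API for papers matching `text` within
    the year range encoded in sorted_date (Google CSE date:r: format) and
    return (url_list, scholar_abstracts), where scholar_abstracts maps a paper
    URL to its abstract. Open-access PDF links are appended when available."""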
    ss_api_key = os.environ.get("SEMANTIC_SCHOLAR_API_KEY")
    semantic_scholar_endpoint = "https://api.semanticscholar.org/graph/v1/paper/search/"

    date_from, date_to = sorted_date.split(":r:")[1].split(":")
    year_from = date_from[:4]
    year_to = date_to[:4]
    success_count = 0

    print(f"Dates: {year_from}-{year_to}")
    query_params = {
        "query": text,
        "fields": "title,abstract,url,publicationTypes,publicationDate,openAccessPdf,fieldsOfStudy",
        "year": f"{year_from}-{year_to}",
        "limit": 3 * total_results,
    }
    headers = {"x-api-key": ss_api_key}
    response = requests.get(semantic_scholar_endpoint, params=query_params, headers=headers, timeout=30).json()
    url_list = []
    scholar_abstracts = {}
    for row in response.get("data", []):
        if success_count >= total_results:
            break
        url = row.get("url")
        if isinstance(url, dict) and url.get("url"):
            url = url.get("url")
        url_list.append(url)
        abstract = row.get("abstract")
        if abstract:
            scholar_abstracts[url] = abstract
            success_count += 1
        if row.get("openAccessPdf") and row.get("url"):
            url_list.append(row.get("openAccessPdf").get("url"))
            success_count += 1
    return url_list, scholar_abstracts


def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
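    """Run the full pipeline: collect URLs from Google Custom Search (or from
    Semantic Scholar when scholar_mode_check is True), then scrape them with
    build_results_beautifulsoup and return the URL -> text dict."""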
    api_key = os.environ.get("GOOGLE_SEARCH_API_KEY")
    cse_id = os.environ.get("GOOGLE_SEARCH_CSE_ID")
    start_time = time.perf_counter()
    # if scholar_mode_check:
    #     topic += " -filetype:pdf"
    scholar_abstracts = None
    if not scholar_mode_check:
        url_list = google_search_urls(
            topic,
            sorted_date,
            domains_to_include,
            api_key,
            cse_id,
        )
    else:
        url_list, scholar_abstracts = semantic_scholar_urls(topic, sorted_date)
    print("---")
    print(len(url_list))
    print(url_list)
    print("---")
    if scholar_mode_check:
        print("Semantic Scholar processing time: ", time.perf_counter() - start_time)
    else:
        print("Google Search processing time: ", time.perf_counter() - start_time)
    result_content = build_results_beautifulsoup(url_list, scholar_abstracts)
    return result_content


if __name__ == "__main__":
    res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, True)
    print(res.keys())
    print(len(res))
    print(res)