eljanmahammadli committed
Commit 5650543 · 1 Parent(s): fa3e7dd

added pagination to google search, now retrieving more sites

Files changed (2):
  1. .gitignore +2 -1
  2. google_search.py +47 -12
.gitignore CHANGED
@@ -8,4 +8,5 @@ nohup.out
 temp.py
 temp.ipynb
 chroma_db/
-temp.txt
+temp.txt
+temp*
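(Since the new `temp*` glob already matches `temp.txt`, `temp.py`, and `temp.ipynb`, the explicit `temp.txt` entry is now redundant.)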
google_search.py CHANGED
@@ -100,6 +100,7 @@ months = {
 }
 
 domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
+skip_urls = ["youtube.com", "twitter.com", "facebook.com", "instagram.com", "x.com"]
 
 
 def build_date(year=2024, month="March", day=1):
@@ -175,22 +176,47 @@ def google_search_urls(
     domains_to_include,
     api_key,
     cse_id,
+    num_results=10,  # Number of results to fetch per page
+    total_results=30,  # Total number of results to fetch
+    skip_urls=None,  # List of URLs to skip
     **kwargs,
 ):
+    if skip_urls is None:
+        skip_urls = []  # Initialize as empty list if not provided
+
     service = build("customsearch", "v1", developerKey=api_key)
-    results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
     url_list = []
-    if "items" in results and len(results["items"]) > 0:
-        for count, link in enumerate(results["items"]):
-            # skip user selected domains
-            if (domains_to_include is None) or not any(
-                ("." + domain) in link["link"] for domain in domains_to_include
-            ):
-                continue
-            url = link["link"]
-            if url not in url_list:
-                url_list.append(url)
-    return url_list
+    start_index = 1  # Initial index for the search results
+    while len(url_list) < total_results:
+        # Fetch a page of results
+        results = service.cse().list(
+            q=text,
+            cx=cse_id,
+            sort=sorted_date,
+            start=start_index,
+            num=min(num_results, total_results - len(url_list)),
+            **kwargs,
+        ).execute()
+
+        if "items" in results and len(results["items"]) > 0:
+            for count, link in enumerate(results["items"]):
+                url = link["link"]
+                # Skip if the URL is in the skip_urls list or doesn't match the domain filter
+                if url in skip_urls:
+                    continue
+                if (domains_to_include is None) or any(
+                    ("." + domain) in url for domain in domains_to_include
+                ):
+                    if url not in url_list:
+                        url_list.append(url)
+        else:
+            # No more results
+            break
+
+        # Move to the next page of results
+        start_index += num_results
+
+    return url_list[:total_results]
 
 
 def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
@@ -206,6 +232,15 @@ def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
         api_key,
         cse_id,
     )
+    print("---")
+    print(len(url_list))
+    print(url_list)
+    print("---")
     print("Google Search processing time: ", time.perf_counter() - start_time)
     result_content = build_results_beautifulsoup(url_list)
     return result_content
+
+
+res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, False)
+print(res.keys())
+print(len(res))
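For reference, pagination in the Custom Search JSON API is driven by the 1-based `start` offset together with `num`, which the API caps at 10 results per request (and roughly 100 results per query in total, so very large `total_results` values cannot be honored). The `sorted_date` value in the test call above ("date:r:20240101:20241231") appears to use the API's date-range restriction syntax for the `sort` parameter. Below is a minimal standalone sketch of the same pagination pattern; `API_KEY` and `CSE_ID` are hypothetical placeholders, not values from this commit:

```python
# Minimal pagination sketch for the Custom Search JSON API.
# Assumes google-api-python-client is installed; API_KEY and CSE_ID
# are hypothetical placeholders, not values from this commit.
from googleapiclient.discovery import build

API_KEY = "your-api-key"  # hypothetical
CSE_ID = "your-cse-id"    # hypothetical


def fetch_result_urls(query, total=30, per_page=10):
    """Collect up to `total` result URLs, paging via the 1-based `start` offset."""
    service = build("customsearch", "v1", developerKey=API_KEY)
    urls, start = [], 1
    while len(urls) < total:
        resp = (
            service.cse()
            .list(q=query, cx=CSE_ID, start=start,
                  num=min(per_page, total - len(urls)))
            .execute()
        )
        items = resp.get("items", [])
        if not items:
            break  # no further pages available
        urls.extend(item["link"] for item in items)
        start += per_page  # next request starts where this page ended
    return urls[:total]


if __name__ == "__main__":
    print(fetch_result_urls("low resource languages"))
```

The commit's `google_search_urls` follows the same loop, additionally filtering each page through `skip_urls` and `domains_to_include` and deduplicating before appending to the result list.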