eljanmahammadli committed
Commit: a6fbfb6
Parent: 5650543

chore: increase number of pages to scrape; disable PDF check in scholar mode

Files changed (2):
  1. app.py (+3 -3)
  2. google_search.py (+7 -7)
app.py CHANGED

@@ -21,9 +21,9 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from google.cloud import storage
 
 if gr.NO_RELOAD:
-    # from humanize import humanize_text, device
-    humanize_text = None
-    device = None
+    from humanize import humanize_text, device
+    # humanize_text = None
+    # device = None
     from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
     from google_search import google_search, months, domain_list, build_date
     from ai_generate import generate, citations_to_html, remove_citations, display_cited_text, llm_wrapper
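This hunk swaps the None stubs back for the real humanize imports. Note that they sit inside Gradio's gr.NO_RELOAD guard: in reload mode (gradio app.py), Gradio re-executes the script on every source change but skips gr.NO_RELOAD blocks, so heavy imports and model loading run only once. A minimal sketch of that pattern, with an illustrative transformers pipeline standing in for this repo's models:

import gradio as gr

if gr.NO_RELOAD:
    # Skipped on hot reload: load expensive dependencies exactly once.
    from transformers import pipeline  # illustrative heavy import
    classifier = pipeline("text-classification")

def predict(text):
    # Uses the model loaded above; reload only re-runs code outside the guard.
    return str(classifier(text)[0])

demo = gr.Interface(fn=predict, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.launch()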
google_search.py CHANGED

@@ -14,7 +14,7 @@ API_KEY = os.environ.get("GOOGLE_SEARCH_API_KEY")
 CSE_KEY = os.environ.get("GOOGLE_SEARCH_CSE_ID")
 
 # Number of pages to scrape
-NUM_PAGES = 10
+NUM_PAGES = 20
 
 def build_results_beautifulsoup(url_list):
     print("Starting to scrape URLs...")
@@ -223,8 +223,8 @@ def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
     api_key = os.environ.get("GOOGLE_SEARCH_API_KEY")
     cse_id = os.environ.get("GOOGLE_SEARCH_CSE_ID")
     start_time = time.perf_counter()
-    if scholar_mode_check:
-        topic += " -filetype:pdf"
+    # if scholar_mode_check:
+    #     topic += " -filetype:pdf"
     url_list = google_search_urls(
         topic,
         sorted_date,
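For context: -filetype:pdf is a standard Google search operator that excludes direct PDF links from results. Scholar mode (scholar_mode_check=True) used to append it so paper searches returned HTML pages rather than raw PDFs; with the check commented out, scholar-mode queries now include PDF results. This is the "disable PDF check in scholar mode" from the commit message.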
@@ -240,7 +240,7 @@ def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
     result_content = build_results_beautifulsoup(url_list)
     return result_content
 
-
-res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, False)
-print(res.keys())
-print(len(res))
+if __name__ == "__main__":
+    res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, False)
+    print(res.keys())
+    print(len(res))
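The new if __name__ == "__main__": guard means the trailing smoke test (one sample query plus two prints) runs only when the module is executed directly, e.g. python google_search.py, instead of firing as a side effect whenever app.py imports from google_search.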