eljanmahammadli committed
Commit: a6fbfb6
Parent: 5650543

chore: increase number of pages to scrape; disable PDF check in scholar mode

Files changed (2):
  1. app.py (+3 -3)
  2. google_search.py (+7 -7)
app.py CHANGED

@@ -21,9 +21,9 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from google.cloud import storage
 
 if gr.NO_RELOAD:
-    # from humanize import humanize_text, device
-    humanize_text = None
-    device = None
+    from humanize import humanize_text, device
+    # humanize_text = None
+    # device = None
     from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
     from google_search import google_search, months, domain_list, build_date
     from ai_generate import generate, citations_to_html, remove_citations, display_cited_text, llm_wrapper
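This hunk swaps the None stubs back for the real humanize imports. Note that they sit inside Gradio's gr.NO_RELOAD guard: in reload mode (gradio app.py), Gradio re-executes the script on every source change but skips gr.NO_RELOAD blocks, so heavy imports and model loading run only once. A minimal sketch of that pattern, with an illustrative transformers pipeline standing in for this repo's models:

import gradio as gr

if gr.NO_RELOAD:
    # Skipped on hot reload: load expensive dependencies exactly once.
    from transformers import pipeline  # illustrative heavy import
    classifier = pipeline("text-classification")

def predict(text):
    # Uses the model loaded above; reload only re-runs code outside the guard.
    return str(classifier(text)[0])

demo = gr.Interface(fn=predict, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.launch()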
google_search.py CHANGED

@@ -14,7 +14,7 @@ API_KEY = os.environ.get("GOOGLE_SEARCH_API_KEY")
 CSE_KEY = os.environ.get("GOOGLE_SEARCH_CSE_ID")
 
 # Number of pages to scrape
-NUM_PAGES = 10
+NUM_PAGES = 20
 
 def build_results_beautifulsoup(url_list):
     print("Starting to scrape URLs...")
@@ -223,8 +223,8 @@ def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
     api_key = os.environ.get("GOOGLE_SEARCH_API_KEY")
     cse_id = os.environ.get("GOOGLE_SEARCH_CSE_ID")
     start_time = time.perf_counter()
-    if scholar_mode_check:
-        topic += " -filetype:pdf"
+    # if scholar_mode_check:
+    #     topic += " -filetype:pdf"
     url_list = google_search_urls(
         topic,
         sorted_date,
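For context: -filetype:pdf is a standard Google search operator that excludes direct PDF links from results. Scholar mode (scholar_mode_check=True) used to append it so paper searches returned HTML pages rather than raw PDFs; with the check commented out, scholar-mode queries now include PDF results. This is the "disable PDF check in scholar mode" from the commit message.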
@@ -240,7 +240,7 @@ def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
     result_content = build_results_beautifulsoup(url_list)
     return result_content
 
-
-res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, False)
-print(res.keys())
-print(len(res))
+if __name__ == "__main__":
+    res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, False)
+    print(res.keys())
+    print(len(res))
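The new if __name__ == "__main__": guard means the trailing smoke test (one sample query plus two prints) runs only when the module is executed directly, e.g. python google_search.py, instead of firing as a side effect whenever app.py imports from google_search.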