Spaces:
Running
Running
oceansweep
committed on
Update App_Function_Libraries/Article_Summarization_Lib.py
Browse files
App_Function_Libraries/Article_Summarization_Lib.py
CHANGED
@@ -1,284 +1,292 @@
|
|
1 |
-
# Article_Summarization_Lib.py
|
2 |
-
#########################################
|
3 |
-
# Article Summarization Library
|
4 |
-
# This library is used to handle summarization of articles.
|
5 |
-
|
6 |
-
#
|
7 |
-
####
|
8 |
-
#
|
9 |
-
####################
|
10 |
-
# Function List
|
11 |
-
#
|
12 |
-
# 1.
|
13 |
-
#
|
14 |
-
####################
|
15 |
-
#
|
16 |
-
# Import necessary libraries
|
17 |
-
import datetime
|
18 |
-
from datetime import datetime
|
19 |
-
import gradio as gr
|
20 |
-
import json
|
21 |
-
import os
|
22 |
-
import logging
|
23 |
-
import requests
|
24 |
-
# 3rd-Party Imports
|
25 |
-
from tqdm import tqdm
|
26 |
-
|
27 |
-
from App_Function_Libraries.Utils import sanitize_filename
|
28 |
-
# Local Imports
|
29 |
-
from Article_Extractor_Lib import scrape_article
|
30 |
-
from Local_Summarization_Lib import summarize_with_llama, summarize_with_oobabooga, summarize_with_tabbyapi, \
|
31 |
-
summarize_with_vllm, summarize_with_kobold, save_summary_to_file, summarize_with_local_llm
|
32 |
-
from Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, summarize_with_cohere,
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
#
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
logging.debug(f"
|
62 |
-
logging.debug(f"
|
63 |
-
logging.debug(f"
|
64 |
-
logging.debug(f"
|
65 |
-
logging.debug(f"
|
66 |
-
logging.debug(f"
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
# def
|
187 |
-
summary =
|
188 |
-
|
189 |
-
elif api_name.lower() == "
|
190 |
-
logging.debug(f"MAIN: Trying to summarize with
|
191 |
-
# def
|
192 |
-
summary =
|
193 |
-
|
194 |
-
elif api_name.lower() == "
|
195 |
-
logging.debug(f"MAIN: Trying to summarize with
|
196 |
-
# def
|
197 |
-
summary =
|
198 |
-
|
199 |
-
elif api_name.lower() == "
|
200 |
-
logging.debug(f"MAIN: Trying to summarize with
|
201 |
-
# def
|
202 |
-
summary =
|
203 |
-
|
204 |
-
elif api_name.lower() == "
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
# def
|
215 |
-
summary =
|
216 |
-
|
217 |
-
elif api_name.lower() == "
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
summary
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
284 |
#######################################################################################################################
|
|
|
1 |
+
# Article_Summarization_Lib.py
|
2 |
+
#########################################
|
3 |
+
# Article Summarization Library
|
4 |
+
# This library is used to handle summarization of articles.
|
5 |
+
|
6 |
+
#
|
7 |
+
####
|
8 |
+
#
|
9 |
+
####################
|
10 |
+
# Function List
|
11 |
+
#
|
12 |
+
# 1.
|
13 |
+
#
|
14 |
+
####################
|
15 |
+
#
|
16 |
+
# Import necessary libraries
|
17 |
+
import datetime
|
18 |
+
from datetime import datetime
|
19 |
+
import gradio as gr
|
20 |
+
import json
|
21 |
+
import os
|
22 |
+
import logging
|
23 |
+
import requests
|
24 |
+
# 3rd-Party Imports
|
25 |
+
from tqdm import tqdm
|
26 |
+
|
27 |
+
from App_Function_Libraries.Utils import sanitize_filename
|
28 |
+
# Local Imports
|
29 |
+
from Article_Extractor_Lib import scrape_article
|
30 |
+
from Local_Summarization_Lib import summarize_with_llama, summarize_with_oobabooga, summarize_with_tabbyapi, \
|
31 |
+
summarize_with_vllm, summarize_with_kobold, save_summary_to_file, summarize_with_local_llm
|
32 |
+
from Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, summarize_with_cohere, \
|
33 |
+
summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, summarize_with_huggingface, \
|
34 |
+
summarize_with_mistral
|
35 |
+
from SQLite_DB import Database, create_tables, add_media_with_keywords
|
36 |
+
#
|
37 |
+
#######################################################################################################################
|
38 |
+
# Function Definitions
|
39 |
+
#
|
40 |
+
|
41 |
+
def ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date, custom_prompt):
    """Store an article (or free-form text) and its summary in the media database.

    Args:
        url: Source URL; replaced by 'Unknown' when empty.
        title: Article title; replaced by 'Unknown' when empty.
        author: Article author; replaced by 'Unknown' when empty.
        content: Article body. Must contain non-whitespace text.
        keywords: Comma-separated keyword string; 'default' is used when empty.
        summary: Summary text; replaced by 'No summary available' when empty.
        ingestion_date: Date string; today's date ('YYYY-MM-DD') when empty.
        custom_prompt: Prompt that was used for summarization. Required.

    Returns:
        The value returned by ``add_media_with_keywords`` on success. On any
        failure the exception is logged and its message is returned as a string
        (this function never raises).
    """
    try:
        # Reject None as well as empty/whitespace-only content with one clear message.
        if not content or not content.strip():
            raise ValueError("Content is empty.")

        # NOTE(review): the Database instance is not used below; the call is kept
        # because its constructor side effects cannot be verified from this file.
        db = Database()
        create_tables()

        # Normalize the keyword string: trim each entry and drop empty ones.
        keyword_list = [kw.strip() for kw in keywords.split(',') if kw.strip()] if keywords else []
        keyword_str = ', '.join(keyword_list or ["default"])

        # Set default values for missing fields
        url = url or 'Unknown'
        title = title or 'Unknown'
        author = author or 'Unknown'
        summary = summary or 'No summary available'
        # `datetime` is the class here (the module imports `from datetime import datetime`),
        # so `datetime.datetime.now()` would raise AttributeError.
        ingestion_date = ingestion_date or datetime.now().strftime('%Y-%m-%d')

        # Log the values of all fields before calling add_media_with_keywords
        logging.debug(f"URL: {url}")
        logging.debug(f"Title: {title}")
        logging.debug(f"Author: {author}")
        logging.debug(f"Content: {content[:50]}... (length: {len(content)})")  # Log first 50 characters of content
        logging.debug(f"Keywords: {keyword_str}")
        logging.debug(f"Summary: {summary}")
        logging.debug(f"Ingestion Date: {ingestion_date}")
        logging.debug(f"Custom Prompt: {custom_prompt}")

        # Every other field has a default by now; the prompt is the only
        # remaining required value.
        if not custom_prompt:
            logging.error("Custom prompt is missing.")
            raise ValueError("Custom prompt is missing.")

        # Add media with keywords to the database
        return add_media_with_keywords(
            url=url,
            title=title,
            media_type='article',
            content=content,
            keywords=keyword_str,
            prompt=custom_prompt,
            summary=summary,
            transcription_model=None,  # articles have no transcription model
            author=author,
            ingestion_date=ingestion_date
        )
    except Exception as e:
        logging.error(f"Failed to ingest article to the database: {e}")
        return str(e)
|
110 |
+
|
111 |
+
|
112 |
+
def scrape_and_summarize_multiple(urls, custom_prompt_arg, api_name, api_key, keywords, custom_article_titles, system_message=None):
    """Scrape, summarize and ingest several articles, one URL per line.

    Args:
        urls: Newline-separated URLs. Blank lines are ignored; ``None`` is
            treated as empty input.
        custom_prompt_arg: Prompt forwarded to ``scrape_and_summarize``.
        api_name: Name of the summarization backend.
        api_key: API key forwarded to the backend.
        keywords: Comma-separated keywords stored with every article.
        custom_article_titles: Optional newline-separated titles, matched to
            the (non-blank) URLs by position.
        system_message: Optional system prompt forwarded to the backend.

    Returns:
        A single string with one "Results for URL n" section per URL, followed
        by an "Errors encountered" section when any URL raised. Empty input
        yields an empty string.
    """
    url_list = [line.strip() for line in (urls or "").split('\n') if line.strip()]
    if not url_list:
        # Nothing to do; skip creating the progress tracker entirely.
        return ""

    # Trim titles so that a whitespace-only line means "no custom title".
    custom_titles = [line.strip() for line in custom_article_titles.split('\n')] if custom_article_titles else []

    results = []
    errors = []
    total = len(url_list)

    # Create a progress bar
    progress = gr.Progress()

    for i, url in tqdm(enumerate(url_list), total=total, desc="Processing URLs"):
        # An empty title falls back to None so the scraped title is used instead.
        custom_title = (custom_titles[i] if i < len(custom_titles) else None) or None
        try:
            result = scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_title, system_message)
            results.append(f"Results for URL {i + 1}:\n{result}")
        except Exception as e:
            # One failing URL must not abort the rest of the batch.
            errors.append(f"Error processing URL {i + 1} ({url}): {str(e)}")
            results.append(f"Failed to process URL {i + 1}: {url}")

        # Update progress
        progress((i + 1) / total, desc=f"Processed {i + 1}/{total} URLs")

    # Combine results and errors
    combined_output = "\n".join(results)
    if errors:
        combined_output += "\n\nErrors encountered:\n" + "\n".join(errors)

    return combined_output
|
141 |
+
|
142 |
+
|
143 |
+
def _summarize_article_file(api_name, api_key, json_file_path, prompt, system_message):
    """Dispatch a summarization request to the backend named by ``api_name``.

    Returns the backend's result, or ``None`` when ``api_name`` is not one of
    the supported backends.
    """
    # FIXME - Swap out this dispatch to use the dedicated function....
    # Lambdas defer the call; argument order differs between backends.
    handlers = {
        # Backends called as (api_key, input_path, prompt, system_message)
        "openai": lambda: summarize_with_openai(api_key, json_file_path, prompt, system_message),
        "anthropic": lambda: summarize_with_anthropic(api_key, json_file_path, prompt, system_message),
        "cohere": lambda: summarize_with_cohere(api_key, json_file_path, prompt, system_message),
        "groq": lambda: summarize_with_groq(api_key, json_file_path, prompt, system_message),
        "openrouter": lambda: summarize_with_openrouter(api_key, json_file_path, prompt, system_message),
        "deepseek": lambda: summarize_with_deepseek(api_key, json_file_path, prompt, system_message),
        "mistral": lambda: summarize_with_mistral(api_key, json_file_path, prompt, system_message),
        "huggingface": lambda: summarize_with_huggingface(api_key, json_file_path, prompt, system_message),
        # Backends called as (input_path, prompt, system_message)
        "llama.cpp": lambda: summarize_with_llama(json_file_path, prompt, system_message),
        "tabbyapi": lambda: summarize_with_tabbyapi(json_file_path, prompt, system_message),
        "vllm": lambda: summarize_with_vllm(json_file_path, prompt, system_message),
        "local-llm": lambda: summarize_with_local_llm(json_file_path, prompt, system_message),
        # Backends called as (input_path, api_key, prompt, system_message)
        "kobold": lambda: summarize_with_kobold(json_file_path, api_key, prompt, system_message),
        "ooba": lambda: summarize_with_oobabooga(json_file_path, api_key, prompt, system_message),
    }
    handler = handlers.get(api_name.lower())
    if handler is None:
        logging.warning(f"Article_Summarizer: Unsupported API '{api_name}'")
        return None
    logging.debug(f"MAIN: Trying to summarize with {api_name}")
    return handler()


def scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_article_title, system_message=None):
    """Scrape one article, optionally summarize it, and ingest it into the database.

    Args:
        url: URL of the article to scrape.
        custom_prompt_arg: Summarization prompt; a generic summarizer prompt is
            used when empty.
        api_name: Summarization backend name (case-insensitive). When empty no
            summarization is attempted.
        api_key: API key forwarded to backends that accept one.
        keywords: Comma-separated keywords stored with the article.
        custom_article_title: Optional title overriding the scraped one.
        system_message: Optional system prompt; a generic one is used when empty.

    Returns:
        A human-readable report string (title, author, ingestion result,
        summary and article contents), or a "Failed to ..." message. This
        function never raises; errors are logged and reported in the string.
    """
    try:
        # Step 1: Scrape the article
        article_data = scrape_article(url)
        print(f"Scraped Article Data: {article_data}")  # Debugging statement
        if not article_data:
            return "Failed to scrape the article."

        # Use the custom title if provided (and not just whitespace), otherwise
        # the scraped title. `or` also covers keys present with a None value.
        custom_title = custom_article_title.strip() if custom_article_title else ""
        title = custom_title or article_data.get('title') or 'Untitled'
        author = article_data.get('author') or 'Unknown'
        content = article_data.get('content') or ''
        ingestion_date = datetime.now().strftime('%Y-%m-%d')

        print(f"Title: {title}, Author: {author}, Content Length: {len(content)}")  # Debugging statement

        # Custom system prompt for the article
        system_message = system_message or "Act as a professional summarizer and summarize this article."
        # Custom prompt for the article
        article_custom_prompt = custom_prompt_arg or "Act as a professional summarizer and summarize this article."

        # Step 2: Summarize the article
        summary = None
        if api_name:
            logging.debug(f"Article_Summarizer: Summarization being performed by {api_name}")

            # The summarizers read their input from a JSON segments file.
            sanitized_title = sanitize_filename(title)
            os.makedirs("Results", exist_ok=True)  # open() below fails if the directory is missing
            json_file_path = os.path.join("Results", f"{sanitized_title}_segments.json")

            with open(json_file_path, 'w', encoding='utf-8') as json_file:
                json.dump([{'text': content}], json_file, indent=2)

            try:
                summary = _summarize_article_file(api_name, api_key, json_file_path, article_custom_prompt,
                                                  system_message)
            except requests.exceptions.ConnectionError as e:
                # A backend that cannot be reached is reported as "no summary", not a failure.
                logging.error(f"Connection error while trying to summarize with {api_name}: {str(e)}")

            if summary:
                logging.info(f"Article_Summarizer: Summary generated using {api_name} API")
                save_summary_to_file(summary, json_file_path)
            else:
                summary = "Summary not available"
                logging.warning(f"Failed to generate summary using {api_name} API")

        else:
            summary = "Article Summarization: No API provided for summarization."

        print(f"Summary: {summary}")  # Debugging statement

        # Step 3: Ingest the article into the database
        ingestion_result = ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date,
                                                article_custom_prompt)

        return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nSummary: {summary}\n\nArticle Contents: {content}"
    except Exception as e:
        logging.error(f"Error processing URL {url}: {str(e)}")
        return f"Failed to process URL {url}: {str(e)}"
|
262 |
+
|
263 |
+
|
264 |
+
def ingest_unstructured_text(text, custom_prompt, api_name, api_key, keywords, custom_article_title, system_message=None):
    """Summarize a block of free-form text (optionally) and ingest it into the database.

    Args:
        text: The raw text to store.
        custom_prompt: Prompt forwarded to the summarizer and stored with the entry.
        api_name: Summarization backend; only 'openai' (case-insensitive) is
            handled here. When empty no summarization is attempted.
        api_key: API key forwarded to the summarizer.
        keywords: Comma-separated keywords stored with the entry.
        custom_article_title: Optional title; "Unstructured Text" is used when
            empty or whitespace-only.
        system_message: Optional system prompt forwarded to the summarizer.

    Returns:
        A report string containing the title, the summary (or a message saying
        why there is none) and the result of the database ingestion.
    """
    # A whitespace-only title would otherwise strip down to an empty title.
    title = (custom_article_title or "").strip() or "Unstructured Text"
    author = "Unknown"
    ingestion_date = datetime.now().strftime('%Y-%m-%d')

    # Summarize the unstructured text
    if api_name:
        # The title is user input: sanitize it before using it in a file name,
        # as scrape_and_summarize does. Spaces still become underscores.
        # NOTE(review): assumes sanitize_filename strips path separators - confirm.
        safe_title = sanitize_filename(title).replace(' ', '_')
        os.makedirs("Results", exist_ok=True)  # open() below fails if the directory is missing
        json_file_path = os.path.join("Results", f"{safe_title}_segments.json")
        with open(json_file_path, 'w', encoding='utf-8') as json_file:
            json.dump([{'text': text}], json_file, indent=2)

        if api_name.lower() == 'openai':
            summary = summarize_with_openai(api_key, json_file_path, custom_prompt, system_message)
        # Add other APIs as needed
        else:
            summary = "Unsupported API."
    else:
        summary = "No API provided for summarization."

    # Ingest the unstructured text into the database
    ingestion_result = ingest_article_to_db('Unstructured Text', title, author, text, keywords, summary, ingestion_date,
                                            custom_prompt)
    return f"Title: {title}\nSummary: {summary}\nIngestion Result: {ingestion_result}"
|
287 |
+
|
288 |
+
|
289 |
+
|
290 |
+
#
|
291 |
+
#
|
292 |
#######################################################################################################################
|