RMWeerasinghe
committed on
Commit
·
99e744f
1
Parent(s):
0528be1
Initial Commit
Browse files- .gitignore +6 -1
- app.py +255 -68
- config.py +5 -0
- mapReduceSummarizer.py +50 -0
- model.py +43 -0
- preprocess.py +33 -0
- refineSummarizer.py +41 -0
- requirements.txt +0 -0
- summarizer.py +72 -0
- utils.py +6 -7
.gitignore
CHANGED
@@ -25,6 +25,7 @@ share/python-wheels/
|
|
25 |
.installed.cfg
|
26 |
*.egg
|
27 |
MANIFEST
|
|
|
28 |
|
29 |
# PyInstaller
|
30 |
# Usually these files are written by a python script from a template
|
@@ -142,4 +143,8 @@ Docs/
|
|
142 |
.DS_Store
|
143 |
.vscode/
|
144 |
test.ipynb
|
145 |
-
test.py
|
|
|
|
|
|
|
|
|
|
25 |
.installed.cfg
|
26 |
*.egg
|
27 |
MANIFEST
|
28 |
+
.conda
|
29 |
|
30 |
# PyInstaller
|
31 |
# Usually these files are written by a python script from a template
|
|
|
143 |
.DS_Store
|
144 |
.vscode/
|
145 |
test.ipynb
|
146 |
+
test.py
|
147 |
+
requirements1.txt
|
148 |
+
|
149 |
+
#logs
|
150 |
+
logs/
|
app.py
CHANGED
@@ -1,10 +1,13 @@
|
|
|
|
|
|
1 |
import nltk
|
2 |
import validators
|
3 |
import streamlit as st
|
4 |
-
from
|
|
|
|
|
5 |
|
6 |
-
|
7 |
-
from extractive_summarizer.model_processors import Summarizer
|
8 |
from utils import (
|
9 |
clean_text,
|
10 |
fetch_article_text,
|
@@ -12,20 +15,69 @@ from utils import (
|
|
12 |
read_text_from_file,
|
13 |
)
|
14 |
|
|
|
15 |
from rouge import Rouge
|
16 |
|
17 |
-
|
18 |
-
#
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
st.title("Text Summarizer 📝")
|
22 |
|
23 |
-
st.markdown("Creator: [Atharva Ingle](https://github.com/Gladiator07)")
|
24 |
-
st.markdown(
|
25 |
-
|
|
|
|
|
|
|
26 |
)
|
27 |
-
|
28 |
-
"
|
|
|
|
|
|
|
29 |
)
|
30 |
|
31 |
st.markdown(
|
@@ -44,15 +96,8 @@ if __name__ == "__main__":
|
|
44 |
)
|
45 |
st.markdown("---")
|
46 |
# ---------------------------
|
47 |
-
# SETUP & Constants
|
48 |
-
nltk.download("punkt")
|
49 |
-
abs_tokenizer_name = "facebook/bart-large-cnn"
|
50 |
-
abs_model_name = "facebook/bart-large-cnn"
|
51 |
-
abs_tokenizer = AutoTokenizer.from_pretrained(abs_tokenizer_name)
|
52 |
-
abs_max_length = 90
|
53 |
-
abs_min_length = 30
|
54 |
-
# ---------------------------
|
55 |
|
|
|
56 |
inp_text = st.text_input("Enter text or a url here")
|
57 |
st.markdown(
|
58 |
"<h3 style='text-align: center; color: green;'>OR</h3>",
|
@@ -65,11 +110,14 @@ if __name__ == "__main__":
|
|
65 |
is_url = validators.url(inp_text)
|
66 |
if is_url:
|
67 |
# complete text, chunks to summarize (list of sentences for long docs)
|
|
|
68 |
text, cleaned_txt = fetch_article_text(url=inp_text)
|
69 |
elif uploaded_file:
|
|
|
70 |
cleaned_txt = read_text_from_file(uploaded_file)
|
71 |
cleaned_txt = clean_text(cleaned_txt)
|
72 |
else:
|
|
|
73 |
cleaned_txt = clean_text(inp_text)
|
74 |
|
75 |
# view summarized text (expander)
|
@@ -80,51 +128,190 @@ if __name__ == "__main__":
|
|
80 |
st.write(cleaned_txt)
|
81 |
summarize = st.button("Summarize")
|
82 |
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import datetime
|
2 |
+
import logging
|
3 |
import nltk
|
4 |
import validators
|
5 |
import streamlit as st
|
6 |
+
from summarizer import Summarizer
|
7 |
+
from config import MODELS
|
8 |
+
from warnings import filterwarnings
|
9 |
|
10 |
+
filterwarnings("ignore")
|
|
|
11 |
from utils import (
|
12 |
clean_text,
|
13 |
fetch_article_text,
|
|
|
15 |
read_text_from_file,
|
16 |
)
|
17 |
|
18 |
+
|
19 |
from rouge import Rouge
|
20 |
|
21 |
+
def filer():
    """Return the path of today's log file, creating its directory if needed.

    The path has the form ``logs/YYYY-MM-DD.log`` (relative to the current
    working directory) and is built from the local date.

    Returns:
        str: Relative path of the log file for the current day.
    """
    # Function-scope import keeps this block self-contained.
    import os

    today = datetime.datetime.today()
    log_filename = f"logs/{today.year}-{today.month:02d}-{today.day:02d}.log"
    # ``logs/`` is git-ignored, so it does not exist on a fresh checkout and
    # logging.FileHandler would fail to open a file inside a missing directory.
    os.makedirs(os.path.dirname(log_filename), exist_ok=True)
    return log_filename
|
26 |
+
|
27 |
+
# Send log records to the dated file returned by filer(); the handler itself
# only accepts INFO and above even though the root level is DEBUG.
file_handler = logging.FileHandler(filer())
# file_handler = logging.handlers.TimedRotatingFileHandler(filer(),when="D")
file_handler.setLevel(logging.INFO)

# force=True replaces any handlers already attached to the root logger.
logging.basicConfig(
    handlers=[file_handler],
    level=logging.DEBUG,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)s (%(name)s) : %(message)s",
    force=True,
)

logger = logging.getLogger(__name__)
|
40 |
+
|
41 |
+
|
42 |
+
# Seed the session with a placeholder so later reads of
# st.session_state.api_key succeed before the user has typed a key.
if "api_key" not in st.session_state:
    st.session_state["api_key"] = " "
|
44 |
+
|
45 |
+
|
46 |
+
@st.cache_resource
def initialize_app():
    """Download the NLTK ``punkt`` tokenizer data used by the app.

    Wrapped in ``st.cache_resource`` so the download is not repeated on every
    Streamlit rerun of the script.
    """
    nltk.download("punkt")
    return None
|
49 |
+
|
50 |
+
@st.cache_resource
def init_summarizer(model_name,api_key=None):
    """Build (and cache) the Summarizer for the selected model.

    Args:
        model_name: Key into ``MODELS``; ``"OpenAI"`` selects the OpenAI
            backend, any other key selects a locally loaded model.
        api_key: OpenAI API key; only forwarded for the OpenAI backend.

    Returns:
        Summarizer: Instance configured for the chosen backend.

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODELS``.
    """
    model_path = MODELS[model_name]

    if model_name == "OpenAI":
        # validation logic
        return Summarizer(model_path, "openai", api_key)

    logger.info(f"Model for summarization : {model_path}")
    return Summarizer(model_path, "local")
|
65 |
+
|
66 |
+
def load_app():
|
67 |
st.title("Text Summarizer 📝")
|
68 |
|
69 |
+
# st.markdown("Creator: [Atharva Ingle](https://github.com/Gladiator07)")
|
70 |
+
# st.markdown(
|
71 |
+
# "Source code: [GitHub Repository](https://github.com/Gladiator07/Text-Summarizer)"
|
72 |
+
# )
|
73 |
+
model_name = st.sidebar.selectbox(
|
74 |
+
"Model Name", options=["Version 0", "Version 1","OpenAI"]
|
75 |
)
|
76 |
+
if model_name == "OpenAI":
|
77 |
+
st.sidebar.text_input("Enter a valid OpenAI API Key",key = "api_key" ,type="password")
|
78 |
+
|
79 |
+
summarizer_type = st.sidebar.selectbox(
|
80 |
+
"Summarizer Type for Long Text", options=["Map Reduce", "Refine"]
|
81 |
)
|
82 |
|
83 |
st.markdown(
|
|
|
96 |
)
|
97 |
st.markdown("---")
|
98 |
# ---------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
+
# ---------------------------
|
101 |
inp_text = st.text_input("Enter text or a url here")
|
102 |
st.markdown(
|
103 |
"<h3 style='text-align: center; color: green;'>OR</h3>",
|
|
|
110 |
is_url = validators.url(inp_text)
|
111 |
if is_url:
|
112 |
# complete text, chunks to summarize (list of sentences for long docs)
|
113 |
+
logger.info("Text Input Type: URL")
|
114 |
text, cleaned_txt = fetch_article_text(url=inp_text)
|
115 |
elif uploaded_file:
|
116 |
+
logger.info("Text Input Type: FILE")
|
117 |
cleaned_txt = read_text_from_file(uploaded_file)
|
118 |
cleaned_txt = clean_text(cleaned_txt)
|
119 |
else:
|
120 |
+
logger.info("Text Input Type: INPUT TEXT")
|
121 |
cleaned_txt = clean_text(inp_text)
|
122 |
|
123 |
# view summarized text (expander)
|
|
|
128 |
st.write(cleaned_txt)
|
129 |
summarize = st.button("Summarize")
|
130 |
|
131 |
+
if is_url:
|
132 |
+
text_to_summarize = " ".join([txt for txt in cleaned_txt])
|
133 |
+
else:
|
134 |
+
text_to_summarize = cleaned_txt
|
135 |
+
|
136 |
+
return text_to_summarize, model_name, summarizer_type, summarize
|
137 |
+
|
138 |
+
|
139 |
+
|
140 |
+
|
141 |
+
def get_summary(text_to_summarize,model_name, summarizer_type, summarize):
    """Summarize the prepared text once the user has pressed "Summarize".

    Args:
        text_to_summarize: Cleaned input text returned by ``load_app``.
        model_name: Sidebar model selection (a key of ``MODELS``).
        summarizer_type: ``"Refine"`` selects the refine chain for long text;
            any other value selects map-reduce.
        summarize: Truthy when the "Summarize" button was pressed in this run.

    Returns:
        tuple: ``(summarized_text, seconds_taken)`` as returned by
        ``Summarizer.summarize``.
    """
    if not summarize:
        # The previous `while not summarize: continue` never terminated when
        # the button was not pressed, because `summarize` cannot change inside
        # a single script run. Ending this run lets Streamlit rerun the script
        # on the next interaction.
        st.stop()

    logger.info(f"Model Name: {model_name}")
    logger.info(f"Summarization Type for Long Text: {summarizer_type}")

    api_key = st.session_state.api_key
    summarizer = init_summarizer(model_name,api_key)

    chain_type = "refine" if summarizer_type == "Refine" else "map_reduce"

    with st.spinner(
        text="Creating summary. This might take a few seconds ..."
    ):
        summarized_text, elapsed = summarizer.summarize(text_to_summarize,chain_type)

    return summarized_text, elapsed
|
167 |
+
|
168 |
+
|
169 |
+
|
170 |
+
|
171 |
+
def display_output(summarized_text,time):
    """Log the summary and its duration, then render both in the page.

    Args:
        summarized_text: The generated summary.
        time: Seconds the summary took to generate.
    """
    for record in (f"SUMMARY: {summarized_text}", f"Summary took {time}s"):
        logger.info(record)

    st.subheader("Summarized text")
    st.info(f"{summarized_text}")
    st.info(f"Time: {time}s")
|
179 |
+
|
180 |
+
|
181 |
+
# def summarizer_app():
|
182 |
+
# # ---------------------------------
|
183 |
+
# # Main Application
|
184 |
+
# # ---------------------------------
|
185 |
+
# st.title("Text Summarizer 📝")
|
186 |
+
|
187 |
+
# # st.markdown("Creator: [Atharva Ingle](https://github.com/Gladiator07)")
|
188 |
+
# # st.markdown(
|
189 |
+
# # "Source code: [GitHub Repository](https://github.com/Gladiator07/Text-Summarizer)"
|
190 |
+
# # )
|
191 |
+
# model_name = st.sidebar.selectbox(
|
192 |
+
# "Model Name", options=["Version 0", "Version 1","OpenAI"]
|
193 |
+
# )
|
194 |
+
# if model_name == "OpenAI":
|
195 |
+
# st.sidebar.text_input("Enter a valid OpenAI API Key",key = "api_key" ,type="password")
|
196 |
+
|
197 |
+
# summarizer_type = st.sidebar.selectbox(
|
198 |
+
# "Summarizer Type for Long Text", options=["Map Reduce", "Refine"]
|
199 |
+
# )
|
200 |
+
|
201 |
+
# st.markdown(
|
202 |
+
# "Enter a text or a url to get a concise summary of the article while conserving the overall meaning. This app supports text in the following formats:"
|
203 |
+
# )
|
204 |
+
# st.markdown(
|
205 |
+
# """- Raw text in text box
|
206 |
+
# - URL of article/news to be summarized
|
207 |
+
# - .txt, .pdf, .docx file formats"""
|
208 |
+
# )
|
209 |
+
# st.markdown(
|
210 |
+
# """This app supports two type of summarization:
|
211 |
+
|
212 |
+
# 1. **Extractive Summarization**: The extractive approach involves picking up the most important phrases and lines from the documents. It then combines all the important lines to create the summary. So, in this case, every line and word of the summary actually belongs to the original document which is summarized.
|
213 |
+
# 2. **Abstractive Summarization**: The abstractive approach involves rephrasing the complete document while capturing the complete meaning of the document. This type of summarization provides more human-like summary"""
|
214 |
+
# )
|
215 |
+
# st.markdown("---")
|
216 |
+
# # ---------------------------
|
217 |
+
# # SETUP & Constants
|
218 |
+
# # nltk.download("punkt")
|
219 |
+
# # abs_tokenizer_name = "facebook/bart-large-cnn"
|
220 |
+
# # abs_model_name = "facebook/bart-large-cnn"
|
221 |
+
# # abs_tokenizer = AutoTokenizer.from_pretrained(abs_tokenizer_name)
|
222 |
+
# # abs_max_length = 90
|
223 |
+
# # abs_min_length = 30
|
224 |
+
|
225 |
+
# # model_name_v0 = "IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v0"
|
226 |
+
# # model_name_v1 = "IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v1"
|
227 |
+
# # ---------------------------
|
228 |
+
# inp_text = st.text_input("Enter text or a url here")
|
229 |
+
# st.markdown(
|
230 |
+
# "<h3 style='text-align: center; color: green;'>OR</h3>",
|
231 |
+
# unsafe_allow_html=True,
|
232 |
+
# )
|
233 |
+
# uploaded_file = st.file_uploader(
|
234 |
+
# "Upload a .txt, .pdf, .docx file for summarization"
|
235 |
+
# )
|
236 |
+
|
237 |
+
# is_url = validators.url(inp_text)
|
238 |
+
# if is_url:
|
239 |
+
# # complete text, chunks to summarize (list of sentences for long docs)
|
240 |
+
# logger.info("Text Input Type: URL")
|
241 |
+
# text, cleaned_txt = fetch_article_text(url=inp_text)
|
242 |
+
# elif uploaded_file:
|
243 |
+
# logger.info("Text Input Type: FILE")
|
244 |
+
# cleaned_txt = read_text_from_file(uploaded_file)
|
245 |
+
# cleaned_txt = clean_text(cleaned_txt)
|
246 |
+
# else:
|
247 |
+
# logger.info("Text Input Type: INPUT TEXT")
|
248 |
+
# cleaned_txt = clean_text(inp_text)
|
249 |
+
|
250 |
+
# # view summarized text (expander)
|
251 |
+
# with st.expander("View input text"):
|
252 |
+
# if is_url:
|
253 |
+
# st.write(cleaned_txt[0])
|
254 |
+
# else:
|
255 |
+
# st.write(cleaned_txt)
|
256 |
+
# summarize = st.button("Summarize")
|
257 |
+
|
258 |
+
# # called on toggle button [summarize]
|
259 |
+
# if summarize:
|
260 |
+
# if is_url:
|
261 |
+
# text_to_summarize = " ".join([txt for txt in cleaned_txt])
|
262 |
+
# else:
|
263 |
+
# text_to_summarize = cleaned_txt
|
264 |
+
|
265 |
+
# logger.info(f"Model Name: {model_name}")
|
266 |
+
# logger.info(f"Summarization Type for Long Text: {summarizer_type}")
|
267 |
+
|
268 |
+
# api_key = st.session_state.api_key
|
269 |
+
|
270 |
+
# print(api_key)
|
271 |
+
|
272 |
+
# summarizer = init_summarizer(model_name,api_key)
|
273 |
+
|
274 |
+
# with st.spinner(
|
275 |
+
# text="Creating summary. This might take a few seconds ..."
|
276 |
+
# ):
|
277 |
+
# #ext_model = Summarizer()
|
278 |
+
# #summarized_text = ext_model(text_to_summarize, num_sentences=5)
|
279 |
+
|
280 |
+
# if summarizer_type == "Refine":
|
281 |
+
# summarized_text, time = summarizer.summarize(text_to_summarize,"refine")
|
282 |
+
# else :
|
283 |
+
# summarized_text, time = summarizer.summarize(text_to_summarize,"map_reduce")
|
284 |
+
|
285 |
+
|
286 |
+
# # elif model_name == "Version 1":
|
287 |
+
# # with st.spinner(
|
288 |
+
# # text="Creating summary. This might take a few seconds ..."
|
289 |
+
# # ):
|
290 |
+
# # if summarizer_type == "Refine":
|
291 |
+
# # summarized_text, time = summarizer_v1.summarize(text_to_summarize,"refine")
|
292 |
+
# # else :
|
293 |
+
# # summarized_text, time = summarizer_v1.summarize(text_to_summarize,"map_reduce")
|
294 |
+
|
295 |
+
# # final summarized output
|
296 |
+
|
297 |
+
# logger.info(f"SUMMARY: {summarized_text}")
|
298 |
+
# logger.info(f"Summary took {time}s")
|
299 |
+
# st.subheader("Summarized text")
|
300 |
+
# st.info(f"{summarized_text}")
|
301 |
+
# st.info(f"Time: {time}s")
|
302 |
+
|
303 |
+
# # st.subheader("Rogue Scores")
|
304 |
+
# # rouge_sc = Rouge()
|
305 |
+
# # ground_truth = cleaned_txt[0] if is_url else cleaned_txt
|
306 |
+
# # score = rouge_sc.get_scores(summarized_text, ground_truth, avg=True)
|
307 |
+
# # st.code(score)
|
308 |
+
|
309 |
+
|
310 |
+
if __name__ == "__main__":
    # Pipeline: one-time setup -> render inputs -> summarize -> render output.
    initialize_app()
    app_inputs = load_app()
    summary_and_time = get_summary(*app_inputs)
    display_output(*summary_and_time)
|
315 |
+
|
316 |
+
|
317 |
+
|
config.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Maps the model names offered in the UI to Hugging Face repository ids.
MODELS = dict(
    [
        ("Version 0", "IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v0"),
        ("Version 1", "IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v1"),
        # for tokenizer
        ("OpenAI", "IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v1"),
    ]
)
|
mapReduceSummarizer.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
|
2 |
+
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain, LLMChain, StuffDocumentsChain
|
3 |
+
from langchain.prompts import PromptTemplate
|
4 |
+
|
5 |
+
def get_map_reduce_chain(pipeline_or_llm,model_type)-> MapReduceDocumentsChain:
    """Build a map-reduce summarization chain for long documents.

    Args:
        pipeline_or_llm: A ready LLM when ``model_type`` is ``"openai"``;
            otherwise a Hugging Face pipeline, which is wrapped in
            ``HuggingFacePipeline``.
        model_type: ``"openai"`` uses instruction-style prompts; any other
            value passes the documents through unchanged as the prompt.

    Returns:
        MapReduceDocumentsChain: The assembled chain. (The previous
        ``LLMChain`` annotation did not match the returned object.)
    """
    if model_type == "openai":
        llm = pipeline_or_llm
        map_template = (
            "The following is a set of documents\n"
            "{docs}\n"
            "Based on this list of docs, please identify the main themes.\n"
            "Helpful Answer:"
        )
        map_prompt = PromptTemplate.from_template(map_template)
        reduce_template = (
            "The following is set of summaries:\n"
            "{docs}\n"
            "Take these and distill into a final, consolidated summary of the main themes.\n"
            "Helpful Answer:"
        )
        reduce_prompt = PromptTemplate.from_template(reduce_template)
    else:
        map_prompt = PromptTemplate.from_template(template="{docs}")
        reduce_prompt = PromptTemplate.from_template(template="{docs}")
        llm = HuggingFacePipeline(pipeline=pipeline_or_llm)

    map_chain = LLMChain(llm=llm, prompt=map_prompt)
    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt, verbose=True)
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="docs"
    )
    # The same stuff-chain is used both to collapse intermediate summaries and
    # to produce the final combined summary.
    reduce_documents_chain = ReduceDocumentsChain(
        combine_documents_chain=combine_documents_chain,
        collapse_documents_chain=combine_documents_chain,
        token_max=16384,
        verbose=True,
    )
    map_reduce_chain = MapReduceDocumentsChain(
        llm_chain=map_chain,
        reduce_documents_chain=reduce_documents_chain,
        document_variable_name="docs",
        return_intermediate_steps=False,
        verbose=True,
    )

    return map_reduce_chain
|
44 |
+
|
45 |
+
|
46 |
+
|
47 |
+
|
48 |
+
|
49 |
+
|
50 |
+
|
model.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
|
3 |
+
from langchain_openai import OpenAI
|
4 |
+
from huggingface_hub import login
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
from logging import getLogger
|
7 |
+
import streamlit as st
|
8 |
+
import torch
|
9 |
+
|
10 |
+
# load_dotenv()
|
11 |
+
# hf_token = os.environ.get("HF_TOKEN")
|
12 |
+
# Authenticate with the Hugging Face Hub using the token in Streamlit secrets.
hf_token = st.secrets["HF_TOKEN"]
login(token=hf_token)

logger = getLogger(__name__)

# Use the GPU when torch reports one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
|
16 |
+
|
17 |
+
def get_local_model(model_name_or_path:str)->pipeline:
    """Create a Hugging Face summarization pipeline for a seq2seq model.

    Args:
        model_name_or_path: Model id or local path passed to
            ``from_pretrained`` for both the tokenizer and the model.

    Returns:
        The ``transformers`` summarization pipeline placed on ``device``.
    """
    #print(f"Model is running on {device}")

    summarization_pipe = pipeline(
        task='summarization',
        # Tokenizer is listed first so it is still loaded before the model.
        tokenizer=AutoTokenizer.from_pretrained(model_name_or_path),
        model=AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path),
        device=device,
    )

    logger.info(f"Summarization pipeline created and loaded to {device}")

    return summarization_pipe
|
33 |
+
|
34 |
+
def get_endpoint(api_key:str):
    """Return a LangChain ``OpenAI`` LLM authenticated with ``api_key``."""
    return OpenAI(openai_api_key=api_key)
|
38 |
+
|
39 |
+
def get_model(model_type,model_name_or_path,api_key = None):
    """Return the summarization backend for ``model_type``.

    Args:
        model_type: ``"openai"`` selects the OpenAI endpoint; any other value
            selects a locally loaded pipeline.
        model_name_or_path: Model id or path, used only for local models.
        api_key: OpenAI API key, used only for the OpenAI endpoint.
    """
    if model_type != "openai":
        return get_local_model(model_name_or_path)
    return get_endpoint(api_key)
|
preprocess.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.docstore.document import Document
|
2 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
3 |
+
from logging import getLogger
|
4 |
+
|
5 |
+
logger = getLogger(__name__)
|
6 |
+
|
7 |
+
def get_input_token_count(text:str,tokenizer)->int:
    """Return how many tokens ``tokenizer.tokenize`` produces for ``text``."""
    return len(tokenizer.tokenize(text))
|
10 |
+
|
11 |
+
def get_document_splits_from_text(text:str) -> "list[Document]":
    """Split ``text`` into chunk documents for long-text summarization.

    The text is wrapped in a single ``Document`` and split with a
    ``RecursiveCharacterTextSplitter`` (chunk size 15000, overlap 50).

    Args:
        text: Full input text.

    Returns:
        The documents produced by ``split_documents``. (The previous
        ``-> Document`` annotation described a single document, but the
        splitter's result is what is returned.)
    """
    document = Document(page_content=text)
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n","\n",".","?"," "],
        chunk_size=15000,
        chunk_overlap=50,
    )
    split_documents = text_splitter.split_documents([document])
    logger.info(f"Splitting Document: Total Chunks: {len(split_documents)} ")
    return split_documents
|
21 |
+
|
22 |
+
|
23 |
+
def prepare_for_summarize(text:str,tokenizer):
    """Decide whether ``text`` can be summarized in one pass or must be split.

    Args:
        text: Input text.
        tokenizer: Tokenizer used to count the input tokens.

    Returns:
        tuple: ``(text, "short")`` when the text has fewer than 12000 tokens,
        otherwise ``(split_documents, "long")``.
    """
    if get_input_token_count(text,tokenizer) >= 12000:
        return get_document_splits_from_text(text), "long"
    return text, "short"
|
refineSummarizer.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
|
2 |
+
from langchain.chains.summarize import load_summarize_chain
|
3 |
+
from langchain.prompts import PromptTemplate
|
4 |
+
|
5 |
+
|
6 |
+
def get_refine_chain(pipeline_or_llm, model_type):
    """Build a "refine" summarization chain.

    Args:
        pipeline_or_llm: A ready LLM when ``model_type`` is ``"openai"``;
            otherwise a Hugging Face pipeline to wrap.
        model_type: ``"openai"`` uses instruction prompts; any other value
            uses bare pass-through prompts.

    Returns:
        The chain created by ``load_summarize_chain(chain_type="refine")``.
    """
    if model_type != "openai":
        initial_template = "{text}"
        followup_template = "{existing_answer}\n{text}"
        llm = HuggingFacePipeline(pipeline=pipeline_or_llm)
    else:
        llm = pipeline_or_llm
        initial_template = "\n".join(
            [
                "Write a concise summary of the following:",
                "{text}",
                "CONCISE SUMMARY:",
            ]
        )
        followup_template = "\n".join(
            [
                "Your job is to produce a final summary",
                "We have provided an existing summary up to a certain point: {existing_answer}",
                "We have the opportunity to refine the existing summary (only if needed) with some more context below.",
                "------------",
                "{text}",
                "------------",
                "Given the new context, refine the original summary in bullets. If the context isn't useful return the original summary.",
            ]
        )

    question_prompt = PromptTemplate.from_template(initial_template)
    refine_prompt = PromptTemplate.from_template(followup_template)

    return load_summarize_chain(
        llm=llm,
        chain_type="refine",
        question_prompt=question_prompt,
        refine_prompt=refine_prompt,
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
        verbose=True,
    )
|
39 |
+
|
40 |
+
|
41 |
+
|
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
|
|
summarizer.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from model import get_model
|
2 |
+
from mapReduceSummarizer import get_map_reduce_chain
|
3 |
+
from refineSummarizer import get_refine_chain
|
4 |
+
from preprocess import prepare_for_summarize
|
5 |
+
from transformers import AutoTokenizer
|
6 |
+
from langchain.prompts import PromptTemplate
|
7 |
+
from logging import getLogger
|
8 |
+
import time
|
9 |
+
|
10 |
+
logger = getLogger(__name__)
|
11 |
+
class Summarizer:
    """Summarize text with either a local Hugging Face pipeline or OpenAI.

    Short inputs are summarized in a single call; long inputs (as classified
    by ``prepare_for_summarize``) go through a refine or map-reduce chain.
    """

    def __init__(self,model_name,model_type,api_key=None) -> None:
        """Load the tokenizer and the summarization backend.

        Args:
            model_name: Model id used for the tokenizer and, for local
                models, for the pipeline itself.
            model_type: ``"openai"`` or ``"local"``.
            api_key: OpenAI API key; passed through to ``get_model``.
        """
        self.model_type = model_type
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.base_summarizer = get_model(model_type,model_name,api_key)

    @staticmethod
    def _timed(run):
        """Call ``run()`` and return ``(result, seconds)`` rounded to 2 places."""
        start = time.time()
        result = run()
        elapsed = round((time.time() - start), 2)
        print(f"Summary generation took {elapsed}s.")
        return result, elapsed

    def summarize(self,text:str,summarizer_type = "map_reduce") -> tuple:
        """Summarize ``text`` and report how long generation took.

        Args:
            text: Input text.
            summarizer_type: ``"refine"`` selects the refine chain for long
                text; any other value selects map-reduce. Ignored for short
                text.

        Returns:
            tuple: ``(summary, seconds)``. The previous ``-> str`` annotation
            did not match this two-element return.

        Raises:
            ValueError: If the text is short and ``self.model_type`` is
                neither ``"openai"`` nor ``"local"``. Previously this case
                returned ``None``, which fails when callers unpack the result.
        """
        text_to_summarize,length_type = prepare_for_summarize(text,self.tokenizer)

        if length_type == "short":
            logger.info("Processing Input Text less than 12000 Tokens")

            if self.model_type == "openai":
                llm = self.base_summarizer
                prompt = PromptTemplate.from_template(
                    template=(
                        "Write a concise and complete summary in bullet points of the given annual report.\n"
                        "Important:\n"
                        "* Note that the summary should contain all important information and it should not contain any unwanted information.\n"
                        "* Make sure to keep the summary as short as possible. And Summary should be in bullet points. Seperate each point with a new line.\n"
                        "TEXT: {text}\n"
                        "SUMMARY:"
                    )
                )
                llm_chain = prompt|llm
                return self._timed(
                    lambda: llm_chain.invoke({"text": text_to_summarize})
                )

            if self.model_type == "local":
                pipe = self.base_summarizer
                return self._timed(
                    lambda: pipe(text_to_summarize)[0]['summary_text']
                )

            raise ValueError(f"Unsupported model_type: {self.model_type!r}")

        # Long text: text_to_summarize is the list of split documents.
        if summarizer_type == "refine":
            print("The text is too long, Running Refine Summarizer")
            llm_chain = get_refine_chain(self.base_summarizer,self.model_type)
            logger.info("Running Refine Chain for Summarization")
        else:
            print("The text is too long, Running Map Reduce Summarizer")
            llm_chain = get_map_reduce_chain(self.base_summarizer,model_type=self.model_type)
            logger.info("Running Map Reduce Chain for Summarization")

        return self._timed(
            lambda: llm_chain.invoke(
                {"input_documents": text_to_summarize}, return_only_outputs=True
            )['output_text']
        )
|
72 |
+
|
utils.py
CHANGED
@@ -2,7 +2,7 @@ import re
|
|
2 |
import requests
|
3 |
import docx2txt
|
4 |
from io import StringIO
|
5 |
-
from PyPDF2 import
|
6 |
|
7 |
from bs4 import BeautifulSoup
|
8 |
from nltk.tokenize import sent_tokenize
|
@@ -31,7 +31,8 @@ def clean_text(x):
|
|
31 |
# x = re.sub(r"\w*\d+\w*", "", x) # numbers
|
32 |
x = re.sub(r"\s{2,}", " ", x) # over spaces
|
33 |
x = emoji_pattern.sub(r"", x) # emojis
|
34 |
-
x =
|
|
|
35 |
|
36 |
return x
|
37 |
|
@@ -103,12 +104,10 @@ def preprocess_text_for_abstractive_summarization(tokenizer, text):
|
|
103 |
|
104 |
|
105 |
def read_pdf(file):
|
106 |
-
pdfReader =
|
107 |
-
count = pdfReader.numPages
|
108 |
all_page_text = ""
|
109 |
-
for
|
110 |
-
|
111 |
-
all_page_text += page.extractText()
|
112 |
|
113 |
return all_page_text
|
114 |
|
|
|
2 |
import requests
|
3 |
import docx2txt
|
4 |
from io import StringIO
|
5 |
+
from PyPDF2 import PdfReader
|
6 |
|
7 |
from bs4 import BeautifulSoup
|
8 |
from nltk.tokenize import sent_tokenize
|
|
|
31 |
# x = re.sub(r"\w*\d+\w*", "", x) # numbers
|
32 |
x = re.sub(r"\s{2,}", " ", x) # over spaces
|
33 |
x = emoji_pattern.sub(r"", x) # emojis
|
34 |
+
x = x.replace("$","Dollars ")
|
35 |
+
x = re.sub("[^.,!?%A-Za-z0-9]+", " ", x) # special charachters except .,!?
|
36 |
|
37 |
return x
|
38 |
|
|
|
104 |
|
105 |
|
106 |
def read_pdf(file):
    """Return the concatenated text of every page in a PDF.

    Args:
        file: Path or file-like object accepted by ``PdfReader``.

    Returns:
        str: Text of all pages joined in page order; empty for a PDF with no
        pages.
    """
    pdf_reader = PdfReader(file)
    # join() avoids repeated string reallocation; `or ""` guards against a
    # page whose extraction yields nothing (assumed possible - confirm for the
    # installed PyPDF2 version).
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
|
113 |
|