AlbertoFH98 committed on
Commit
e73a11f
·
1 Parent(s): d3f801e

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +125 -52
utils.py CHANGED
@@ -223,62 +223,135 @@ def get_gpt_response(transcription_path, query, logger):
223
  return llm_output
224
 
225
  # -- Text summarisation with OpenAI (map-reduce technique)
226
- def summarise_doc(transcription_path):
227
- llm = ChatOpenAI(temperature=0, max_tokens=1024)
 
 
 
 
 
 
 
 
 
 
 
228
 
229
- # -- Map
230
- loader = TextLoader(transcription_path)
231
- docs = loader.load()
232
- map_template = """Lo siguiente es listado de fragmentos de una conversacion:
233
- {docs}
234
- En base a este listado, por favor identifica los temas/topics principales.
235
- Respuesta:"""
236
- map_prompt = PromptTemplate.from_template(map_template)
237
- map_chain = LLMChain(llm=llm, prompt=map_prompt)
238
-
239
- # -- Reduce
240
- reduce_template = """A continuacion se muestra un conjunto de resumenes:
241
- {docs}
242
- Usalos para crear un unico resumen consolidado de todos los temas/topics principales.
243
- Respuesta:"""
244
- reduce_prompt = PromptTemplate.from_template(reduce_template)
245
-
246
- # Run chain
247
- reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)
248
 
249
- # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
250
- combine_documents_chain = StuffDocumentsChain(
251
- llm_chain=reduce_chain, document_variable_name="docs"
252
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
- # Combines and iteravely reduces the mapped documents
255
- reduce_documents_chain = ReduceDocumentsChain(
256
- # This is final chain that is called.
257
- combine_documents_chain=combine_documents_chain,
258
- # If documents exceed context for `StuffDocumentsChain`
259
- collapse_documents_chain=combine_documents_chain,
260
- # The maximum number of tokens to group documents into.
261
- token_max=3000,
262
- )
263
-
264
- # Combining documents by mapping a chain over them, then combining results
265
- map_reduce_chain = MapReduceDocumentsChain(
266
- # Map chain
267
- llm_chain=map_chain,
268
- # Reduce chain
269
- reduce_documents_chain=reduce_documents_chain,
270
- # The variable name in the llm_chain to put the documents in
271
- document_variable_name="docs",
272
- # Return the results of the map steps in the output
273
- return_intermediate_steps=False,
274
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
 
276
- text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
277
- chunk_size=3000, chunk_overlap=0
278
- )
279
- split_docs = text_splitter.split_documents(docs)
280
-
281
- return map_reduce_chain.run(split_docs)
282
 
283
  # -- Python function to setup basic features: SpaCy pipeline and LLM model
284
  @st.cache_resource
 
223
  return llm_output
224
 
225
  # -- Text summarisation with OpenAI (map-reduce technique)
226
# Prompt templates for the OpenAI ('gpt') path: the map step extracts topics
# from each chunk, the reduce step consolidates them.
_GPT_MAP_TEMPLATE = (
    "Lo siguiente es listado de fragmentos de una conversacion:\n"
    "{docs}\n"
    "En base a este listado, por favor identifica los temas/topics principales.\n"
    "Respuesta:"
)
_GPT_REDUCE_TEMPLATE = (
    "A continuacion se muestra un conjunto de resumenes:\n"
    "{docs}\n"
    "Usalos para crear un unico resumen consolidado de todos los temas/topics principales.\n"
    "Respuesta:"
)

# Prompt templates for the non-OpenAI path: the map step summarises each
# chunk, the reduce step merges the summaries and lists the main topics.
_OTHER_MAP_TEMPLATE = (
    "Lo siguiente es un extracto de una conversación entre dos hablantes en español.\n"
    "{docs}\n"
    "Por favor resuma la conversación en español.\n"
    "Resumen:"
)
_OTHER_REDUCE_TEMPLATE = (
    "Lo siguiente es una lista de resumenes en español:\n"
    "{doc_summaries}\n"
    "Tómelos y descríbalos en un resumen final consolidado en español. "
    "Además, enumera los temas principales de la conversación en español.\n"
    "\n"
    "Resumen:"
)


def _build_map_reduce_chain(llm, map_template, reduce_template,
                            reduce_variable, token_max, verbose=False):
    """Assemble a MapReduceDocumentsChain around ``llm``.

    Args:
        llm: Language model used for both the map and the reduce step.
        map_template: Prompt applied to every chunk; must contain ``{docs}``.
        reduce_template: Prompt applied to the mapped outputs; must contain
            ``{<reduce_variable>}``.
        reduce_variable: Name of the placeholder in ``reduce_template`` that
            receives the combined mapped documents.
        token_max: Maximum number of tokens to group documents into during
            the reduce/collapse step.
        verbose: When true, ``verbose=True`` is forwarded to the reduce and
            map-reduce chains.

    Returns:
        The configured ``MapReduceDocumentsChain``.
    """
    # Only forward `verbose` when it was requested, so the quiet path keeps
    # whatever verbosity default the chain classes apply on their own.
    verbose_kwargs = {"verbose": True} if verbose else {}

    map_chain = LLMChain(llm=llm, prompt=PromptTemplate.from_template(map_template))
    reduce_chain = LLMChain(llm=llm, prompt=PromptTemplate.from_template(reduce_template))

    # Takes a list of documents, combines them into a single string, and
    # passes this to an LLMChain.
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name=reduce_variable
    )

    # Combines and iteratively reduces the mapped documents.
    reduce_documents_chain = ReduceDocumentsChain(
        # This is the final chain that is called.
        combine_documents_chain=combine_documents_chain,
        # Used if documents exceed the context for `StuffDocumentsChain`.
        collapse_documents_chain=combine_documents_chain,
        # The maximum number of tokens to group documents into.
        token_max=token_max,
        **verbose_kwargs,
    )

    # Maps a chain over the documents, then combines the results.
    return MapReduceDocumentsChain(
        llm_chain=map_chain,
        reduce_documents_chain=reduce_documents_chain,
        # The variable name in the map prompt to put the documents in.
        document_variable_name="docs",
        # Do not return the results of the map steps in the output.
        return_intermediate_steps=False,
        **verbose_kwargs,
    )


def summarise_doc(transcription_path, model_name, model=None):
    """Summarise a transcription file with a map-reduce LLM chain.

    The file is loaded, split into chunks, each chunk is passed through a
    "map" prompt, and the mapped outputs are consolidated by a "reduce"
    prompt.

    Args:
        transcription_path: Path of the text file holding the transcription.
        model_name: ``'gpt'`` selects ``ChatOpenAI``; any other value selects
            ``TogetherLLM``.
        model: Value forwarded as ``model=`` to ``TogetherLLM``; unused when
            ``model_name == 'gpt'``.

    Returns:
        The output of running the map-reduce chain over the split documents.
    """
    docs = TextLoader(transcription_path).load()

    if model_name == 'gpt':
        llm = ChatOpenAI(temperature=0, max_tokens=1024)
        map_reduce_chain = _build_map_reduce_chain(
            llm,
            _GPT_MAP_TEMPLATE,
            _GPT_REDUCE_TEMPLATE,
            reduce_variable="docs",
            token_max=3000,
        )
        text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=3000, chunk_overlap=0
        )
    else:
        # -- Keep original transcription: it is handed to TogetherLLM as-is.
        with open(transcription_path, 'r') as f:
            formatted_transcription = f.read()

        llm = TogetherLLM(
            model=model,
            temperature=0.0,
            max_tokens=1024,
            original_transcription=formatted_transcription,
        )
        map_reduce_chain = _build_map_reduce_chain(
            llm,
            _OTHER_MAP_TEMPLATE,
            _OTHER_REDUCE_TEMPLATE,
            reduce_variable="doc_summaries",
            token_max=1024,
            verbose=True,
        )
        text_splitter = CharacterTextSplitter(
            separator="\n\n",
            chunk_size=2000,
            chunk_overlap=50,
            length_function=len,
            is_separator_regex=True,
        )

    # `docs` is a list of Document objects, so it must go through
    # `split_documents`; `create_documents` expects raw strings.
    split_docs = text_splitter.split_documents(docs)

    # Run the chain for BOTH back-ends. Previously only the 'gpt' path ran it,
    # so the other path reached the return with no summary assigned.
    return map_reduce_chain.run(split_docs)
 
 
 
 
 
355
 
356
  # -- Python function to setup basic features: SpaCy pipeline and LLM model
357
  @st.cache_resource