# -- Import libraries
import argparse
import logging
import os
import re
import subprocess
import sys
import time
import urllib.request

from langchain.prompts import PromptTemplate
from PIL import Image
from streamlit.logger import get_logger
from streamlit_player import st_player
import pandas as pd
import requests
import spacy
import streamlit as st
import together

import utils

st.set_page_config(layout="wide")

# spaCy pipeline that is downloaded on first start-up.
_SPACY_MODEL = "es_core_news_lg"

# Matches an HH:MM:SS timestamp (optionally followed by a 6-digit fraction).
# Compiled once because it is applied to every line of every LLM answer.
_TIMESTAMP_RE = re.compile(r'(\d{2}:\d{2}:\d{2}(.\d{6})?)')


def _ensure_spacy_model(name):
    """Download the spaCy pipeline *name* unless it is already installed.

    The download is best-effort: a non-zero exit status is not raised here,
    matching the previous ``os.system`` behaviour.
    """
    if spacy.util.is_package(name):
        return
    # Use the running interpreter (not whatever "python" is on PATH) and an
    # argument list so that no shell is involved.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", name],
        check=False,
    )


@st.cache_data
def get_args():
    """Parse the command-line options and create the application logger.

    Also makes sure the spaCy model is available before the app starts.

    Returns:
        A ``(args, logger)`` tuple: the parsed ``argparse.Namespace`` and the
        Streamlit logger for this module.
    """
    # -- 1. Setup arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--DEFAULT_SYSTEM_PROMPT_LINK',
        type=str,
        default="https://raw.githubusercontent.com/AlbertoUAH/Castena/main/prompts/default_system_prompt.txt",
        help='Valor para DEFAULT_SYSTEM_PROMPT_LINK',
    )
    parser.add_argument(
        '--PODCAST_URL_VIDEO_PATH',
        type=str,
        default="https://raw.githubusercontent.com/AlbertoUAH/Castena/main/data/podcast_youtube_video.csv",
        help='Valor para PODCAST_URL_VIDEO_PATH',
    )
    parser.add_argument(
        '--TRANSCRIPTION',
        type=str,
        default='worldcast_roberto_vaquero',
        help='Name of the trascription',
    )
    parser.add_argument(
        '--MODEL',
        type=str,
        default='togethercomputer/llama-2-13b-chat',
        help='Model name',
    )
    parser.add_argument(
        '--EMB_MODEL',
        type=str,
        default='sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
        help='Embedding model name',
    )
    _ensure_spacy_model(_SPACY_MODEL)

    # -- 2. Setup env and logger
    logger = get_logger(__name__)

    # -- 3. Setup constants
    args = parser.parse_args()
    return args, logger


@st.cache_data
def get_podcast_data(path):
    """Read the semicolon-separated podcast/video CSV at *path* into a DataFrame."""
    podcast_url_video_df = pd.read_csv(path, sep=';')
    return podcast_url_video_df


@st.cache_resource(experimental_allow_widgets=True)
def get_basics_comp(emb_model, model, default_system_prompt_link, _logger, podcast_url_video_df, img_size=100):
    """Build the sidebar widgets and the per-podcast resources.

    Args:
        emb_model: Embedding model name, forwarded to ``utils.setup_app``.
        model: LLM name, forwarded to ``utils.setup_app`` and shown in the sidebar.
        default_system_prompt_link: URL the system prompt text is downloaded from.
        _logger: Logger forwarded to ``utils.setup_app`` (the leading underscore
            keeps Streamlit from hashing it for the cache key).
        podcast_url_video_df: DataFrame with the ``podcast_name_lit``,
            ``podcast_name`` and ``youtube_video_url`` columns.
        img_size: Edge length, in pixels, the icon is resized to.

    Returns:
        The tuple ``(together, nlp, retriever, video_option,
        video_option_joined_path, default_system_prompt, youtube_video_url,
        genre)``.

    Raises:
        IndexError: If no row of ``podcast_name`` contains the selected podcast.
        KeyError: If the ``TOGETHER_API_KEY`` environment variable is not set.
    """
    # NOTE(review): `icon` is not used anywhere in the visible code; the fetch
    # is kept as found. A timeout is set so a stalled request cannot hang the app.
    with requests.get(
        "https://raw.githubusercontent.com/AlbertoUAH/Castena/main/media/castena-animated-icon.gif",
        stream=True,
        timeout=30,
    ) as r:
        icon = Image.open(r.raw)
        icon = icon.resize((img_size, img_size))

    # NOTE(review): the markup passed to the two st.markdown calls below is
    # whitespace only; any HTML/CSS payload appears to be missing from the
    # source. The strings are kept exactly as found.
    with st.sidebar.container():
        st.markdown(
            "\n\n",
            unsafe_allow_html=True,
        )

    genre = st.sidebar.radio(
        "Seleccione el LLM",
        ["LLAMA", "GPT"]
    )
    st.sidebar.info('Modelo LLAMA: ' + str(model).split('/')[-1] + '\nModelo GPT: gpt-3.5-turbo', icon="ℹ️")

    podcast_list = list(podcast_url_video_df['podcast_name_lit'].apply(lambda x: x.replace("'", "")))
    video_option = st.sidebar.selectbox(
        "Seleccione el podcast",
        podcast_list,
        on_change=clean_chat
    )

    # -- Add icons
    with st.sidebar.container():
        st.markdown(
            "\n",
            unsafe_allow_html=True,
        )

    # Turn the displayed podcast title into the snake_case key used both for
    # the transcription file name and for the CSV lookup.
    video_option_joined = '_'.join(video_option.replace(': Entrevista a ', ' ').lower().split(' ')).replace("'", "")
    video_option_joined_path = "{}_transcription.txt".format(video_option_joined)
    matching_rows = podcast_url_video_df[podcast_url_video_df['podcast_name'].str.contains(video_option_joined)]
    youtube_video_url = list(matching_rows['youtube_video_url'])[0].replace("'", "")
    st.title("[Podcast: {}]({})".format(video_option.replace("'", "").title(), youtube_video_url))

    # -- 4. Setup request for system prompt
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(default_system_prompt_link) as f:
        default_system_prompt = str(f.read(), 'UTF-8')

    # -- 5. Setup app
    nlp, retriever = utils.setup_app(video_option_joined_path, emb_model, model, _logger)

    # -- 6. Setup model
    together.api_key = os.environ["TOGETHER_API_KEY"]
    return together, nlp, retriever, video_option, video_option_joined_path, default_system_prompt, youtube_video_url, genre


def clean_chat():
    """Reset the chat-related session state and seed it with a 'new chat' message."""
    st.session_state.conversation = None
    st.session_state.chat_history = None
    st.session_state.messages = [{'role': 'assistant', 'content': 'Nuevo chat creado'}]


def _extract_fragments(llm_response):
    """Collect the video fragments referenced by an LLM answer.

    Every line of *llm_response* that contains a timestamp is passed to
    ``utils.add_hyperlink_and_convert_to_seconds``.

    Returns:
        A list of ``(start_time_str, start_time_seconds, end_time_seconds)``
        tuples, in the order the lines appear.
    """
    fragments = []
    for line in llm_response.split('\n'):
        if _TIMESTAMP_RE.search(line) is None:
            continue
        start_time_str, start_time_seconds, _, end_time_seconds = utils.add_hyperlink_and_convert_to_seconds(line)
        fragments.append((start_time_str, start_time_seconds, end_time_seconds))
    return fragments


def _render_fragments(fragments, youtube_video_url, side, width):
    """Render one centred video player per fragment, limited to its time range."""
    # Drop the JS-API flag so the start/end query string can be appended.
    base_url = youtube_video_url.replace("?enablejsapi=1", "")
    for start_time_str, start_time_seconds, end_time_seconds in fragments:
        st.markdown("__Fragmento: " + start_time_str + "__")
        _, container, _ = st.columns([side, width, side])
        with container:
            st_player(base_url + f'?start={start_time_seconds}&end={end_time_seconds}')


def main():
    """Run the Streamlit chat application.

    Builds the sidebar and retrieval chain, shows the podcast video, replays
    the stored chat history and answers a new user prompt with either the
    LLAMA chain or GPT, depending on the sidebar selection. The prompt
    ``resume`` (case-insensitive) requests a summary instead of an answer.
    """
    args, logger = get_args()

    # Llama 2 chat delimiters for instructions and the system prompt.
    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

    PODCAST_URL_VIDEO_PATH = args.PODCAST_URL_VIDEO_PATH
    DEFAULT_SYSTEM_PROMPT_LINK = args.DEFAULT_SYSTEM_PROMPT_LINK
    TRANSCRIPTION = args.TRANSCRIPTION
    # NOTE(review): the summary and GPT paths read this CLI-selected
    # transcription, while the LLAMA chain uses the podcast chosen in the
    # sidebar - confirm this is intended.
    TRANSCRIPTION_PATH = '{}_transcription.txt'.format(TRANSCRIPTION)
    MODEL = args.MODEL
    EMB_MODEL = args.EMB_MODEL

    # Width (in percent) of the centred column and of each side margin.
    WIDTH = 50
    SIDE = (100 - WIDTH) / 2

    podcast_url_video_df = get_podcast_data(PODCAST_URL_VIDEO_PATH)
    (
        _together,
        _nlp,
        retriever,
        _video_option,
        video_option_joined_path,
        default_system_prompt,
        youtube_video_url,
        genre,
    ) = get_basics_comp(EMB_MODEL, MODEL, DEFAULT_SYSTEM_PROMPT_LINK, logger, podcast_url_video_df, img_size=100)

    # -- 6. Setup prompt template + llm chain
    # NOTE(review): the "/n" sequences look like they were meant to be "\n";
    # the template is kept exactly as found.
    instruction = """CONTEXTO:/n/n {context}/n PREGUNTA: {question} RESPUESTA: """
    prompt_template = utils.get_prompt(instruction, default_system_prompt, B_SYS, E_SYS, B_INST, E_INST, logger)

    llama_prompt = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )
    chain_type_kwargs = {"prompt": llama_prompt}

    qa_chain = utils.create_llm_chain(MODEL, retriever, chain_type_kwargs, logger, video_option_joined_path)

    # ---------------------------------------------------------------------
    _, container, _ = st.columns([SIDE, WIDTH, SIDE])
    with container:
        st_player(utils.typewrite(youtube_video_url))

    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Replay the conversation so far.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if prompt := st.chat_input("¡Pregunta lo que quieras!"):
        with st.chat_message("user"):
            st.markdown(prompt)
        st.session_state.messages.append({"role": "user", "content": prompt})

        with st.chat_message("assistant"):
            use_gpt = 'GPT' in genre
            if prompt.lower() == 'resume':
                if use_gpt:
                    llm_response = utils.summarise_doc(TRANSCRIPTION_PATH, model_name='gpt')
                else:
                    llm_response = utils.summarise_doc(TRANSCRIPTION_PATH, model_name='llama', model=MODEL)
                st.markdown(llm_response)
            else:
                if use_gpt:
                    llm_response = utils.get_gpt_response(TRANSCRIPTION_PATH, prompt, logger)
                else:
                    llm_response = qa_chain(prompt)['result']
                llm_response = utils.process_llm_response(llm_response)
                st.markdown(llm_response)
                # Show a player for every timestamped fragment in the answer.
                _render_fragments(_extract_fragments(llm_response), youtube_video_url, SIDE, WIDTH)

        st.session_state.messages.append({"role": "assistant", "content": llm_response})


# -- Sample: streamlit run app.py -- --DEFAULT_SYSTEM_PROMPT_LINK=https://raw.githubusercontent.com/AlbertoUAH/Castena/main/prompts/default_system_prompt.txt --PODCAST_URL_VIDEO_PATH=https://raw.githubusercontent.com/AlbertoUAH/Castena/main/data/podcast_youtube_video.csv --TRANSCRIPTION=worldcast_roberto_vaquero --MODEL=togethercomputer/llama-2-7b-chat --EMB_MODEL=BAAI/bge-base-en-v1.5
if __name__ == '__main__':
    main()