Spaces:
Runtime error
Runtime error
File size: 6,729 Bytes
04558b7 66216e0 04558b7 66216e0 04558b7 66216e0 04558b7 66216e0 04558b7 cce1e9f 04558b7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
# -- Import libraries
from langchain.prompts import PromptTemplate
from PIL import Image
from streamlit.logger import get_logger
from streamlit_player import st_player
import pandas as pd
import streamlit as st
import urllib.request
import argparse
import together
import logging
import requests
import utils
import spacy
import time
import os
import re
@st.cache_data
def _load_cli_args():
    """Run the one-time process setup and parse the command-line options.

    Memoized by Streamlit so that the spaCy model check/download and the
    argument parsing happen once per server process instead of on every
    script rerun.

    Returns:
        argparse.Namespace with DEFAULT_SYSTEM_PROMPT_LINK,
        PODCAST_URL_VIDEO_PATH, TRANSCRIPTION, MODEL and EMB_MODEL.
    """
    # Local imports: only this helper needs them.
    import subprocess
    import sys

    # -- 1. Setup arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--DEFAULT_SYSTEM_PROMPT_LINK', type=str, default="https://raw.githubusercontent.com/AlbertoUAH/Castena/main/prompts/default_system_prompt.txt", help='Valor para DEFAULT_SYSTEM_PROMPT_LINK')
    parser.add_argument('--PODCAST_URL_VIDEO_PATH', type=str, default="https://raw.githubusercontent.com/AlbertoUAH/Castena/main/data/podcast_youtube_video.csv", help='Valor para PODCAST_URL_VIDEO_PATH')
    parser.add_argument('--TRANSCRIPTION', type=str, default='worldcast_roberto_vaquero', help='Name of the trascription')
    parser.add_argument('--MODEL', type=str, default='togethercomputer/llama-2-13b-chat', help='Model name')
    parser.add_argument('--EMB_MODEL', type=str, default='sentence-transformers/paraphrase-multilingual-mpnet-base-v2', help='Embedding model name')

    # Download the Spanish spaCy model only when it is missing, and use the
    # interpreter that is running this app rather than whatever "python"
    # happens to be first on PATH. The exit status is not checked, matching
    # the previous os.system() call.
    if not spacy.util.is_package("es_core_news_lg"):
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", "es_core_news_lg"],
            check=False,
        )

    # -- 2. Setup env
    # SECURITY: this key is committed to the repository and must be treated
    # as leaked -- rotate it and supply TOGETHER_API_KEY through the
    # deployment's secret store. setdefault() lets an externally provided key
    # win; the literal is only kept as a fallback so existing deployments
    # keep starting.
    os.environ.setdefault(
        "TOGETHER_API_KEY",
        "6101599d6e33e3bda336b8d007ca22e35a64c72cfd52c2d8197f663389fc50c5",
    )

    # -- 3. Parse the command line
    return parser.parse_args()


def get_args():
    """Configure the Streamlit page and return the parsed CLI arguments.

    The page configuration is applied on every script run; only the expensive
    setup and parsing in ``_load_cli_args`` are cached. Previously the
    ``set_page_config`` call lived inside the cached function, so it was
    skipped on every cache hit.

    Returns:
        argparse.Namespace with DEFAULT_SYSTEM_PROMPT_LINK,
        PODCAST_URL_VIDEO_PATH, TRANSCRIPTION, MODEL and EMB_MODEL.
    """
    try:
        st.set_page_config(layout="wide")
    except st.errors.StreamlitAPIException:
        # Streamlit versions that allow only one set_page_config() per run
        # raise here if this function is called a second time in the same
        # run; the page is already configured, so there is nothing to do.
        pass
    return _load_cli_args()
@st.cache_data
def get_podcast_data(transcription_path, podcast_url_video_path=None, model=None):
    """Start the Together model and load the podcast/video lookup table.

    The original body read ``MODEL`` and ``PODCAST_URL_VIDEO_PATH`` as if they
    were module-level names, but they only exist as locals of ``main``, so the
    call raised ``NameError``. They are now optional parameters that fall back
    to the parsed command-line values.

    Args:
        transcription_path: Transcription file name. It is not read here; it
            is kept for existing callers and takes part in the cache key.
        podcast_url_video_path: Path or URL of the ';'-separated CSV to load.
            Defaults to ``get_args().PODCAST_URL_VIDEO_PATH``.
        model: Together model name to start. Defaults to ``get_args().MODEL``.

    Returns:
        pandas.DataFrame with the contents of the CSV.

    Raises:
        KeyError: if the TOGETHER_API_KEY environment variable is not set.
    """
    if podcast_url_video_path is None or model is None:
        cli_args = get_args()
        if podcast_url_video_path is None:
            podcast_url_video_path = cli_args.PODCAST_URL_VIDEO_PATH
        if model is None:
            model = cli_args.MODEL

    together.api_key = os.environ["TOGETHER_API_KEY"]
    together.Models.start(model)
    return pd.read_csv(podcast_url_video_path, sep=';')
def _fetch_text(url, timeout=10):
    """Download *url* and return its body decoded as UTF-8."""
    # The context manager closes the HTTP response; the original left it open.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read().decode('UTF-8')


def _extract_fragments(llm_response):
    """Return (start_label, start_seconds, end_seconds) per timestamped line."""
    timestamp_re = re.compile(r'\d{2}:\d{2}:\d{2}(\.\d{6})?')
    fragments = []
    for line in llm_response.split('\n'):
        if timestamp_re.search(line) is None:
            continue
        start_str, start_seconds, _, end_seconds = utils.add_hyperlink_and_convert_to_seconds(line)
        fragments.append((start_str, start_seconds, end_seconds))
    return fragments


def main():
    """Render the podcast question-answering Streamlit page.

    Steps: read the CLI arguments, load the podcast/video table, let the user
    pick a podcast in the sidebar, build the retrieval QA chain for that
    podcast's transcription, show the YouTube player and run the chat loop.
    Answers that contain HH:MM:SS timestamps get one extra player per
    timestamped line, cued to that fragment.
    """
    # Local import: only needed to wrap the downloaded icon bytes.
    import io

    args = get_args()
    # The logger was previously referenced here without ever being defined in
    # this scope, which raised NameError.
    logger = get_logger(__name__)

    b_inst, e_inst = "[INST]", "[/INST]"
    b_sys, e_sys = "<<SYS>>\n", "\n<</SYS>>\n\n"

    # -- 4. Get parameters
    default_system_prompt_link = args.DEFAULT_SYSTEM_PROMPT_LINK
    transcription_path = '{}_transcription.txt'.format(args.TRANSCRIPTION)
    model = args.MODEL
    emb_model = args.EMB_MODEL

    podcast_url_video_df = get_podcast_data(transcription_path)

    # The sidebar icon is decorative: a network or decoding failure is logged
    # and the app continues without it.
    try:
        icon_response = requests.get(
            "https://raw.githubusercontent.com/AlbertoUAH/Castena/main/media/castena-animated-icon.gif",
            timeout=10,
        )
        icon_response.raise_for_status()
        icon = Image.open(io.BytesIO(icon_response.content)).resize((100, 100))
        st.sidebar.image(icon)
    except (requests.RequestException, OSError) as exc:
        logger.warning("Could not load sidebar icon: %s", exc)

    video_option = st.sidebar.selectbox(
        "Seleccione el podcast",
        list(podcast_url_video_df['podcast_name_lit'].apply(lambda x: x.replace("'", "")))
    )
    # Turn the display name into the slug used for transcription file names.
    video_option_joined = '_'.join(video_option.replace(': Entrevista a ', ' ').lower().split(' ')).replace("\'", "")
    video_option_joined_path = "{}_transcription.txt".format(video_option_joined)

    # Literal (non-regex) match so slugs with regex metacharacters cannot
    # break or widen the lookup; na=False keeps missing names out of the mask.
    name_matches = podcast_url_video_df['podcast_name'].str.contains(
        video_option_joined, regex=False, na=False
    )
    matching_urls = podcast_url_video_df.loc[name_matches, 'youtube_video_url']
    if matching_urls.empty:
        st.error("No se encontró el vídeo del podcast seleccionado.")
        st.stop()
    youtube_video_url = matching_urls.iloc[0].replace("\'", "")

    # -- 4. Setup request for system prompt
    default_system_prompt = _fetch_text(default_system_prompt_link)

    # -- 5. Setup app
    _, nlp, retriever = utils.setup_app(video_option_joined_path, emb_model, model, logger)

    # -- 6. Setup prompt template + llm chain
    # The original template contained the literal text "/n" where newlines
    # were intended.
    instruction = (
        "CONTEXTO:\n\n {context}\n\n"
        "PREGUNTA: {question}\n"
        "RESPUESTA: "
    )
    prompt_template = utils.get_prompt(instruction, default_system_prompt, b_sys, e_sys, b_inst, e_inst, logger)
    llama_prompt = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )
    chain_type_kwargs = {"prompt": llama_prompt}
    qa_chain = utils.create_llm_chain(model, retriever, chain_type_kwargs, logger, video_option_joined_path)

    # ---------------------------------------------------------------------
    # -- 7. Setup Streamlit app
    st.title("[Podcast: {}]({})".format(video_option.replace("'", "").title(), youtube_video_url))
    # Centre the main player in a column taking 50% of the page width.
    width = 50
    side = (100 - width) / 2
    _, container, _ = st.columns([side, width, side])
    with container:
        st_player(utils.typewrite(youtube_video_url))

    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Replay the conversation so far; Streamlit reruns the script on every
    # interaction.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if prompt := st.chat_input("¡Pregunta lo que quieras!"):
        with st.chat_message("user"):
            st.markdown(prompt)
        st.session_state.messages.append({"role": "user", "content": prompt})

        with st.chat_message("assistant"):
            llm_response = qa_chain(prompt)
            llm_response = utils.process_llm_response(llm_response, nlp)
            st.markdown(llm_response)

            fragments = _extract_fragments(llm_response)
            if fragments:
                width = 40
                side = (100 - width) / 2
                for start_time_str, start_time_seconds, end_time_seconds in fragments:
                    st.markdown("__Fragmento: " + start_time_str + "__")
                    _, container, _ = st.columns([side, width, side])
                    with container:
                        st_player(youtube_video_url.replace("?enablejsapi=1", "") + f'?start={start_time_seconds}&end={end_time_seconds}')
        st.session_state.messages.append({"role": "assistant", "content": llm_response})
# -- Sample: streamlit run app.py -- --DEFAULT_SYSTEM_PROMPT_LINK=https://raw.githubusercontent.com/AlbertoUAH/Castena/main/prompts/default_system_prompt.txt --PODCAST_URL_VIDEO_PATH=https://raw.githubusercontent.com/AlbertoUAH/Castena/main/data/podcast_youtube_video.csv --TRANSCRIPTION=worldcast_roberto_vaquero --MODEL=togethercomputer/llama-2-7b-chat --EMB_MODEL=BAAI/bge-base-en-v1.5
# Run the app only when this file is executed as a script (for example through
# `streamlit run`), not when it is imported.
if __name__ == '__main__':
    main()