Spaces:
Runtime error
Runtime error
File size: 9,899 Bytes
04558b7 121ee4b 04558b7 24bbdf7 04558b7 2bae7ed 66216e0 04558b7 66216e0 2bae7ed 66216e0 2bae7ed 66216e0 04558b7 2bae7ed 24bbdf7 04558b7 24bbdf7 90e8e7a 121ee4b 90e8e7a 04558b7 90e8e7a 04558b7 90e8e7a 04558b7 2bae7ed 04558b7 24bbdf7 2bae7ed 04558b7 2bae7ed 97550df 2bae7ed 90e8e7a 24bbdf7 2bae7ed 24bbdf7 2bae7ed 24bbdf7 90e8e7a 2bae7ed 24bbdf7 2bae7ed 04558b7 2bae7ed 04558b7 2bae7ed 04558b7 90e8e7a 04558b7 90e8e7a 04558b7 cce1e9f 04558b7 ed3190f 04558b7 ed3190f 04558b7 2bae7ed 04558b7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
# -- Import libraries
from langchain.prompts import PromptTemplate
from PIL import Image
from streamlit.logger import get_logger
from streamlit_player import st_player
import pandas as pd
import streamlit as st
import urllib.request
import argparse
import together
import logging
import requests
import utils
import spacy
import time
import os
import re
# Configure the Streamlit page to use the wide layout.
st.set_page_config(layout="wide")
@st.cache_data
def get_args():
    """Parse the command-line options and prepare the runtime environment.

    Besides parsing the arguments, this downloads the spaCy
    ``es_core_news_lg`` model and creates the Streamlit logger.

    The Together API key is NOT embedded in the source any more: it must be
    provided through the ``TOGETHER_API_KEY`` environment variable (for
    example as a deployment secret). A warning is logged when it is missing.

    Returns:
        tuple: ``(args, logger)`` where ``args`` is the parsed
        ``argparse.Namespace`` and ``logger`` the logger returned by
        ``streamlit.logger.get_logger``.
    """
    import subprocess
    import sys

    # -- 1. Setup arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--DEFAULT_SYSTEM_PROMPT_LINK', type=str, default="https://raw.githubusercontent.com/AlbertoUAH/Castena/main/prompts/default_system_prompt.txt", help='Valor para DEFAULT_SYSTEM_PROMPT_LINK')
    parser.add_argument('--PODCAST_URL_VIDEO_PATH', type=str, default="https://raw.githubusercontent.com/AlbertoUAH/Castena/main/data/podcast_youtube_video.csv", help='Valor para PODCAST_URL_VIDEO_PATH')
    parser.add_argument('--TRANSCRIPTION', type=str, default='worldcast_roberto_vaquero', help='Name of the trascription')
    parser.add_argument('--MODEL', type=str, default='togethercomputer/llama-2-13b-chat', help='Model name')
    parser.add_argument('--EMB_MODEL', type=str, default='sentence-transformers/paraphrase-multilingual-mpnet-base-v2', help='Embedding model name')

    # Download the Spanish spaCy model with the interpreter that is running
    # the app. An argument list avoids going through a shell; like the former
    # os.system call, a failing download does not abort start-up.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "es_core_news_lg"],
        check=False,
    )

    # -- 2. Setup env and logger
    logger = get_logger(__name__)
    # NOTE(security): an API key used to be hard-coded here. Any key that was
    # committed to the repository should be considered leaked and be revoked.
    if not os.environ.get("TOGETHER_API_KEY"):
        logger.warning(
            "TOGETHER_API_KEY is not set; define it as an environment "
            "variable/secret before querying the Together API."
        )

    # -- 3. Setup constants
    args = parser.parse_args()
    return args, logger
@st.cache_data
def get_podcast_data(path):
    """Read the podcast catalogue from a semicolon-separated CSV.

    Args:
        path: Location of the CSV, forwarded unchanged to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The parsed table.
    """
    return pd.read_csv(path, sep=';')
@st.cache_resource(experimental_allow_widgets=True)
def get_basics_comp(emb_model, model, default_system_prompt_link, _logger, podcast_url_video_df, img_size=100):
    """Render the sidebar and build the resources for the selected podcast.

    Draws the logo, the LLM selector, the podcast selector and the footer
    links in the sidebar, shows the page title, downloads the default system
    prompt and delegates to ``utils.setup_app`` to build the translator, the
    NLP pipeline and the retriever.

    Args:
        emb_model: Embedding model name, forwarded to ``utils.setup_app``.
        model: LLM name; its last ``/``-separated segment is shown in the
            sidebar and the full name is forwarded to ``utils.setup_app``.
        default_system_prompt_link: URL of the default system prompt text.
        _logger: Logger forwarded to ``utils.setup_app``. The leading
            underscore follows Streamlit's convention for arguments that
            should not be hashed by the cache.
        podcast_url_video_df: DataFrame with the columns
            ``podcast_name_lit``, ``podcast_name`` and ``youtube_video_url``.
        img_size: Unused. The icon that used to be downloaded and resized
            with it was never displayed, so that download was removed; the
            parameter is kept so existing calls keep working.

    Returns:
        tuple: ``(together, translator, nlp, retriever, video_option,
        video_option_joined_path, default_system_prompt, youtube_video_url)``.

    Raises:
        IndexError: If no row of ``podcast_url_video_df`` matches the
            selected podcast.
        KeyError: If the ``TOGETHER_API_KEY`` environment variable is unset.
    """
    # -- Sidebar header: animated logo
    with st.sidebar.container():
        st.markdown(
            """
<head>
<style>
.footer1 {
text-align: center;
}
</style>
</head>
<body>
<div class="footer1">
<img src=https://raw.githubusercontent.com/AlbertoUAH/Castena/main/media/castena-animated-icon.gif width="150" height="150">
</div>
<br>
</body>
""",
            unsafe_allow_html=True,
        )

    # The selected value is not used anywhere yet; the widget is rendered only.
    st.sidebar.radio(
        "Seleccione el LLM",
        ["LLAMA", "GPT (not available yet)"]
    )
    st.sidebar.info('Modelo LLAMA: ' + str(model).split('/')[-1] + '\nModelo GPT: gpt-3.5-turbo', icon="ℹ️")

    podcast_list = [name.replace("'", "") for name in podcast_url_video_df['podcast_name_lit']]
    video_option = st.sidebar.selectbox(
        "Seleccione el podcast",
        podcast_list,
        on_change=clean_chat
    )

    # -- Add icons
    with st.sidebar.container():
        st.markdown(
            """
<head>
<style>
.footer2 {
position: fixed;
bottom: 2%;
left: 6.5%;
}
.footer2 a {
margin: 10px;
text-decoration: none;
}
</style>
</head>
<body>
<div class="footer2">
<a href="https://www.linkedin.com/in/alberto-fernandez-hernandez-3a3474136">
<img src="https://cdn-icons-png.flaticon.com/128/3536/3536505.png" width="32" height="32">
</a>
<a href="https://github.com/AlbertoUAH/Castena">
<img src="https://cdn-icons-png.flaticon.com/128/733/733553.png" width="32" height="32">
</a>
<a href="https://www.buymeacoffee.com/castena">
<img src="https://cdn-icons-png.flaticon.com/128/761/761767.png" width="32" height="32">
</a>
</div>
</body>
""",
            unsafe_allow_html=True,
        )

    # Turn the displayed podcast title into the slug used by the data files.
    video_option_joined = '_'.join(video_option.replace(': Entrevista a ', ' ').lower().split(' ')).replace("'", "")
    video_option_joined_path = "{}_transcription.txt".format(video_option_joined)

    # Literal (non-regex) matching so characters such as "." or "(" in a
    # podcast slug are not interpreted as a pattern; missing names never match.
    is_selected = podcast_url_video_df['podcast_name'].str.contains(video_option_joined, regex=False, na=False)
    matching_urls = podcast_url_video_df.loc[is_selected, 'youtube_video_url']
    if matching_urls.empty:
        raise IndexError("No YouTube video URL found for podcast '{}'".format(video_option_joined))
    youtube_video_url = matching_urls.iloc[0].replace("'", "")
    st.title("[Podcast: {}]({})".format(video_option.replace("'", "").title(), youtube_video_url))

    # -- 4. Setup request for system prompt
    with urllib.request.urlopen(default_system_prompt_link) as response:
        default_system_prompt = str(response.read(), 'UTF-8')

    # -- 5. Setup app
    translator, nlp, retriever = utils.setup_app(video_option_joined_path, emb_model, model, _logger)

    # -- 6. Setup model
    together.api_key = os.environ["TOGETHER_API_KEY"]
    return together, translator, nlp, retriever, video_option, video_option_joined_path, default_system_prompt, youtube_video_url
def clean_chat():
    """Reset the chat-related entries of ``st.session_state``.

    ``conversation`` and ``chat_history`` are cleared and ``messages`` is
    replaced by a single assistant notice announcing the new chat.
    """
    state = st.session_state
    state.conversation = None
    state.chat_history = None
    notice = dict(role='assistant', content='Nuevo chat creado')
    state.messages = [notice]
# Matches timestamps such as "00:12:34" or "00:12:34.567890".
_TIMESTAMP_RE = re.compile(r'\d{2}:\d{2}:\d{2}(\.\d{6})?')


def _clip_url(video_url, start_seconds, end_seconds):
    """Return ``video_url`` with ``start``/``end`` query parameters appended."""
    base_url = video_url.replace("?enablejsapi=1", "")
    # Use "&" when the URL already carries a query string (e.g. "watch?v=...").
    separator = '&' if '?' in base_url else '?'
    return f'{base_url}{separator}start={start_seconds}&end={end_seconds}'


def main():
    """Run the Streamlit chat page.

    Builds the prompt template and the QA chain for the selected podcast,
    embeds the podcast video, replays the stored chat messages and, for every
    new question, shows the processed LLM answer followed by one video player
    per answer line that contains a timestamp.
    """
    args, logger = get_args()
    b_inst, e_inst = "[INST]", "[/INST]"
    b_sys, e_sys = "<<SYS>>\n", "\n<</SYS>>\n\n"
    model = args.MODEL
    # Column weights: the player takes the middle half of the page.
    width = 50
    side = (100 - width) / 2

    podcast_url_video_df = get_podcast_data(args.PODCAST_URL_VIDEO_PATH)
    (_together, _translator, nlp, retriever, _video_option, video_option_joined_path,
     default_system_prompt, youtube_video_url) = get_basics_comp(
        args.EMB_MODEL, model, args.DEFAULT_SYSTEM_PROMPT_LINK, logger,
        podcast_url_video_df, img_size=100)

    # -- 6. Setup prompt template + llm chain
    # NOTE(review): "/n" below looks like a typo for a newline escape; it is
    # left untouched because changing it alters the prompt sent to the LLM.
    instruction = """CONTEXTO:/n/n {context}/n
PREGUNTA: {question}
RESPUESTA: """
    prompt_template = utils.get_prompt(instruction, default_system_prompt, b_sys, e_sys, b_inst, e_inst, logger)
    llama_prompt = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )
    chain_type_kwargs = {"prompt": llama_prompt}
    qa_chain = utils.create_llm_chain(model, retriever, chain_type_kwargs, logger, video_option_joined_path)

    # ---------------------------------------------------------------------
    _, video_column, _ = st.columns([side, width, side])
    with video_column:
        st_player(utils.typewrite(youtube_video_url))

    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Replay the conversation stored so far.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if prompt := st.chat_input("¡Pregunta lo que quieras!"):
        with st.chat_message("user"):
            st.markdown(prompt)
        st.session_state.messages.append({"role": "user", "content": prompt})

        with st.chat_message("assistant"):
            llm_response, cleaned_prompt = qa_chain(prompt)
            llm_response = utils.process_llm_response(llm_response, nlp)
            st.markdown(llm_response)

            # Collect every timestamped line first, then render the clips.
            fragments = []
            for line in llm_response.split('\n'):
                if _TIMESTAMP_RE.search(line) is None:
                    continue
                start_time_str, start_time_seconds, _, end_time_seconds = utils.add_hyperlink_and_convert_to_seconds(line, cleaned_prompt)
                fragments.append((start_time_str, start_time_seconds, end_time_seconds))

            for start_time_str, start_time_seconds, end_time_seconds in fragments:
                st.markdown("__Fragmento: " + start_time_str + "__")
                _, clip_column, _ = st.columns([side, width, side])
                with clip_column:
                    st_player(_clip_url(youtube_video_url, start_time_seconds, end_time_seconds))

        st.session_state.messages.append({"role": "assistant", "content": llm_response})
# -- Sample: streamlit run app.py -- --DEFAULT_SYSTEM_PROMPT_LINK=https://raw.githubusercontent.com/AlbertoUAH/Castena/main/prompts/default_system_prompt.txt --PODCAST_URL_VIDEO_PATH=https://raw.githubusercontent.com/AlbertoUAH/Castena/main/data/podcast_youtube_video.csv --TRANSCRIPTION=worldcast_roberto_vaquero --MODEL=togethercomputer/llama-2-7b-chat --EMB_MODEL=BAAI/bge-base-en-v1.5
# Script entry point: build the page only when the file is executed directly.
if __name__ == '__main__':
    main()