# Spaces: Running
from datetime import timedelta | |
import os | |
import streamlit as st | |
import duckdb | |
from embedding import get_embeddings | |
from config import HF_HOST, DUCKDB_FILE, HF_REPO_TYPE, HF_REPO_ID, HF_FILENAME | |
def get_conn():
    """Open a DuckDB connection to the podcast database.

    When ``HF_HOST`` is truthy, the database file is fetched with
    ``hf_hub_download`` (using ``HF_REPO_TYPE``, ``HF_REPO_ID`` and
    ``HF_FILENAME``) and the downloaded path is opened. Otherwise the
    path in ``DUCKDB_FILE`` is opened directly.

    Returns:
        The connection object returned by ``duckdb.connect``.
    """
    # Local run: open the configured database file as-is.
    if not HF_HOST:
        return duckdb.connect(DUCKDB_FILE)

    # Copy HF_TOKEN (empty string when unset) into HUGGINGFACE_TOKEN before
    # the hub client is imported.
    os.environ["HUGGINGFACE_TOKEN"] = os.environ.get("HF_TOKEN", "")
    from huggingface_hub import hf_hub_download

    db_path = hf_hub_download(
        repo_id=HF_REPO_ID,
        filename=HF_FILENAME,
        repo_type=HF_REPO_TYPE,
    )
    return duckdb.connect(db_path)
# Selects (id, title) for every row in ``podcasts``, newest ``date`` first.
title_query = """SELECT id, title FROM podcasts
ORDER BY date DESC;
"""

# Vector search over ``embeddings`` restricted to a set of podcasts.
#
# Positional parameters, in order:
#   1. the podcast ids to search within (``WHERE id in ?``)
#   2. the query embedding, cast to ``FLOAT[1024]`` for ``array_distance``
#
# The CTE keeps the 10 (id, part) pairs with the smallest distance; the outer
# SELECT joins them to ``episodes`` (segment text and start) and ``podcasts``
# (title, date, audio) and returns them ordered by distance.
#
# Output column order: title, date, start, text, part, audio, distance.
# NOTE: the trailing comma after ``oe.distance`` is kept as written; it relies
# on DuckDB accepting a trailing comma in the SELECT list.
query = """WITH filtered_podcasts AS (
SELECT id
FROM podcasts
WHERE id in ?
),
ordered_embeddings AS (
SELECT embeddings.id, embeddings.part, array_distance(embedding, ?::FLOAT[1024]) AS distance
FROM embeddings
JOIN filtered_podcasts fp ON embeddings.id = fp.id
ORDER BY distance
LIMIT 10
)
SELECT
p.title,
p.date,
e.start,
e.text,
e.part,
p.audio,
oe.distance,
FROM
ordered_embeddings oe
JOIN
episodes e
ON
oe.id = e.id AND oe.part = e.part
JOIN
podcasts p
ON
oe.id = p.id
ORDER BY oe.distance;
"""
# Page header.
st.title("terapyon channel search")

# Load the (id, title) list that backs the title picker.
conn = get_conn()
titles = conn.execute(title_query).df()

# An empty selection means "search every podcast".
selected_title: list[str] = st.multiselect("Select title", titles["title"])
if selected_title:
    # Map the chosen titles back to their podcast ids.
    selected_ids = titles.loc[titles["title"].isin(selected_title), "id"].tolist()
else:
    st.write("All titles")
    selected_ids = titles["id"].tolist()
# Seconds of lead-in played before the selected segment's start time.
_LEAD_IN_SECONDS = 5.0

word = st.text_input("Search word")
if word:
    st.write(f"Search word: {word}")

    # Embed the search word and use the first (only) row as the query vector.
    embeddings = get_embeddings([word], query=True)
    word_embedding = embeddings[0, :]
    result = conn.execute(query, (selected_ids, word_embedding)).df()

    # Show the hits; selecting a row reruns the script with that selection.
    selected = st.dataframe(
        result,
        column_order=["title", "date", "part", "start", "distance", "text", "audio"],
        on_select="rerun",
        selection_mode="single-row",
    )
    if selected:
        rows = selected["selection"].get("rows")
        if rows:
            # Look fields up by column name so a change in the SELECT column
            # order cannot silently pick the wrong value.
            row = result.iloc[rows[0]]
            text = row["text"]
            start = float(row["start"])
            start_delta = timedelta(seconds=start)

            # Button label is Japanese for "Play audio".
            if st.button("オーディオを再生"):
                st.write(f"Start time: {str(start_delta)}")
                # Clamp at 0 so segments that begin within the lead-in window
                # do not produce a negative start time.
                st.audio(
                    row["audio"],
                    start_time=max(start - _LEAD_IN_SECONDS, 0.0),
                )
            st.text(text)