"""Streamlit app: semantic search over podcast transcript segments in DuckDB."""

from datetime import timedelta
import os

import streamlit as st
import duckdb

from embedding import get_embeddings
from config import HF_HOST, DUCKDB_FILE, HF_REPO_TYPE, HF_REPO_ID, HF_FILENAME

# Playback starts this many seconds before the matched segment so the listener
# gets a little context ahead of the hit.
AUDIO_LEAD_IN_SECONDS = 5.0


@st.cache_resource
def get_conn() -> duckdb.DuckDBPyConnection:
    """Return a DuckDB connection, cached by Streamlit as a shared resource.

    When ``HF_HOST`` is truthy the database file is fetched with
    ``hf_hub_download`` using the configured repo settings; otherwise the
    local ``DUCKDB_FILE`` is opened.
    """
    if HF_HOST:
        # NOTE(review): copies HF_TOKEN into HUGGINGFACE_TOKEN (empty string
        # when unset) -- confirm which variable huggingface_hub actually reads.
        os.environ["HUGGINGFACE_TOKEN"] = os.getenv("HF_TOKEN", "")
        # Imported lazily so the dependency is only needed in hosted mode.
        from huggingface_hub import hf_hub_download

        local_file = hf_hub_download(
            repo_type=HF_REPO_TYPE,
            repo_id=HF_REPO_ID,
            filename=HF_FILENAME,
        )
        return duckdb.connect(local_file)
    return duckdb.connect(DUCKDB_FILE)


# Lists every podcast (id, title), newest first, for the title picker.
title_query = """SELECT id, title FROM podcasts
ORDER BY date DESC;
"""

# Parameters: (1) list of podcast ids to search within, (2) query embedding.
# Returns the 10 nearest transcript parts with their podcast metadata.
query = """WITH filtered_podcasts AS (
    SELECT id FROM podcasts
    WHERE id in ?
),
ordered_embeddings AS (
    SELECT embeddings.id, embeddings.part,
    array_distance(embedding, ?::FLOAT[1024]) AS distance
    FROM embeddings
    JOIN filtered_podcasts fp ON embeddings.id = fp.id
    ORDER BY distance
    LIMIT 10
)
SELECT
p.title, p.date, e.start, e.text, e.part, p.audio,
oe.distance,
FROM ordered_embeddings oe
JOIN episodes e
ON oe.id = e.id AND oe.part = e.part
JOIN podcasts p
ON oe.id = p.id
ORDER BY oe.distance;
"""

# NOTE(review): "cannel" is probably a typo for "channel"; left unchanged here
# because it is user-visible text.
st.title("terapyon cannel search")

conn = get_conn()
titles = conn.execute(title_query).df()

selected_title: list[str] | None = st.multiselect("Select title", titles["title"])
if selected_title:
    # Restrict the search to the podcasts whose titles were picked.
    selected_ids = titles.loc[titles["title"].isin(selected_title), "id"].tolist()
else:
    # Nothing picked means "search everything".
    st.write("All titles")
    selected_ids = titles["id"].tolist()

word = st.text_input("Search word")
if word:
    st.write(f"Search word: {word}")
    embeddings = get_embeddings([word], query=True)
    word_embedding = embeddings[0, :]
    result = conn.execute(query, (selected_ids, word_embedding)).df()
    selected = st.dataframe(
        result,
        column_order=["title", "date", "part", "start", "distance", "text", "audio"],
        on_select="rerun",
        selection_mode="single-row",
    )
    if selected:
        rows = selected["selection"].get("rows")
        if rows:
            row = rows[0]
            # Look cells up by column name so a change in the SELECT column
            # order cannot silently pick the wrong field.
            text = result["text"].iloc[row]
            # float() accepts both NumPy and plain Python numeric scalars.
            start = float(result["start"].iloc[row])
            start_delta = timedelta(seconds=start)
            if st.button("オーディオを再生"):
                st.write(f"Start time: {str(start_delta)}")
                # Clamp at zero: segments within the first few seconds would
                # otherwise produce a negative playback offset.
                playback_start = max(start - AUDIO_LEAD_IN_SECONDS, 0.0)
                st.audio(result["audio"].iloc[row], start_time=playback_start)
                st.text(text)