terapyon committed on
Commit
eeafe79
·
unverified ·
2 Parent(s): 7f1680d ee7e0ce

Merge pull request #4 from terapyon/terada/mt-241-streamlit-ui

Browse files
Files changed (4) hide show
  1. pyproject.toml +1 -0
  2. requirements.txt +2 -1
  3. src/app.py +74 -0
  4. src/embedding.py +7 -1
pyproject.toml CHANGED
@@ -14,6 +14,7 @@ dependencies = [
14
  "pyarrow>=18.1.0",
15
  "sentence-transformers>=3.3.1",
16
  "sentencepiece>=0.2.0",
 
17
  "torch>=2.5.1",
18
  "tqdm>=4.67.1",
19
  "unidic-lite>=1.0.8",
 
14
  "pyarrow>=18.1.0",
15
  "sentence-transformers>=3.3.1",
16
  "sentencepiece>=0.2.0",
17
+ "streamlit>=1.41.1",
18
  "torch>=2.5.1",
19
  "tqdm>=4.67.1",
20
  "unidic-lite>=1.0.8",
requirements.txt CHANGED
@@ -9,4 +9,5 @@ pandas
9
  numpy
10
  polars
11
  pyarrow
12
- duckdb
 
 
9
  numpy
10
  polars
11
  pyarrow
12
+ duckdb
13
+ streamlit
src/app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import duckdb
3
+ from embedding import get_embeddings
4
+ from config import DUCKDB_FILE
5
+
6
+
7
@st.cache_resource
def get_conn():
    """Open the DuckDB database and cache the connection.

    ``st.cache_resource`` makes Streamlit hand back the same connection
    object on every rerun instead of reconnecting each time.
    """
    connection = duckdb.connect(DUCKDB_FILE)
    return connection
10
+
11
+
12
# All episode titles, newest first — feeds the title multiselect widget.
title_query = """SELECT id, title FROM podcasts
ORDER BY date DESC;
"""

# Vector search: restrict to the selected podcast ids, rank embedding rows by
# distance to the query vector (closest 10), then join back to episode/podcast
# metadata for display.
# NOTE(review): `WHERE id in ?` relies on DuckDB binding a Python list to an
# IN clause — confirm the pinned duckdb version supports this.
query = """WITH filtered_podcasts AS (
    SELECT id
    FROM podcasts
    WHERE id in ?
),
ordered_embeddings AS (
    SELECT embeddings.id, embeddings.part
    FROM embeddings
    JOIN filtered_podcasts fp ON embeddings.id = fp.id
    ORDER BY array_distance(embedding, ?::FLOAT[1024])
    LIMIT 10
)
SELECT
    p.title,
    p.date,
    e.start,
    e.text,
    e.part,
    p.audio
FROM
    ordered_embeddings oe
JOIN
    episodes e
ON
    oe.id = e.id AND oe.part = e.part
JOIN
    podcasts p
ON
    oe.id = p.id;
"""
46
+
47
st.title("terapyon channel search")  # fix user-facing typo: "cannel" -> "channel"

conn = get_conn()
titles = conn.execute(title_query).df()

# Optional filter: search only inside the episodes the user picked.
# st.multiselect always returns a list (possibly empty), never None,
# so the annotation is list[str] rather than list[str] | None.
selected_title: list[str] = st.multiselect("Select title", titles["title"])
if selected_title:
    selected_ids = titles.loc[titles["title"].isin(selected_title), "id"].tolist()
else:
    st.write("All titles")
    selected_ids = titles["id"].tolist()

word = st.text_input("Search word")
if word:
    st.write(f"Search word: {word}")
    # Embed the search word with the query prefix; take the single row
    # produced for our one-element input list.
    embeddings = get_embeddings([word], query=True)
    word_embedding = embeddings[0, :]

    result = conn.execute(query, (selected_ids, word_embedding)).df()
    selected = st.dataframe(
        result,
        column_order=["title", "date", "part", "start", "text", "audio"],
        on_select="rerun",
        selection_mode="single-row",
    )
    if selected:
        rows = selected["selection"].get("rows")
        if rows:
            row = rows[0]
            # Show the transcript text of the selected row. Look the column up
            # by name instead of the brittle positional index iloc[row, 3].
            st.text(result.iloc[row]["text"])
src/embedding.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import numpy as np
2
  from sentence_transformers import SentenceTransformer
3
 
@@ -5,7 +6,11 @@ MODEL_NAME = "cl-nagoya/ruri-large"
5
  PREFIX_QUERY = "クエリ: " # "query: "
6
  PASSAGE_QUERY = "文章: " # "passage: "
7
 
8
- model = SentenceTransformer(MODEL_NAME)
 
 
 
 
9
 
10
 
11
  def get_embeddings(texts: list[str], query=False, passage=False) -> np.ndarray:
@@ -14,6 +19,7 @@ def get_embeddings(texts: list[str], query=False, passage=False) -> np.ndarray:
14
  if passage:
15
  texts = [PASSAGE_QUERY + text for text in texts]
16
  # texts = [text[i : i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
 
17
  embeddings = model.encode(texts)
18
  # print(embeddings.shape)
19
  # print(type(embeddings))
 
1
+ import streamlit as st
2
  import numpy as np
3
  from sentence_transformers import SentenceTransformer
4
 
 
6
  PREFIX_QUERY = "クエリ: " # "query: "
7
  PASSAGE_QUERY = "文章: " # "passage: "
8
 
9
+
10
@st.cache_resource
def get_sentence_model():
    """Load the SentenceTransformer model once; Streamlit caches the instance."""
    return SentenceTransformer(MODEL_NAME)
14
 
15
 
16
  def get_embeddings(texts: list[str], query=False, passage=False) -> np.ndarray:
 
19
  if passage:
20
  texts = [PASSAGE_QUERY + text for text in texts]
21
  # texts = [text[i : i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
22
+ model = get_sentence_model()
23
  embeddings = model.encode(texts)
24
  # print(embeddings.shape)
25
  # print(type(embeddings))