podcast-search / src /store.py
terapyon's picture
srtの分割を1分にし、configなどを整え、READMEを書いた
d788666
import duckdb
from embedding import get_embeddings
from config import DUCKDB_FILE
from config import PODCAST_TITLE_LIST, EPISODES_PARQUET
def create_table():
    """Create the ``podcasts``, ``episodes`` and ``embeddings`` tables.

    Opens the database at ``DUCKDB_FILE`` and issues three plain
    ``CREATE TABLE`` statements (no ``IF NOT EXISTS``), so running this
    against a database that already contains these tables is expected to
    raise. The connection is closed whether or not the statements succeed.
    """
    podcasts_create = (
        "CREATE TABLE podcasts (\n"
        "id BIGINT PRIMARY KEY,\n"
        "title TEXT, date DATE, guests TEXT[], length BIGINT, audio TEXT\n"
        ");\n"
    )
    # One row per (episode id, part); `end_` carries a trailing underscore
    # in the schema itself.
    episodes_create = (
        "CREATE TABLE episodes (\n"
        "id BIGINT, part BIGINT, start BIGINT, end_ BIGINT, text TEXT,\n"
        "PRIMARY KEY (id, part)\n"
        ");\n"
    )
    # Fixed-size 1024-element vectors, keyed like `episodes`.
    embeddings_create = (
        "CREATE TABLE embeddings (\n"
        "id BIGINT, part BIGINT, embedding FLOAT[1024],\n"
        "PRIMARY KEY (id, part)\n"
        ");\n"
    )

    conn = duckdb.connect(DUCKDB_FILE)
    try:
        for ddl in (podcasts_create, episodes_create, embeddings_create):
            conn.execute(ddl)
        conn.commit()
    finally:
        # Release the database file even when a statement fails.
        conn.close()
    print("Tables created.")
def insert_podcast():
    """Load podcast metadata from a Parquet file into ``podcasts``.

    Reads ``id``, ``title``, ``date``, ``length`` and ``audio`` from the
    Parquet file at ``PODCAST_TITLE_LIST`` and inserts them into the
    ``podcasts`` table; the ``guests`` column is filled with an empty list
    for every row. The connection is closed whether or not the insert
    succeeds.
    """
    sql = (
        "INSERT INTO podcasts\n"
        "SELECT id, title, date, [], length, audio\n"
        "FROM read_parquet(?);\n"
    )
    conn = duckdb.connect(DUCKDB_FILE)
    try:
        # The Parquet path is bound as a parameter rather than spliced
        # into the SQL text.
        conn.execute(sql, [PODCAST_TITLE_LIST])
        conn.commit()
    finally:
        # Release the database file even when the load fails.
        conn.close()
def insert_episodes():
    """Load episode transcript parts from a Parquet file into ``episodes``.

    Reads ``id``, ``part``, ``start``, ``end_`` and ``text`` from the
    Parquet file at ``EPISODES_PARQUET`` and inserts them into the
    ``episodes`` table. The connection is closed whether or not the insert
    succeeds.
    """
    sql = (
        "INSERT INTO episodes\n"
        "SELECT id, part, start, end_, text\n"
        "FROM read_parquet(?);\n"
    )
    conn = duckdb.connect(DUCKDB_FILE)
    try:
        # The Parquet path is bound as a parameter rather than spliced
        # into the SQL text.
        conn.execute(sql, [EPISODES_PARQUET])
        conn.commit()
    finally:
        # Release the database file even when the load fails.
        conn.close()
def embed_store():
    """Embed every episode part and store the vectors in ``embeddings``.

    Reads all ``(id, part, text)`` rows from ``episodes``, passes the texts
    to ``get_embeddings`` in a single call, and inserts one ``embeddings``
    row per episode part. Does nothing when ``episodes`` is empty. The
    connection is closed whether or not embedding or insertion succeeds.

    NOTE(review): each returned embedding must expose ``.tolist()``
    (presumably a NumPy array) and is expected to have 1024 elements to fit
    the ``FLOAT[1024]`` column; confirm against ``get_embeddings``.
    """
    conn = duckdb.connect(DUCKDB_FILE)
    try:
        data = conn.execute("SELECT id, part, text FROM episodes;").df()
        targets = data["text"].tolist()
        if not targets:
            # Nothing to embed; avoid calling the embedding backend with
            # an empty batch.
            return

        embeddings = get_embeddings(targets)
        # Embeddings are paired with their source rows by position.
        for id_, part, emb in zip(data["id"], data["part"], embeddings):
            conn.execute(
                "INSERT INTO embeddings VALUES (?, ?, ?)",
                (id_, part, emb.tolist()),
            )
        conn.commit()
    finally:
        # Release the database file even when embedding or insert fails.
        conn.close()
def create_index():
    """Build the HNSW vector index ``embeddings_index`` on ``embeddings``.

    Installs (if needed) and loads DuckDB's ``vss`` extension, enables
    experimental HNSW persistence, then creates the index over the
    ``embedding`` column. The connection is closed whether or not index
    creation succeeds.
    """
    conn = duckdb.connect(DUCKDB_FILE)
    try:
        # INSTALL is skipped by DuckDB when the extension is already
        # present; without it LOAD fails on a fresh environment.
        conn.execute("INSTALL vss;")
        conn.execute("LOAD vss;")
        # Per the DuckDB vss documentation, this flag must be set before an
        # HNSW index can be created in a file-backed database.
        conn.execute("SET hnsw_enable_experimental_persistence=true;")
        conn.execute(
            "CREATE INDEX embeddings_index\n"
            "ON embeddings USING HNSW (embedding);"
        )
        conn.commit()
    finally:
        # Release the database file even when index creation fails.
        conn.close()
if __name__ == "__main__":
    import sys

    # Command name -> pipeline stage, listed in the order "all" runs them
    # (dicts preserve insertion order).
    steps = {
        "create": create_table,
        "podcastinsert": insert_podcast,
        "episodeinsert": insert_episodes,
        "embed": embed_store,
        "index": create_index,
    }
    # A single usage line covering every accepted command, derived from
    # the table so the two cannot drift apart.
    usage = "Usage: python store.py {" + "|".join([*steps, "all"]) + "}"

    args = sys.argv
    # Exactly one argument is required, and it must be a known command.
    if len(args) != 2 or (args[1] != "all" and args[1] not in steps):
        print(usage)
        sys.exit(1)

    if args[1] == "all":
        for step in steps.values():
            step()
    else:
        steps[args[1]]()