# HengJay's picture
# first commit with git lfs
# cf2d0a9
from timeit import default_timer as timer
import streamlit as st
import chromadb
import pandas as pd
import numpy as np
# configure sqlite3
# __import__('pysqlite3')
# import sys
# sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
# Use the wide page layout for the whole app.
st.set_page_config(layout="wide")

# App Title
st.title("📚 Semantic Search with Vector Database of SNOMED-CT 💡")
st.caption("🔍 Search any SNOMED-CT related description & concept with natural language.")

# Sidebar: how many results to request per query (10-30, default 10).
st.sidebar.title("🔍 Search Setting")
query_number = st.sidebar.slider("Query Numbers", min_value=10, max_value=30, value=10)

# Main panel: free-text query box, pre-filled with an example term.
st.markdown(
    "##### ➡️⌨️ Please input some medical description here, "
    'e.g. "insomnia two nights a week.", "COPD", "Degenerative Joint Disease"'
)
query_text = st.text_input("Input: any medical description snippet", value="Type-2 Diabetes")

# Chroma DB Client
# The on-disk database lives in a directory relative to the working directory.
chroma_client = chromadb.PersistentClient(path="snomed_ct_id_term_1410k")
collection = chroma_client.get_or_create_collection(name="snomed_ct_id_term")

# Tell the user how many results are requested and how many entries
# collection.count() reports for the collection being searched.
st.markdown(
    f"##### ➡️Chroma DB will return {query_number} related instances "
    f"from {collection.count()} collections."
)
# st.warning("Due to the SQLite [file size limit on GitHub](https://docs.github.com/en/repositories/working-with-files/managing-large-files/about-git-large-file-storage), this testing only query from 500k SNOMED-CT instances.", icon="🚨")
# Func: query chroma_db
def query_chroma_db(query_text: str, query_number: int):
    """Query the module-level Chroma ``collection`` with one text snippet.

    Args:
        query_text: Free-text description to search for; it is sent as the
            single entry of ``query_texts``.
        query_number: Number of results to request (passed as ``n_results``).

    Returns:
        Whatever ``collection.query`` returns, requested with distances,
        metadatas and documents included.
    """
    return collection.query(
        query_texts=[query_text],
        n_results=query_number,
        include=["distances", "metadatas", "documents"],
    )
# Func: chroma_db result to df
def get_df_from_chroma_results(results) -> pd.DataFrame:
    """Flatten the first query's matches from a Chroma result into a DataFrame.

    Only index ``[0]`` of each result field is read, i.e. the matches that
    belong to the first query text.

    Args:
        results: Mapping with the keys ``'ids'``, ``'metadatas'``,
            ``'distances'`` and ``'documents'``. Each value is a list whose
            first element holds the per-match values. Every metadata entry
            must contain a ``'concept_id'`` key.

    Returns:
        A DataFrame with the columns ``ids``, ``concept_ids`` (converted to
        ``str``), ``distances`` and ``documents``, one row per match.
    """
    first_query_metadatas = results['metadatas'][0]
    return pd.DataFrame(
        {
            'ids': results['ids'][0],
            # Concept ids are stringified so they display as plain identifiers.
            'concept_ids': [str(meta['concept_id']) for meta in first_query_metadatas],
            'distances': results['distances'][0],
            'documents': results['documents'][0],
        }
    )
# Time only the vector-store lookup itself.
start = timer()
results = query_chroma_db(query_text, query_number)
end = timer()

# Report the elapsed wall-clock time with six decimals.
st.markdown(f"###### ➡️ Query Time : {end - start: .6f} seconds.")
st.divider()

# Convert the raw query result into a table and show it as an
# interactive dataframe of fixed size.
results_df = get_df_from_chroma_results(results)
st.markdown("### 📊 Similar Search Results from Chroma Vector DB")
st.dataframe(results_df, width=1000, height=500)