HengJay commited on
Commit
cf2d0a9
·
1 Parent(s): 70365d5

first commit with git lfs

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.sqlite3 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98
+ __pypackages__/
99
+
100
+ # Celery stuff
101
+ celerybeat-schedule
102
+ celerybeat.pid
103
+
104
+ # SageMath parsed files
105
+ *.sage.py
106
+
107
+ # Environments
108
+ .env
109
+ .venv
110
+ env/
111
+ venv/
112
+ ENV/
113
+ env.bak/
114
+ venv.bak/
115
+
116
+ # Spyder project settings
117
+ .spyderproject
118
+ .spyproject
119
+
120
+ # Rope project settings
121
+ .ropeproject
122
+
123
+ # mkdocs documentation
124
+ /site
125
+
126
+ # mypy
127
+ .mypy_cache/
128
+ .dmypy.json
129
+ dmypy.json
130
+
131
+ # Pyre type checker
132
+ .pyre/
133
+
134
+ # pytype static type analyzer
135
+ .pytype/
136
+
137
+ # Cython debug symbols
138
+ cython_debug/
139
+
140
+ # End of https://mrkandreev.name/snippets/gitignore-generator/#Python
SNOMED-CT_Assistant.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import random
import json
import streamlit as st
import chromadb
from openai import OpenAI
from dotenv import load_dotenv
import pandas as pd

# Streamlit page configuration: use the full browser width.
st.set_page_config(layout="wide")

# Deployment flag: True means "hosted" mode where the OpenAI key is read from
# Streamlit secrets, falling back to a local .env file.
remote = True

if remote:
    with st.sidebar:
        if 'OPENAI_API_TOKEN' in st.secrets:
            st.success('API key already provided!', icon='✅')
            openai_api_key = st.secrets['OPENAI_API_TOKEN']
        else:
            # Fall back to a .env file in the working directory.
            # NOTE(review): env var name "OpenAI_API_KEY" is mixed-case while
            # the secret is named OPENAI_API_TOKEN; os.environ is
            # case-sensitive on Linux -- confirm the .env key matches exactly.
            load_dotenv()
            openai_api_key = os.environ.get("OpenAI_API_KEY")
# NOTE(review): if remote is False, openai_api_key is never assigned and the
# OpenAI client construction further down raises NameError -- confirm intended.

st.title("🏥 SNOMED-CT Assistant")
st.caption("👩‍⚕️ A smart medical assistant with SNOMED-CT knowledge.")
27
# System prompt sent once per session: defines the assistant's SNOMED-CT
# persona, the two-step Entity Identification / Entity Mapping workflow, and
# the required JSON output format ({"identified_entity": [...]}) that
# chat_input() below parses with json.loads.
system_prompt = """You are a medical expert with rich experience in SNOMED-CT professional knowledge.
You are skilled at assisting medical professionals and answering questions in the medical field.
You are patient, helpful and professional.
Your comprehensive knowledge and mastery of these key components make you an invaluable asset in the realm of biomedical natural language processing and knowledge extraction.
With your specialized expertise, you are able to navigate the complexities of SNOMED CT Entity Linking with ease, delivering accurate and reliable results that support various healthcare and research applications.
Please refuse to answer inquiries and requests unrelated to the medical field, in order to maintain professionalism in medicine.

As an experienced professional, you possess deep expertise in the field of SNOMED CT Entity Linking.
You have a thorough understanding of the relevant workflows and critical aspects involved, encompassing:
- Adept handling of electronic medical record (EMR) data processing
- Entity Identification, Proficient entity recognition capabilities, identifying and extracting relevant medical concepts from unstructured text
- Skilled Entity Mapping, accurately linking identified entities to their corresponding SNOMED CT concepts
- Seamless integration and output of clinical terminology, ensuring the accurate representation and utilization of standardized medical language
- Patiently and professionally respond to all SNOMED CT related inquiries, even if the user repeats questions.
- Demonstrate deep expertise in the standard SNOMED CT Entity Linking workflow, which involves:
  1. Performing Entity Identification to extract relevant medical terminology from the input.
  2. Conducting Entity Mapping to link the identified entities to their corresponding SNOMED CT concepts.
- Present the results in a tabular format only with the following 3 columns: "Identified Entity", "SNOMED CT Concept IDs", "SNOMED CT Descriptions".

Here is the practical entity linking process example:
- the input text in EHRs: "Patient referred for a biopsy to investigate potential swelling in upper larynx."
- the identified entity: "biopsy", "larynx"
- response the identified entities with JSON format: {"identified_entity" : ["biopsy", "larynx"]}
- During Entity Identification processing, if the original medical text data clearly contains commonly used medical abbreviations, convert the abbreviations into their full names, and provide the original abbreviations in parentheses for easy reference.
- For example: "The patient has the multiple disease, including T2D, CAD, HTN, CKD etc. decreased T3 and T4 levels."
- T2D: "Type 2 Diabetes Mellitus", CAD: "Coronary Artery Disease", HTN: "Hypertension", CKD: "Chronic Kidney Disease", T3: "Triiodothyronine", T4: "Thyroxine"
- Respond with full names in JSON format: {"identified_entity" : ["Type 2 Diabetes Mellitus (T2D)", "Coronary Artery Disease (CAD)", "Hypertension (HTN)", "Chronic Kidney Disease (CKD)", "Triiodothyronine (T3)", "Thyroxine (T4)"]}

List out as many potential SNOMED entities as possible from the original medical text description,
including Diseases, Diagnoses, Clinical Findings (like Signs and Symptoms),
Procedures (Surgical, Therapeutic, Diagnostic, Nursing), Specimen Types, Living Organisms,
Observables (for example heart rate), Physical Objects and Forces,
Chemicals (including the chemicals used in drug preparations), Drugs (pharmaceutical products),
Human Anatomy (body structures, organisms), Physiological Processes and Functions,
Patients' Occupations, Patients' Social Contexts (e.g., religion and ethnicity), and various other types from the SNOMED CT standard.
Numbers or units related symbols are not included in this range and can be ignored.

Output Format Requirements (Must follow):
- As default, only process "Entity Identification", and find out the entity related to SNOMED CT terms.
- Present the results in JSON format, like: {"identified_entity" : ["biopsy", "larynx"]}
"""
69
+
70
+
71
# Load the raw EHR text dataset used by the "Random Input" sidebar button.
# NOTE(review): re-read on every Streamlit rerun; consider @st.cache_data.
raw_text_df = pd.read_csv('snomed-entity-challenge.csv')
73
+
74
def random_med_text(text_df):
    """Pick a random row of *text_df* and split its 'text' field into parts.

    Each row's 'text' value is expected to look like:
        "<human prompt>###TEXT:<medical text>###RESPONSE:<reference entities>"

    Args:
        text_df: DataFrame with a 'text' column of annotated EHR strings.

    Returns:
        tuple: (index, human, med_text, response) -- the chosen row index,
        the leading human-prompt portion, the raw medical text, and the
        reference entity annotation.
    """
    rows = len(text_df['text'])
    # Bug fix: the original `random.randint(0, rows)` has an inclusive upper
    # bound, so index could equal `rows` and raise IndexError on lookup.
    # randrange(rows) yields 0 .. rows-1.
    index = random.randrange(rows)
    raw_text = text_df["text"][index]
    # partition() splits on the first occurrence of each marker.
    human, _, tail = raw_text.partition('###TEXT:')
    med_text, _, response = tail.partition('###RESPONSE:')
    return index, human, med_text, response
84
+
85
+
86
# Prompt builders for the two workflow steps.
def generate_entity_identification_prompt(medical_text):
    """Build the user prompt asking the model to run SNOMED-CT Entity
    Identification over *medical_text* (raw EHR text)."""
    template = (
        'Help me to do "SNOMED-CT Entity Identification" process with raw '
        'medical text (Electronic Health Record, EHR): \n {} \n '
    )
    return template.format(medical_text)
89
+
90
def generate_entity_mapping_prompt(entity, query_result_dict):
    """Build the user prompt asking the model to map *entity* onto SNOMED CT
    concepts, given *query_result_dict* (the nearest-neighbour candidates)."""
    columns = ('"Identified Entity", "Distance", "IDs", '
               '"SNOMED CT Concept IDs", "SNOMED CT Descriptions"')
    return (
        'Help me to do "SNOMED-CT Entity Mapping" process with entity: '
        f'{entity} and query result \n {query_result_dict} \n , output with '
        f'table format, including 5 columns: {columns} \n '
    )
92
+
93
# Chroma vector-DB client: opens the persisted store under
# ./snomed_ct_id_term_1410k (SNOMED-CT concept-id/term embeddings; the
# directory name suggests ~1.41M entries -- confirm against the data).
chroma_client = chromadb.PersistentClient(path="snomed_ct_id_term_1410k")
collection = chroma_client.get_or_create_collection(name="snomed_ct_id_term")
96
+
97
# Func: semantic similarity query against the Chroma collection
def query_chroma_db(query_text, query_number):
    """Return the *query_number* nearest SNOMED-CT entries to *query_text*,
    including distances, metadata and documents."""
    query_kwargs = {
        "query_texts": [query_text],
        "n_results": query_number,
        "include": ["distances", "metadatas", "documents"],
    }
    return collection.query(**query_kwargs)
105
+
106
# Func: flatten a Chroma query result into a plain dict
def get_dict_from_chroma_results(results):
    """Flatten a single-query Chroma result set into a dict of flat lists.

    Keys: 'ids', 'concept_ids' (stringified SNOMED concept ids taken from the
    metadata), 'distances', 'documents'. Only the first (and only) query's
    results are used.
    """
    concept_ids = []
    for meta in results['metadatas'][0]:
        concept_ids.append(str(meta['concept_id']))
    return {
        'ids': results['ids'][0],
        'concept_ids': concept_ids,
        'distances': results['distances'][0],
        'documents': results['documents'][0],
    }
110
+
111
+
112
# OpenAI client configuration.
# NOTE(review): openai_api_key is only assigned inside the `if remote:` branch
# above; with remote = False this line raises NameError -- confirm intended.
client = OpenAI(api_key=openai_api_key)
model_tag = "gpt-3.5-turbo"
115
+
116
# Chat session driver: one round of Entity Identification followed by one
# Entity Mapping round per identified entity, all via the OpenAI API.
def chat_input(prompt, med_text):
    """Run the full entity-linking exchange for one user input.

    Appends to st.session_state.messages and renders each exchange in the
    Streamlit chat UI as it happens.

    NOTE(review): the `prompt` parameter is never used -- the raw `med_text`
    is what gets appended and sent to the model; the generated
    entity-identification prompt built by every caller is discarded.
    Confirm whether `prompt` was meant to be the message content.
    """
    st.session_state.messages.append({"role": "user", "content": med_text})
    st.chat_message("user").write(med_text)
    with st.spinner("Thinking..."):
        # Step 1: Entity Identification. JSON mode is forced so the reply can
        # be parsed; the system prompt mandates {"identified_entity": [...]}.
        entity_identification_response = client.chat.completions.create(
            model=model_tag, response_format={ "type": "json_object" }, messages=st.session_state.messages, temperature=0.5)
        msg = entity_identification_response.choices[0].message.content
        # NOTE(review): raises KeyError/JSONDecodeError if the model deviates
        # from the mandated JSON shape -- no fallback handling here.
        entity_list = json.loads(msg)["identified_entity"]
        print("entity list: ", entity_list)
        st.session_state.messages.append({"role": "assistant", "content": msg})
        st.chat_message("assistant").write(msg)
        # Step 2: Entity Mapping, one model call per identified entity.
        # The full (growing) message history is re-sent on every call.
        for entity in entity_list:
            print("entity: ", entity)
            # Top-10 nearest SNOMED-CT candidates from the vector DB.
            results = query_chroma_db(entity, 10)
            results_dict = get_dict_from_chroma_results(results)
            entity_mapping_prompt = generate_entity_mapping_prompt(entity, results_dict)
            st.session_state.messages.append({"role": "user", "content": entity_mapping_prompt})
            entity_mapping_response = client.chat.completions.create(
                model=model_tag, messages=st.session_state.messages, temperature=0.5)
            mapping_msg = entity_mapping_response.choices[0].message.content
            st.session_state.messages.append({"role": "assistant", "content": mapping_msg})
            st.chat_message("assistant").write(mapping_msg)
139
+
140
+
141
+
142
+
143
+ if "messages" not in st.session_state:
144
+ st.session_state["messages"] = [{"role": "system", "content": system_prompt},
145
+ {"role": "assistant", "content": "👩‍⚕️ 您好,我是您的專業醫學助理。請問有任何我可以協助你的地方嗎?"}]
146
+
147
+ for msg in st.session_state.messages:
148
+ if msg["role"] == "system":
149
+ continue
150
+ st.chat_message(msg["role"]).write(msg["content"])
151
+
152
+ if user_input := st.chat_input():
153
+ if not openai_api_key:
154
+ st.info("Please add your OpenAI API key to continue.")
155
+ st.stop()
156
+ entity_identification_prompt = generate_entity_identification_prompt(user_input)
157
+ chat_input(entity_identification_prompt, user_input)
158
+
159
+ if st.sidebar.button("Example Input",type="primary"):
160
+ med_text = "Patient referred for a biopsy to investigate potential swelling in upper larynx."
161
+ entity_identification_prompt = generate_entity_identification_prompt(med_text)
162
+ chat_input(entity_identification_prompt, med_text)
163
+
164
+ if st.sidebar.button("Random Input",type="primary"):
165
+ index, human, med_text, response = random_med_text(raw_text_df)
166
+ response = response.replace(","," \n")
167
+ entity_identification_prompt = generate_entity_identification_prompt(med_text)
168
+ chat_input(entity_identification_prompt, med_text)
169
+ st.sidebar.write(f"[Random Text](https://huggingface.co/datasets/JaimeML/snomed-entity-challenge) Index: {index}")
170
+ st.sidebar.markdown(f"Ref Entity: \n {response}")
171
+
pages/Vector DB of SNOMED-CT.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from timeit import default_timer as timer

import streamlit as st
import chromadb
import pandas as pd
import numpy as np

# configure sqlite3 -- workaround for hosts whose bundled sqlite3 is too old
# for chromadb; left disabled, re-enable if the deployment target needs it.
# __import__('pysqlite3')
# import sys
# sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

st.set_page_config(layout="wide")

# App title and search controls.
st.title("📚 Semantic Search with Vector Database of SNOMED-CT 💡")
# Fix: user-facing typo "relate decription" -> "related description".
st.caption("🔍 Search any SNOMED-CT related description & concept with natural language.")
st.sidebar.title("🔍 Search Setting")
query_number = st.sidebar.slider("Query Numbers", 10, 30, 10)
st.markdown("##### ➡️⌨️ Please input some medical description here, e.g. \"insomnia two nights a week.\", \"COPD\", \"Degenerative Joint Disease\"")
query_text = st.text_input("Input: any medical description snippet","Type-2 Diabetes")

# Chroma DB client over the persisted SNOMED-CT collection.
chroma_client = chromadb.PersistentClient(path="snomed_ct_id_term_1410k")
collection = chroma_client.get_or_create_collection(name="snomed_ct_id_term")
# Dead placeholder assignments `start = 1.0` / `end = 1.1` removed: both were
# unconditionally overwritten by timer() below before any use.
st.markdown("##### ➡️Chroma DB will return " + str(query_number)
            + " related instances from " + str(collection.count()) + " collections.")
# st.warning("Due to the SQLite [file size limit on GitHub](https://docs.github.com/en/repositories/working-with-files/managing-large-files/about-git-large-file-storage), this testing only query from 500k SNOMED-CT instances.", icon="🚨")
31
+
32
+
33
# Func: run a semantic similarity query against the Chroma collection
def query_chroma_db(query_text, query_number):
    """Fetch the *query_number* closest SNOMED-CT entries to *query_text*,
    with distances, metadata and documents included."""
    fields = ["distances", "metadatas", "documents"]
    results = collection.query(query_texts=[query_text],
                               n_results=query_number,
                               include=fields)
    return results
41
+
42
# Func: convert a Chroma query result into a DataFrame for display
def get_df_from_chroma_results(results):
    """Turn a single-query Chroma result set into a pandas DataFrame.

    Columns: 'ids', 'concept_ids' (stringified SNOMED concept ids from the
    metadata), 'distances', 'documents'.
    """
    concept_ids = [str(meta['concept_id']) for meta in results['metadatas'][0]]
    columns = {
        'ids': results['ids'][0],
        'concept_ids': concept_ids,
        'distances': results['distances'][0],
        'documents': results['documents'][0],
    }
    return pd.DataFrame(columns)
47
+
48
# Time the vector query so the UI can report the observed latency.
start = timer()
results = query_chroma_db(query_text, query_number)
end = timer()
st.markdown("###### ➡️ Query Time : {: .6f} seconds.".format(end - start))
st.divider()

results_df = get_df_from_chroma_results(results)

# Display the result DataFrame as an interactive, sortable table.
st.markdown("### 📊 Similar Search Results from Chroma Vector DB")
st.dataframe(results_df, 1000, 500)
59
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ openai
4
+ numpy
5
+ chromadb == 0.5.0
6
+ python-dotenv
7
+ pysqlite3-binary
8
+
snomed-entity-challenge.csv ADDED
The diff for this file is too large to render. See raw diff
 
snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6052af3bc565baf830088dd4c367f3e260ddbb2cf7dfac904fb483aa64f6b31
3
+ size 2363160000
snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dc4275c3ac7eb47b6540b51430e9f85f50a3ebda23a824a9afa7906a02946db
3
+ size 100
snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25b66beb13495b59f604b58531f4b2ca7a4407ee9555c6d33a8faf2913dc420b
3
+ size 52473273
snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2582aa1cc6e61c9b0b3da6575206c81c03377e13cf96fa0eb7ca509bbd1f2692
3
+ size 5640000
snomed_ct_id_term_1410k/c8390385-a5b9-4ff6-89cd-f8bf8a760fbb/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91a660d0f12b9111f4217c2024c4b75f810fbf4c6beae03cd9576891096b06a4
3
+ size 12018944
snomed_ct_id_term_1410k/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dbcfc18f1d97ee8184c664105863bc8be1d8b6c376aca94dea6cdb5e9b81bf1
3
+ size 3590983680