Muhammad Haris committed on
Commit
f08ecfd
·
1 Parent(s): ec90c9c

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +62 -0
  2. medical_data.csv +0 -0
app.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path

import pandas as pd
import streamlit as st
import torch
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2Tokenizer, GPT2LMHeadModel
8
+
9
+ # Load medical data
10
+ excel_file_path = r"C:\Users\HEHEBOI\Desktop\GPT2Final\medical_data.csv"
11
+ try:
12
+ medical_df = pd.read_csv(excel_file_path, encoding='utf-8')
13
+ except UnicodeDecodeError:
14
+ medical_df = pd.read_csv(excel_file_path, encoding='latin1')
15
+
16
+ # TF-IDF Vectorization
17
+ vectorizer = TfidfVectorizer(stop_words='english')
18
+ X_tfidf = vectorizer.fit_transform(medical_df['Questions'])
19
+
20
+ # Load pre-trained GPT-2 model and tokenizer
21
+ model_name = "sshleifer/tiny-gpt2"
22
+ tokenizer = GPT2Tokenizer.from_pretrained(model_name)
23
+ model = GPT2LMHeadModel.from_pretrained(model_name)
24
+
25
+ # Load pre-trained Sentence Transformer model
26
+ sbert_model_name = "paraphrase-MiniLM-L6-v2"
27
+ sbert_model = SentenceTransformer(sbert_model_name)
28
+
29
+ # Function to answer medical questions using a combination of TF-IDF, LLM, and semantic similarity
30
+ def get_medical_response(question, vectorizer, X_tfidf, model, tokenizer, sbert_model, medical_df):
31
+ # TF-IDF Cosine Similarity
32
+ question_vector = vectorizer.transform([question])
33
+ tfidf_similarities = cosine_similarity(question_vector, X_tfidf).flatten()
34
+
35
+ # Find the most similar question using semantic similarity
36
+ question_embedding = sbert_model.encode(question, convert_to_tensor=True)
37
+ similarities = util.pytorch_cos_sim(question_embedding, sbert_model.encode(medical_df['Questions'].tolist(), convert_to_tensor=True)).flatten()
38
+ max_sim_index = similarities.argmax().item()
39
+
40
+ # LLM response generation
41
+ input_text = "Medical Bot: " + medical_df.iloc[max_sim_index]['Questions']
42
+ input_ids = tokenizer.encode(input_text, return_tensors="pt")
43
+ attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
44
+ pad_token_id = tokenizer.eos_token_id
45
+ lm_output = model.generate(input_ids, max_length=150, num_return_sequences=1, no_repeat_ngram_size=2, attention_mask=attention_mask, pad_token_id=pad_token_id)
46
+ lm_generated_response = tokenizer.decode(lm_output[0], skip_special_tokens=True)
47
+
48
+ # Compare similarities and choose the best response
49
+ if tfidf_similarities.max() > 0.5:
50
+ tfidf_index = tfidf_similarities.argmax()
51
+ return medical_df.iloc[tfidf_index]['Answers']
52
+ else:
53
+ return lm_generated_response
54
+
55
+ # Streamlit app
56
+ st.title("Medical Bot")
57
+
58
+ user_input = st.text_input("You:")
59
+ if user_input.lower() == "exit":
60
+ st.stop()
61
+ response = get_medical_response(user_input, vectorizer, X_tfidf, model, tokenizer, sbert_model, medical_df)
62
+ st.text_area("Bot's Response:", response)
medical_data.csv ADDED
The diff for this file is too large to render. See raw diff