### Data Preparation

In [1]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [2]:
import opendatasets as od

# Kaggle page of the MedInfo dataset; the download lands in ./medinfo
MEDINFO_DATASET_URL = 'https://www.kaggle.com/datasets/hassaanidrees/medinfo?select=MedInfo2019-QA-Medications.xlsx'
od.download(MEDINFO_DATASET_URL)

Dataset URL: https://www.kaggle.com/datasets/hassaanidrees/medinfo
Downloading medinfo.zip to ./medinfo


100%|██████████| 159k/159k [00:00<00:00, 480kB/s]







In [3]:
# Import pandas for data analysis
import pandas as pd

# Read the spreadsheet from the folder the download step created (./medinfo,
# relative to the working directory) instead of a hard-coded Colab-only path.
df = pd.read_excel("medinfo/MedInfo2019-QA-Medications.xlsx")

# Keep only the two text columns used for fine-tuning
df = df[['Question','Answer']]

In [None]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Question,Answer
0,how does rivatigmine and otc sleep medicine in...,tell your doctor and pharmacist what prescript...
1,how does valium affect the brain,Diazepam is a benzodiazepine that exerts anxio...
2,what is morphine,Morphine is a pain medication of the opiate fa...
3,what are the milligrams for oxycodone e,… 10 mg … 20 mg … 40 mg … 80 mg ...
4,81% aspirin contain resin and shellac in it. ?,Inactive Ingredients Ingredient Name


In [None]:
# Full text of the question in the row labelled 0
df.loc[0, 'Question']

'how does rivatigmine and otc sleep medicine interact'

In [None]:
# Full text of the answer in the row labelled 0
df.loc[0, 'Answer']

"tell your doctor and pharmacist what prescription and nonprescription medications, vitamins, nutritional supplements, and herbal products you are taking or plan to take. Be sure to mention any of the following: antihistamines; aspirin and other nonsteroidal anti-inflammatory medications (NSAIDs) such as ibuprofen (Advil, Motrin) and naproxen (Aleve, Naprosyn); bethanechol (Duvoid, Urecholine); ipratropium (Atrovent, in Combivent, DuoNeb); and medications for Alzheimer's disease, glaucoma, irritable bowel disease, motion sickness, ulcers, or urinary problems. Your doctor may need to change the doses of your medications or monitor you carefully for side effects."

In [None]:
df.shape # (rows, columns) -> 690 rows, 2 columns in the recorded run

(690, 2)

In [None]:
!pip install cleantext

Collecting cleantext
  Downloading cleantext-1.1.4-py3-none-any.whl.metadata (3.5 kB)
Downloading cleantext-1.1.4-py3-none-any.whl (4.9 kB)
Installing collected packages: cleantext
Successfully installed cleantext-1.1.4


In [None]:
import cleantext

# Normalise every entry of a text collection with cleantext
def clean(textdata):
    """Return a list holding the cleantext-processed form of each item in `textdata`.

    Every item is converted with `str()` before cleaning, so non-string
    entries are processed as their text representation.

    Args:
        textdata: Iterable of raw entries (e.g. a DataFrame column).

    Returns:
        A new list with one cleaned string per input entry, in input order.
    """
    # NOTE(review): clean_all=True is passed together with the per-step flags;
    # confirm in the cleantext docs which of them takes precedence.
    options = dict(
        extra_spaces=True,
        lowercase=True,
        stopwords=False,
        stemming=False,
        numbers=True,
        punct=True,
        clean_all=True,
    )
    return [cleantext.clean(str(entry), **options) for entry in textdata]

In [None]:
# Replace both text columns with their cleaned versions

df['Question'] = clean(df['Question'])
df['Answer'] = clean(df['Answer'])

In [None]:
# Write the cleaned question/answer pairs to a new CSV file (row index omitted)
df.to_csv(path_or_buf="cleaned_med_QA_data.csv", index=False)

### GPT-2 Model

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import torch
from datasets import load_dataset

# Pretrained checkpoint that both the tokenizer and the model are loaded from
BASE_CHECKPOINT = 'gpt2'

tokenizer = GPT2Tokenizer.from_pretrained(BASE_CHECKPOINT)
model = GPT2LMHeadModel.from_pretrained(BASE_CHECKPOINT)



In [None]:
# Use the end-of-sequence token as the padding token, so padded positions
# hold the same token id as EOS
tokenizer.pad_token = tokenizer.eos_token

# Maximum sequence length reported by the tokenizer, printed for reference.
# NOTE(review): the tokenization cell truncates to a literal 128 and does not read this variable.
max_length = tokenizer.model_max_length
print(max_length)

1024


In [None]:
# Read the cleaned QA CSV through the 'datasets' library as a single 'train' split
dataset = load_dataset(
    'csv',
    data_files={'train': 'cleaned_med_QA_data.csv'},
    split='train',
)

In [None]:
#Function to tokenize questions and answers and prepare them for the model
def tokenize_function(examples):
    """Tokenize a batch of question/answer pairs for causal language modelling.

    1. Combine each question and answer into a single input string.
    2. Tokenize the combined text with the module-level tokenizer, padding and
       truncating every sequence to 128 tokens.
    3. Set the labels to a copy of the input_ids.
    4. Return the tokenized output.

    Args:
        examples: Batch mapping with 'Question' and 'Answer' entries of equal
            length, as supplied by `dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output with an added 'labels' entry.
    """
    # str() coerces any non-string cell to text before concatenation
    combined_text = [
        str(q) + " " + str(a)
        for q, a in zip(examples['Question'], examples['Answer'])
    ]
    tokenized_output = tokenizer(combined_text, padding='max_length', truncation=True, max_length=128)

    # Labels mirror the input ids; the next-token shift is not done here.
    # NOTE(review): padded positions are kept in the labels as well - confirm this is intended.
    tokenized_output['labels'] = tokenized_output['input_ids'].copy()

    return tokenized_output

# Run the tokenizer over the whole dataset, one batch of rows at a time
tokenized_dataset = dataset.map(
    function=tokenize_function,
    batched=True,
)

In [None]:
# Define training arguments for the GPT-2 model
training_args = TrainingArguments(
    output_dir='./results',  # Directory to save model outputs
    num_train_epochs=20,  # Train for 20 epochs
    per_device_train_batch_size=16, # Batch size during training
    per_device_eval_batch_size=32,  # Batch size during evaluation (no eval dataset is passed to the Trainer below)
    warmup_steps=500,  # Warmup steps for learning rate scheduler (the recorded run had 880 steps in total)
    weight_decay=0.01,  # Weight decay for regularization
    logging_dir='./logs',  # Directory for saving logs
    logging_steps=10,  # Log every 10 steps
    save_steps=1000,  # Save model checkpoints every 1000 steps (NOTE(review): above the 880 steps of the recorded run - confirm intended)
)

# Trainer class to handle training process; trains on the full tokenized dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Train the model (updates `model` in place and returns a TrainOutput summary)
trainer.train()

Step,Training Loss
10,5.8918
20,5.4979
30,4.6713
40,3.7515
50,3.016
60,2.6333
70,2.3608
80,2.079
90,2.1456
100,2.1501


TrainOutput(global_step=880, training_loss=1.5622584277933294, metrics={'train_runtime': 525.9662, 'train_samples_per_second': 26.237, 'train_steps_per_second': 1.673, 'total_flos': 901457510400000.0, 'train_loss': 1.5622584277933294, 'epoch': 20.0})

In [None]:
# Persist the fine-tuned model into the 'med_info_model' directory
trainer.save_model(output_dir='med_info_model')

### Testing

In [None]:
# Function to generate a response based on a user prompt (testing the model)
def generate_response(prompt, max_length=150):
    """Generate the model's continuation of `prompt` and return it as text.

    Args:
        prompt: User question to feed to the model.
        max_length: Upper bound passed to `model.generate` as `max_length`
            (defaults to 150, the value previously hard-coded).

    Returns:
        The decoded generation. When the decoded text starts with `prompt`,
        the prompt is cut off and surrounding whitespace is stripped;
        otherwise the decoded text is returned unchanged.
    """
    # Run on whatever device the model lives on instead of assuming a GPU,
    # so the function also works on CPU-only machines.
    device = model.device

    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"].to(device)
    # Pass the attention mask explicitly: pad and EOS share one token id,
    # so the mask cannot be derived from the ids alone.
    attention_mask = encoded["attention_mask"].to(device)

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode the generated output
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Remove the prompt from the response
    if response.startswith(prompt):
        response = response[len(prompt):].strip()

    return response

In [None]:
# Example conversation: send one sample question through generate_response and print the reply
user_input = "what is desonide ointment used for"
bot_response = generate_response(user_input)
print("Bot Response:", bot_response)

Bot Response: desonide ointment is used to treat a variety of conditions it is used to treat allergies and other skin conditions it is also used to treat certain types of infections it is also used to treat skin infections caused by bacteria that are on skin desonide is in a class of medications called antimicrobials it works by killing bacteria that cause skin infections desonide is in a class of medications called antibiotics it works by killing bacteria that cause skin infections


In [None]:
# Copying the model to Google Drive (optional)
# NOTE(review): no Drive mount call is visible in this notebook; the destination
# below presumably requires Drive to be mounted first - confirm.
import shutil

# Path to the file in Colab (weights file inside the directory written by save_model)
# NOTE(review): only this single file is copied, not the rest of 'med_info_model' -
# confirm that is enough to reload the model later.
colab_file_path = '/content/med_info_model/model.safetensors'

# Path to your Google Drive
drive_file_path = '/content/drive/MyDrive'

# Copy the file; shutil.copy returns the path of the newly created file
shutil.copy(colab_file_path, drive_file_path)

'/content/drive/MyDrive/model.safetensors'