In [1]:
import pandas as pd
import numpy as np

# Read the training CSV into a DataFrame, parsed with pandas' pure-Python engine.
train_data = pd.read_csv("data/train.csv", engine="python")

In [2]:
# Names of the label columns read from the CSV files and predicted by the model.
attributes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
# Per-label count plot, left disabled:
# train_data[attributes].sum().plot.bar()

In [3]:
from torch.utils.data import Dataset
import torch

class toxicity_dataset(Dataset):
    """Map-style dataset yielding one tokenised comment and its label vector.

    Args:
        data_path: Path of a CSV file read with ``pandas.read_csv``. Rows must
            expose a ``comment_text`` field and every column in ``attributes``.
        tokenizer: Object providing ``encode_plus`` whose result exposes
            ``input_ids`` and ``attention_mask`` tensors.
        attributes: List of label column names.
        max_token_len: Length every encoded comment is truncated/padded to.
        sample: Number of rows drawn without replacement (``random_state=7``).
            ``None`` keeps every row. A value larger than the file is clamped
            to the number of rows available instead of raising.
    """

    def __init__(self, data_path, tokenizer, attributes, max_token_len=128, sample=5000):
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.attributes = attributes
        self.max_token_len = max_token_len
        self.sample = sample
        self._prepare_data()

    def _prepare_data(self):
        """Load the CSV and optionally keep a reproducible random subset."""
        data = pd.read_csv(self.data_path)
        if self.sample is not None:
            # DataFrame.sample raises when asked for more rows than exist, so
            # never request more than the file contains.
            n_rows = min(self.sample, len(data))
            self.data = data.sample(n_rows, random_state=7)
        else:
            self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one row."""
        item = self.data.iloc[index]
        comment = str(item.comment_text)
        # A row taken from a mixed-type frame is an object-dtype Series; convert
        # the label cells to float32 explicitly before handing them to torch.
        labels = torch.tensor(item[self.attributes].to_numpy(dtype="float32"))
        tokens = self.tokenizer.encode_plus(
            comment,
            add_special_tokens=True,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_token_len,
            padding="max_length",
            return_attention_mask=True,
        )
        return {
            'input_ids': tokens.input_ids.flatten(),
            "attention_mask": tokens.attention_mask.flatten(),
            "labels": labels,
        }


In [4]:
from transformers import AutoTokenizer
# Smoke test: tokenise with the roberta-base vocabulary and show the first item.
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
toxic_comments_dataset = toxicity_dataset(
    "data/train.csv",
    tokenizer,
    attributes,
)
toxic_comments_dataset[0]


 from .autonotebook import tqdm as notebook_tqdm


{'input_ids': tensor([ 0, 113, 43292, 487, 1073, 6619, 16519, 4261, 1012, 28845,
 43292, 50118, 30086, 6, 38, 206, 5, 7729, 6619, 16519,
 4261, 1012, 6717, 4867, 11, 2370, 32, 45, 41039, 7140,
 250, 48149, 53, 888, 95, 41762, 30, 40823, 34740, 2071,
 4, 407, 51, 197, 213, 11, 29617, 101, 25046, 12467,
 381, 11742, 646, 19065, 6026, 742, 1195, 87, 634, 14530,
 250, 4, 1437, 1437, 22, 2, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0]),

In [5]:
import pytorch_lightning as pl
from torch.utils.data import DataLoader

In [6]:
class Toxcity_Data_Module(pl.LightningDataModule):
    """Lightning data module wrapping the train and test CSV files.

    Args:
        train_path: CSV used for the training dataset (sampled with the
            dataset's default ``sample`` size).
        test_path: CSV used, unsampled, for validation and prediction.
        attributes: List of label column names forwarded to the datasets.
        batch_size: Batch size of every dataloader.
        max_token_len: Token length forwarded to every dataset.
        model_name: Checkpoint name passed to ``AutoTokenizer.from_pretrained``.
        num_workers: Worker processes used by every dataloader.
    """

    def __init__(self, train_path, test_path, attributes, batch_size=16,
                 max_token_len=128, model_name="roberta-base", num_workers=4):
        super().__init__()
        self.train_path = train_path
        self.test_path = test_path
        self.attributes = attributes
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        self.model_name = model_name
        self.num_workers = num_workers
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def _build_dataset(self, path, **dataset_kwargs):
        """Create a dataset sharing this module's tokenizer, labels and length."""
        return toxicity_dataset(
            path,
            self.tokenizer,
            self.attributes,
            max_token_len=self.max_token_len,
            **dataset_kwargs,
        )

    def _build_loader(self, dataset, shuffle):
        """Create a dataloader with this module's batch size and worker count."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=shuffle,
        )

    def setup(self, stage=None):
        """Build the datasets required by ``stage`` (``None`` builds train + test)."""
        if stage in (None, "fit"):
            self.train_dataset = self._build_dataset(self.train_path)
        if stage in (None, "fit", "validate"):
            self.test_dataset = self._build_dataset(self.test_path, sample=None)
        if stage == "predict":
            # predict_dataloader reads test_dataset, so it must exist even when
            # setup("predict") is the first setup call on this module.
            if getattr(self, "test_dataset", None) is None:
                self.test_dataset = self._build_dataset(self.test_path, sample=None)
            # Kept for backward compatibility; no dataloader in this class reads it.
            self.val_dataset = self._build_dataset(self.test_path)

    def train_dataloader(self):
        return self._build_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        return self._build_loader(self.test_dataset, shuffle=False)

    def predict_dataloader(self):
        return self._build_loader(self.test_dataset, shuffle=False)
 

In [7]:
# Build the data module with its default batch size and inspect the train loader.
toxicity_data_module = Toxcity_Data_Module(
    "data/train.csv",
    "data/test.csv",
    attributes,
)
toxicity_data_module.setup()

dataloader = toxicity_data_module.train_dataloader()
dataloader



In [8]:
from transformers import AutoModel, AdamW, get_cosine_schedule_with_warmup
import torch.nn as nn
import math
from torchmetrics.functional.classification import auroc
import torch.nn.functional as F

In [9]:
class Toxic_Comment_Classifier(pl.LightningModule):
    """Multi-label comment classifier: pretrained encoder plus a two-layer head.

    ``config`` keys read by this class: ``model_name``, ``n_labels``, ``lr``,
    ``w_decay``, ``warmup`` and, only as a fallback for the schedule length,
    ``train_size``, ``bs`` and the optional ``n_epochs``.
    """

    def __init__(self, config: dict):
        super().__init__()
        self.config = config
        self.pretrained_model = AutoModel.from_pretrained(config['model_name'], return_dict=True)
        hidden_size = self.pretrained_model.config.hidden_size
        self.hidden = nn.Linear(hidden_size, hidden_size)
        self.classifier = nn.Linear(hidden_size, self.config['n_labels'])
        torch.nn.init.xavier_uniform_(self.hidden.weight)
        torch.nn.init.xavier_uniform_(self.classifier.weight)
        self.loss_func = nn.BCEWithLogitsLoss(reduction='mean')
        self.dropout = nn.Dropout()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``(loss, logits)``; ``loss`` is ``0`` when no labels are given."""
        output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
        # Average the encoder states over dimension 1; attention_mask is not
        # used to weight this average.
        pooled_output = torch.mean(output.last_hidden_state, 1)
        # Classification head.
        pooled_output = self.hidden(pooled_output)
        pooled_output = self.dropout(pooled_output)
        pooled_output = F.relu(pooled_output)
        logits = self.classifier(pooled_output)
        # Loss.
        loss = 0
        if labels is not None:
            n_labels = self.config['n_labels']
            loss = self.loss_func(logits.view(-1, n_labels), labels.view(-1, n_labels))
        return loss, logits

    def training_step(self, batch, batch_index):
        loss, logits = self(**batch)
        self.log("train loss", loss, prog_bar=True, logger=True)
        return {'loss': loss, 'predictions': logits, 'labels': batch['labels']}

    def validation_step(self, batch, batch_index):
        loss, logits = self(**batch)
        self.log("validation loss", loss, prog_bar=True, logger=True)
        return {'val_loss': loss, 'predictions': logits, 'labels': batch['labels']}

    def predict_step(self, batch, batch_index, dataloader_idx=0):
        """Lightning prediction hook: return only the logits for ``batch``.

        ``Trainer.predict`` looks this hook up under the name ``predict_step``;
        without it the default implementation calls ``forward`` with the whole
        batch dict as its single positional argument.
        """
        _, logits = self(**batch)
        return logits

    def prediction_step(self, batch, batch_index):
        """Legacy helper kept unchanged; returns the ``(loss, logits)`` tuple."""
        logits = self(**batch)
        return logits

    def _total_training_steps(self):
        """Number of optimizer steps the LR schedule should span."""
        try:
            estimated = self.trainer.estimated_stepping_batches
        except (RuntimeError, AttributeError):
            # No trainer attached, or a Lightning release without the property.
            estimated = None
        if estimated is not None and math.isfinite(estimated) and estimated > 0:
            return int(estimated)
        # Fallback: derive the count from the config values.
        steps_per_epoch = math.ceil(self.config['train_size'] / self.config['bs'])
        return max(1, steps_per_epoch * self.config.get('n_epochs', 1))

    def configure_optimizers(self):
        """Create AdamW plus a warmup/cosine schedule advanced every optimizer step."""
        optimizer = AdamW(self.parameters(), lr=self.config['lr'], weight_decay=self.config['w_decay'])
        total_steps = self._total_training_steps()
        warmup_steps = math.floor(total_steps * self.config['warmup'])
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
        # The schedule is expressed in optimizer steps, so ask Lightning to
        # advance it per step rather than with its default per-epoch interval.
        return [optimizer], [{"scheduler": scheduler, "interval": "step", "frequency": 1}]

In [10]:
# Hyper-parameters consumed by Toxic_Comment_Classifier.
# NOTE(review): "train_size" is the number of batches yielded by the data module
# built above with its default batch size, not a row count, and "bs" differs from
# that batch size. Confirm this matches what configure_optimizers expects.
config = dict(
    model_name="distilroberta-base",
    n_labels=len(attributes),
    bs=128,
    lr=1.5e-6,
    warmup=0.2,
    train_size=len(toxicity_data_module.train_dataloader()),
    w_decay=0.001,
    n_epochs=1,
)

model = Toxic_Comment_Classifier(config)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Run one dataset item through the model as a batch of size 1.
idx = 0
input_ids, attention_mask, labels = (
    toxic_comments_dataset[idx][field]
    for field in ("input_ids", "attention_mask", "labels")
)
loss, output = model(
    input_ids.unsqueeze(dim=0),
    attention_mask.unsqueeze(dim=0),
    labels.unsqueeze(dim=0),
)

In [12]:
# Rebuild the data module so its loaders batch with config['bs'], then create a
# fresh classifier and the trainer.
toxicity_data_module = Toxcity_Data_Module(
    "data/train.csv",
    "data/test.csv",
    attributes,
    batch_size=config["bs"],
)
toxicity_data_module.setup()
model = Toxic_Comment_Classifier(config)

trainer = pl.Trainer(
    max_epochs=config["n_epochs"],
    num_sanity_val_steps=50,
)
# Device probe and training call are left commented out:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(torch.cuda.get_device_name())
# trainer.fit(model, toxicity_data_module)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [13]:
# Rebind `model` to the object stored in ToxicityClassificationModel.pt, replacing
# the classifier instantiated earlier in the notebook.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a trusted source.
model = torch.load("ToxicityClassificationModel.pt")

In [14]:
def predict_raw_comments(model, dm, predict_trainer=None):
    """Run prediction and return per-label probabilities as a NumPy array.

    Args:
        model: Module handed to the trainer's ``predict`` call.
        dm: Data module passed as ``datamodule=``.
        predict_trainer: Trainer to use. Defaults to the notebook-level
            ``trainer`` so existing two-argument calls keep working.

    Returns:
        A float32 array with one row per predicted item, holding the sigmoid
        of the logits.
    """
    runner = trainer if predict_trainer is None else predict_trainer
    batches = runner.predict(model, datamodule=dm)
    # Accept either a logits tensor per batch or a (loss, logits) tuple.
    logits = [
        batch[-1] if isinstance(batch, (tuple, list)) else batch
        for batch in batches
    ]
    # One concatenation and one sigmoid instead of a per-row Python loop; move
    # to CPU before converting so tensors on other devices convert too.
    probabilities = torch.sigmoid(torch.cat(logits, dim=0).float())
    return probabilities.detach().cpu().numpy()

In [15]:
# Probabilities for every item served by the data module's predict loader.
predictions = predict_raw_comments(
    model=model,
    dm=toxicity_data_module,
)