{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "train_data=pd.read_csv(\"data/train.csv\", engine=\"python\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "attributes=[\"toxic\",\"severe_toxic\",\"obscene\",\"threat\",\"insult\",\"identity_hate\"]\n", "#train_data[attributes].sum().plot.bar()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from torch.utils.data import Dataset\n", "import torch\n", "\n", "class toxicity_dataset(Dataset):\n", " def __init__(self,data_path,tokenizer,attributes,max_token_len= 128,sample = 5000):\n", " self.data_path=data_path\n", " self.tokenizer=tokenizer\n", " self.attributes=attributes\n", " self.max_token_len=max_token_len\n", " self.sample=sample\n", " self._prepare_data()\n", " def _prepare_data(self):\n", " data=pd.read_csv(self.data_path)\n", " if self.sample is not None:\n", " self.data=data.sample(self.sample,random_state=7)\n", " else:\n", " self.data=data\n", " def __len__(self):\n", " return(len(self.data))\n", " def __getitem__(self,index):\n", " item = self.data.iloc[index]\n", " comment = str(item.comment_text)\n", " attributes = torch.FloatTensor(item[self.attributes])\n", " tokens = self.tokenizer.encode_plus(comment,add_special_tokens=True,return_tensors=\"pt\",truncation=True,max_length=self.max_token_len,padding=\"max_length\",return_attention_mask=True)\n", " return{'input_ids':tokens.input_ids.flatten(),\"attention_mask\":tokens.attention_mask.flatten(),\"labels\":attributes}\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\jozef\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "data": { "text/plain": [ "{'input_ids': tensor([ 0, 113, 43292, 487, 1073, 6619, 16519, 4261, 1012, 28845,\n", " 43292, 50118, 30086, 6, 38, 206, 5, 7729, 6619, 16519,\n", " 4261, 1012, 6717, 4867, 11, 2370, 32, 45, 41039, 7140,\n", " 250, 48149, 53, 888, 95, 41762, 30, 40823, 34740, 2071,\n", " 4, 407, 51, 197, 213, 11, 29617, 101, 25046, 12467,\n", " 381, 11742, 646, 19065, 6026, 742, 1195, 87, 634, 14530,\n", " 250, 4, 1437, 1437, 22, 2, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1]),\n", " 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0]),\n", " 'labels': tensor([0., 0., 0., 0., 0., 0.])}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import AutoTokenizer\n", "model_name=\"roberta-base\"\n", "tokenizer=AutoTokenizer.from_pretrained(model_name)\n", "toxic_comments_dataset=toxicity_dataset(\"data/train.csv\",tokenizer,attributes)\n", "toxic_comments_dataset.__getitem__(0)\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import pytorch_lightning as pl\n", "from torch.utils.data import DataLoader" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "class Toxcity_Data_Module(pl.LightningDataModule):\n", " def __init__(self,train_path,test_path,attributes,batch_size = 16, max_token_len = 128, model_name=\"roberta-base\"):\n", " super().__init__()\n", " self.train_path=train_path\n", " self.test_path=test_path\n", " self.attributes=attributes\n", " self.batch_size=batch_size\n", " self.max_token_len=max_token_len\n", " self.model_name=model_name\n", " self.tokenizer = AutoTokenizer.from_pretrained(model_name)\n", " def setup(self, stage = None):\n", " if stage in (None, \"fit\"):\n", " self.train_dataset=toxicity_dataset(self.train_path,self.tokenizer,self.attributes)\n", " self.test_dataset=toxicity_dataset(self.test_path,self.tokenizer,self.attributes, sample=None)\n", " if stage == \"predict\":\n", " self.val_dataset=toxicity_dataset(self.test_path,self.tokenizer,self.attributes)\n", " def train_dataloader(self):\n", " return DataLoader(self.train_dataset,batch_size=self.batch_size,num_workers=4,shuffle=True)\n", " def val_dataloader(self):\n", " return DataLoader(self.test_dataset,batch_size=self.batch_size,num_workers=4,shuffle=False)\n", " def predict_dataloader(self):\n", " return DataLoader(self.test_dataset,batch_size=self.batch_size,num_workers=4,shuffle=False)\n", " " ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "toxicity_data_module=Toxcity_Data_Module(\"data/train.csv\",\"data/test.csv\",attributes)\n", "\n", "toxicity_data_module.setup()\n", "dataloader=toxicity_data_module.train_dataloader()\n", "dataloader" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoModel, AdamW, get_cosine_schedule_with_warmup\n", "import torch.nn as nn\n", "import math\n", "from torchmetrics.functional.classification import auroc\n", "import torch.nn.functional as F" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "class Toxic_Comment_Classifier(pl.LightningModule):\n", " def __init__(self,config:dict):\n", " super().__init__()\n", " self.config=config\n", " self.pretrained_model = AutoModel.from_pretrained(config['model_name'],return_dict=True)\n", " self.hidden = nn.Linear(self.pretrained_model.config.hidden_size, self.pretrained_model.config.hidden_size)\n", " self.classifier = nn.Linear(self.pretrained_model.config.hidden_size, self.config['n_labels'])\n", " torch.nn.init.xavier_uniform_(self.hidden.weight)\n", " torch.nn.init.xavier_uniform_(self.classifier.weight)\n", " self.loss_func=nn.BCEWithLogitsLoss(reduction='mean')\n", " self.dropout = nn.Dropout()\n", "\n", " def forward(self, input_ids,attention_mask,labels=None):\n", " output = self.pretrained_model(input_ids=input_ids,attention_mask=attention_mask)\n", " pooled_output=torch.mean(output.last_hidden_state, 1)\n", " #nn classification\n", " pooled_output=self.hidden(pooled_output)\n", " pooled_output=self.dropout(pooled_output)\n", " pooled_output=F.relu(pooled_output)\n", " logits=self.classifier(pooled_output)\n", " #loss\n", " loss = 0\n", " if labels is not None:\n", " loss = self.loss_func(logits.view(-1,self.config['n_labels']), labels.view(-1,self.config['n_labels']))\n", " return loss, logits\n", " \n", " def training_step(self, batch, batch_index):\n", " loss, logits = self(**batch)\n", " self.log(\"train loss\", loss, prog_bar=True, logger=True)\n", " return{'loss':loss,'predictions':logits,'labels':batch['labels']}\n", " def validation_step(self, batch, batch_index):\n", " loss, logits = self(**batch)\n", " self.log(\"validation loss\", loss, prog_bar=True, logger=True)\n", " return{'val_loss':loss,'predictions':logits,'labels':batch['labels']}\n", " def prediction_step(self, batch, batch_index):\n", " logits = self(**batch)\n", " return logits\n", " \n", " def configure_optimizers(self):\n", " optimizer = AdamW(self.parameters(), lr=self.config['lr'], weight_decay=self.config['w_decay'])\n", " total_steps = self.config['train_size']/self.config['bs']\n", " warmup_steps = math.floor(total_steps*self.config['warmup'])\n", " scheduler = get_cosine_schedule_with_warmup(optimizer,warmup_steps,total_steps)\n", " return [optimizer],[scheduler]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias']\n", "- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" ] } ], "source": [ "config = {\n", " 'model_name':\"distilroberta-base\",\n", " 'n_labels':len(attributes),\n", " 'bs':128,\n", " 'lr':1.5e-6,\n", " 'warmup':0.2,\n", " \"train_size\":len(toxicity_data_module.train_dataloader()),\n", " 'w_decay':0.001,\n", " 'n_epochs':1\n", "}\n", "\n", "model = Toxic_Comment_Classifier(config)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "idx=0\n", "input_ids = toxic_comments_dataset.__getitem__(idx)[\"input_ids\"]\n", "attention_mask = toxic_comments_dataset.__getitem__(idx)[\"attention_mask\"]\n", "labels = toxic_comments_dataset.__getitem__(idx)[\"labels\"]\n", "loss,output = model(input_ids.unsqueeze(dim=0), attention_mask.unsqueeze(dim=0),labels.unsqueeze(dim=0))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias']\n", "- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "GPU available: False, used: False\n", "TPU available: False, using: 0 TPU cores\n", "IPU available: False, using: 0 IPUs\n", "HPU available: False, using: 0 HPUs\n", "c:\\Users\\jozef\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pytorch_lightning\\trainer\\connectors\\logger_connector\\logger_connector.py:67: UserWarning: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n", " warning_cache.warn(\n" ] } ], "source": [ "toxicity_data_module=Toxcity_Data_Module(\"data/train.csv\",\"data/test.csv\",attributes,batch_size=config['bs'])\n", "toxicity_data_module.setup()\n", "model = Toxic_Comment_Classifier(config)\n", "\n", "trainer = pl.Trainer(max_epochs=config['n_epochs'],num_sanity_val_steps=50)\n", "#device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "#print(torch.cuda.get_device_name())\n", "#trainer.fit(model,toxicity_data_module)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "model = torch.load(\"ToxicityClassificationModel.pt\")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def predict_raw_comments(model, dm):\n", " predictions = trainer.predict(model,datamodule=dm)\n", " flattened_predictions = np.stack([torch.sigmoid(torch.Tensor(p)) for batch in predictions for p in batch])\n", " return flattened_predictions" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "predictions = predict_raw_comments(model=model,dm=toxicity_data_module)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.3" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }