Spaces:

JKJanosko
/

Toxicity-Analysis

Runtime error

File size: 15,494 Bytes

27b305f

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "train_data=pd.read_csv(\"data/train.csv\", engine=\"python\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "attributes=[\"toxic\",\"severe_toxic\",\"obscene\",\"threat\",\"insult\",\"identity_hate\"]\n",
    "#train_data[attributes].sum().plot.bar()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torch.utils.data import Dataset\n",
    "import torch\n",
    "\n",
    "class toxicity_dataset(Dataset):\n",
    "    def __init__(self,data_path,tokenizer,attributes,max_token_len= 128,sample = 5000):\n",
    "        self.data_path=data_path\n",
    "        self.tokenizer=tokenizer\n",
    "        self.attributes=attributes\n",
    "        self.max_token_len=max_token_len\n",
    "        self.sample=sample\n",
    "        self._prepare_data()\n",
    "    def _prepare_data(self):\n",
    "        data=pd.read_csv(self.data_path)\n",
    "        if self.sample is not None:\n",
    "            self.data=data.sample(self.sample,random_state=7)\n",
    "        else:\n",
    "            self.data=data\n",
    "    def __len__(self):\n",
    "        return(len(self.data))\n",
    "    def __getitem__(self,index):\n",
    "        item = self.data.iloc[index]\n",
    "        comment = str(item.comment_text)\n",
    "        attributes = torch.FloatTensor(item[self.attributes])\n",
    "        tokens = self.tokenizer.encode_plus(comment,add_special_tokens=True,return_tensors=\"pt\",truncation=True,max_length=self.max_token_len,padding=\"max_length\",return_attention_mask=True)\n",
    "        return{'input_ids':tokens.input_ids.flatten(),\"attention_mask\":tokens.attention_mask.flatten(),\"labels\":attributes}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\jozef\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'input_ids': tensor([    0,   113, 43292,   487,  1073,  6619, 16519,  4261,  1012, 28845,\n",
       "         43292, 50118, 30086,     6,    38,   206,     5,  7729,  6619, 16519,\n",
       "          4261,  1012,  6717,  4867,    11,  2370,    32,    45, 41039,  7140,\n",
       "           250, 48149,    53,   888,    95, 41762,    30, 40823, 34740,  2071,\n",
       "             4,   407,    51,   197,   213,    11, 29617,   101, 25046, 12467,\n",
       "           381, 11742,   646, 19065,  6026,   742,  1195,    87,   634, 14530,\n",
       "           250,     4,  1437,  1437,    22,     2,     1,     1,     1,     1,\n",
       "             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,\n",
       "             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,\n",
       "             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,\n",
       "             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,\n",
       "             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,\n",
       "             1,     1,     1,     1,     1,     1,     1,     1]),\n",
       " 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,\n",
       "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "         0, 0, 0, 0, 0, 0, 0, 0]),\n",
       " 'labels': tensor([0., 0., 0., 0., 0., 0.])}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "model_name=\"roberta-base\"\n",
    "tokenizer=AutoTokenizer.from_pretrained(model_name)\n",
    "toxic_comments_dataset=toxicity_dataset(\"data/train.csv\",tokenizer,attributes)\n",
    "toxic_comments_dataset.__getitem__(0)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pytorch_lightning as pl\n",
    "from torch.utils.data import DataLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Toxcity_Data_Module(pl.LightningDataModule):\n",
    "    def __init__(self,train_path,test_path,attributes,batch_size = 16, max_token_len = 128, model_name=\"roberta-base\"):\n",
    "        super().__init__()\n",
    "        self.train_path=train_path\n",
    "        self.test_path=test_path\n",
    "        self.attributes=attributes\n",
    "        self.batch_size=batch_size\n",
    "        self.max_token_len=max_token_len\n",
    "        self.model_name=model_name\n",
    "        self.tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "    def setup(self, stage = None):\n",
    "        if stage in (None, \"fit\"):\n",
    "            self.train_dataset=toxicity_dataset(self.train_path,self.tokenizer,self.attributes)\n",
    "            self.test_dataset=toxicity_dataset(self.test_path,self.tokenizer,self.attributes, sample=None)\n",
    "        if stage == \"predict\":\n",
    "            self.val_dataset=toxicity_dataset(self.test_path,self.tokenizer,self.attributes)\n",
    "    def train_dataloader(self):\n",
    "        return DataLoader(self.train_dataset,batch_size=self.batch_size,num_workers=4,shuffle=True)\n",
    "    def val_dataloader(self):\n",
    "        return DataLoader(self.test_dataset,batch_size=self.batch_size,num_workers=4,shuffle=False)\n",
    "    def predict_dataloader(self):\n",
    "        return DataLoader(self.test_dataset,batch_size=self.batch_size,num_workers=4,shuffle=False)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<torch.utils.data.dataloader.DataLoader at 0x2241d16e5d0>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "toxicity_data_module=Toxcity_Data_Module(\"data/train.csv\",\"data/test.csv\",attributes)\n",
    "\n",
    "toxicity_data_module.setup()\n",
    "dataloader=toxicity_data_module.train_dataloader()\n",
    "dataloader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoModel, AdamW, get_cosine_schedule_with_warmup\n",
    "import torch.nn as nn\n",
    "import math\n",
    "from torchmetrics.functional.classification import auroc\n",
    "import torch.nn.functional as F"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Toxic_Comment_Classifier(pl.LightningModule):\n",
    "    def __init__(self,config:dict):\n",
    "        super().__init__()\n",
    "        self.config=config\n",
    "        self.pretrained_model = AutoModel.from_pretrained(config['model_name'],return_dict=True)\n",
    "        self.hidden = nn.Linear(self.pretrained_model.config.hidden_size, self.pretrained_model.config.hidden_size)\n",
    "        self.classifier = nn.Linear(self.pretrained_model.config.hidden_size, self.config['n_labels'])\n",
    "        torch.nn.init.xavier_uniform_(self.hidden.weight)\n",
    "        torch.nn.init.xavier_uniform_(self.classifier.weight)\n",
    "        self.loss_func=nn.BCEWithLogitsLoss(reduction='mean')\n",
    "        self.dropout = nn.Dropout()\n",
    "\n",
    "    def forward(self, input_ids,attention_mask,labels=None):\n",
    "        output = self.pretrained_model(input_ids=input_ids,attention_mask=attention_mask)\n",
    "        pooled_output=torch.mean(output.last_hidden_state, 1)\n",
    "        #nn classification\n",
    "        pooled_output=self.hidden(pooled_output)\n",
    "        pooled_output=self.dropout(pooled_output)\n",
    "        pooled_output=F.relu(pooled_output)\n",
    "        logits=self.classifier(pooled_output)\n",
    "        #loss\n",
    "        loss = 0\n",
    "        if labels is not None:\n",
    "            loss = self.loss_func(logits.view(-1,self.config['n_labels']), labels.view(-1,self.config['n_labels']))\n",
    "        return loss, logits\n",
    "    \n",
    "    def training_step(self, batch, batch_index):\n",
    "        loss, logits = self(**batch)\n",
    "        self.log(\"train loss\", loss, prog_bar=True, logger=True)\n",
    "        return{'loss':loss,'predictions':logits,'labels':batch['labels']}\n",
    "    def validation_step(self, batch, batch_index):\n",
    "        loss, logits = self(**batch)\n",
    "        self.log(\"validation loss\", loss, prog_bar=True, logger=True)\n",
    "        return{'val_loss':loss,'predictions':logits,'labels':batch['labels']}\n",
    "    def prediction_step(self, batch, batch_index):\n",
    "        logits = self(**batch)\n",
    "        return logits\n",
    "    \n",
    "    def configure_optimizers(self):\n",
    "        optimizer = AdamW(self.parameters(), lr=self.config['lr'], weight_decay=self.config['w_decay'])\n",
    "        total_steps = self.config['train_size']/self.config['bs']\n",
    "        warmup_steps = math.floor(total_steps*self.config['warmup'])\n",
    "        scheduler = get_cosine_schedule_with_warmup(optimizer,warmup_steps,total_steps)\n",
    "        return [optimizer],[scheduler]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias']\n",
      "- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    }
   ],
   "source": [
    "config = {\n",
    "    'model_name':\"distilroberta-base\",\n",
    "    'n_labels':len(attributes),\n",
    "    'bs':128,\n",
    "    'lr':1.5e-6,\n",
    "    'warmup':0.2,\n",
    "    \"train_size\":len(toxicity_data_module.train_dataloader()),\n",
    "    'w_decay':0.001,\n",
    "    'n_epochs':1\n",
    "}\n",
    "\n",
    "model = Toxic_Comment_Classifier(config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "idx=0\n",
    "input_ids = toxic_comments_dataset.__getitem__(idx)[\"input_ids\"]\n",
    "attention_mask = toxic_comments_dataset.__getitem__(idx)[\"attention_mask\"]\n",
    "labels = toxic_comments_dataset.__getitem__(idx)[\"labels\"]\n",
    "loss,output = model(input_ids.unsqueeze(dim=0), attention_mask.unsqueeze(dim=0),labels.unsqueeze(dim=0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias']\n",
      "- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "GPU available: False, used: False\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "HPU available: False, using: 0 HPUs\n",
      "c:\\Users\\jozef\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pytorch_lightning\\trainer\\connectors\\logger_connector\\logger_connector.py:67: UserWarning: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n",
      "  warning_cache.warn(\n"
     ]
    }
   ],
   "source": [
    "toxicity_data_module=Toxcity_Data_Module(\"data/train.csv\",\"data/test.csv\",attributes,batch_size=config['bs'])\n",
    "toxicity_data_module.setup()\n",
    "model = Toxic_Comment_Classifier(config)\n",
    "\n",
    "trainer = pl.Trainer(max_epochs=config['n_epochs'],num_sanity_val_steps=50)\n",
    "#device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "#print(torch.cuda.get_device_name())\n",
    "#trainer.fit(model,toxicity_data_module)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = torch.load(\"ToxicityClassificationModel.pt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_raw_comments(model, dm):\n",
    "    predictions = trainer.predict(model,datamodule=dm)\n",
    "    flattened_predictions = np.stack([torch.sigmoid(torch.Tensor(p)) for batch in predictions for p in batch])\n",
    "    return flattened_predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions = predict_raw_comments(model=model,dm=toxicity_data_module)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}