Spaces:

Md-Hakim
/

text-summarization

Sleeping

File size: 11,719 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import  os\n",
    "os.chdir('../')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataclasses import dataclass\n",
    "from pathlib import Path\n",
    "@dataclass(frozen=True)\n",
    "class ModelEvaluationConfig:\n",
    "    root_dir : Path\n",
    "    data_path : Path\n",
    "    model_path : Path\n",
    "    all_params: dict\n",
    "    tokenizer_path : Path\n",
    "    metric_file_name : Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from textsummarizer.constants import *\n",
    "from textsummarizer.utils.common import read_yaml, create_directories, save_json, load_json\n",
    "\n",
    "class ConfigurationManager:\n",
    "    def __init__(\n",
    "        self,\n",
    "        config_filepath = CONFIG_FILE_PATH,\n",
    "        params_filepath = PARAMS_FILE_PATH):\n",
    "\n",
    "        self.config = read_yaml(config_filepath)\n",
    "        self.params = read_yaml(params_filepath)\n",
    "\n",
    "        create_directories([self.config.artifacts_root])\n",
    "\n",
    "\n",
    "    \n",
    "    def get_model_evaluation_config(self) -> ModelEvaluationConfig:\n",
    "        config = self.config.model_evaluation\n",
    "        params = self.params.TrainingArguments\n",
    "\n",
    "        create_directories([config.root_dir])\n",
    "\n",
    "        model_evaluation_config = ModelEvaluationConfig(\n",
    "            root_dir=config.root_dir,\n",
    "            data_path=config.data_path,\n",
    "            model_path = config.model_path,\n",
    "            tokenizer_path = config.tokenizer_path,\n",
    "            metric_file_name = config.metric_file_name,\n",
    "            all_params = params\n",
    "           \n",
    "        )\n",
    "\n",
    "        return model_evaluation_config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-08-11 20:23:00,587: INFO: config: PyTorch version 2.2.2+cu121 available.]\n",
      "[2024-08-11 20:23:00,589: INFO: config: TensorFlow version 2.12.0 available.]\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
    "from datasets import load_dataset, load_from_disk, load_metric\n",
    "import torch\n",
    "import pandas as pd\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import mlflow\n",
    "import dagshub\n",
    "import json\n",
    "\n",
    "class ModelEvaluation:\n",
    "    def __init__(self, config: ModelEvaluationConfig):\n",
    "        self.config = config\n",
    "\n",
    "    def generate_batch_sized_chunks(self, list_of_elements, batch_size):\n",
    "        \"\"\"split the dataset into smaller batches that we can process simultaneously\n",
    "        Yield successive batch-sized chunks from list_of_elements.\"\"\"\n",
    "        for i in range(0, len(list_of_elements), batch_size):\n",
    "            yield list_of_elements[i : i + batch_size]\n",
    "\n",
    "    def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer, \n",
    "                               batch_size=16, device=\"cuda\" if torch.cuda.is_available() else \"cpu\", \n",
    "                               column_text=\"article\", \n",
    "                               column_summary=\"highlights\"):\n",
    "        article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))\n",
    "        target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))\n",
    "\n",
    "        for article_batch, target_batch in tqdm(\n",
    "            zip(article_batches, target_batches), total=len(article_batches)):\n",
    "            \n",
    "            inputs = tokenizer(article_batch, max_length=1024,  truncation=True, \n",
    "                            padding=\"max_length\", return_tensors=\"pt\")\n",
    "            \n",
    "            summaries = model.generate(input_ids=inputs[\"input_ids\"].to(device),\n",
    "                            attention_mask=inputs[\"attention_mask\"].to(device), \n",
    "                            length_penalty=0.8, num_beams=8, max_length=128)\n",
    "            \n",
    "            decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, \n",
    "                                    clean_up_tokenization_spaces=True) \n",
    "                for s in summaries]      \n",
    "            \n",
    "            decoded_summaries = [d.replace(\"\", \" \") for d in decoded_summaries]\n",
    "            \n",
    "            metric.add_batch(predictions=decoded_summaries, references=target_batch)\n",
    "            \n",
    "        score = metric.compute()\n",
    "        return score\n",
    "\n",
    "    def evaluate(self):\n",
    "        # Set up MLflow tracking\n",
    "        dagshub.init(repo_owner='azizulhakim8291', repo_name='text-summarization', mlflow=True)\n",
    "        mlflow.set_tracking_uri(\"https://dagshub.com/azizulhakim8291/text-summarization.mlflow\")\n",
    "        mlflow.set_experiment(\"text-summarization-evaluation\")\n",
    "\n",
    "        device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)\n",
    "        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)\n",
    "       \n",
    "        dataset_samsum_pt = load_from_disk(self.config.data_path)\n",
    "\n",
    "        rouge_names = [\"rouge1\", \"rouge2\", \"rougeL\", \"rougeLsum\"]\n",
    "        rouge_metric = load_metric('rouge')\n",
    "\n",
    "        with mlflow.start_run():\n",
    "            mlflow.log_param(\"model_name\", \"pegasus\")\n",
    "            mlflow.log_param(\"dataset\", \"samsum\")\n",
    "            mlflow.log_param('parameter name', 'value')\n",
    "\n",
    "            score = self.calculate_metric_on_test_ds(\n",
    "                dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, \n",
    "                batch_size = 2, column_text = 'dialogue', column_summary= 'summary'\n",
    "            )\n",
    "\n",
    "            rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)\n",
    "            mlflow.log_params(self.config.all_params)\n",
    "\n",
    "            # Log metrics to MLflow\n",
    "            for rouge_name, rouge_score in rouge_dict.items():\n",
    "                mlflow.log_metric(rouge_name, rouge_score)\n",
    "\n",
    "            # Save results as JSON\n",
    "            with open(self.config.metric_file_name, 'w') as f:\n",
    "                json.dump(rouge_dict, f, indent=4)\n",
    "\n",
    "            # Log the JSON file as an artifact\n",
    "            mlflow.log_artifact(self.config.metric_file_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-08-11 22:39:28,983: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
      "[2024-08-11 22:39:28,986: INFO: common: yaml file: params.yaml loaded successfully]\n",
      "[2024-08-11 22:39:28,989: INFO: common: created directory at: artifacts]\n",
      "[2024-08-11 22:39:28,992: INFO: common: created directory at: artifacts/model_evaluation]\n",
      "[2024-08-11 22:39:29,723: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/repos/azizulhakim8291/text-summarization \"HTTP/1.1 200 OK\"]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Initialized MLflow to track repo <span style=\"color: #008000; text-decoration-color: #008000\">\"azizulhakim8291/text-summarization\"</span>\n",
       "</pre>\n"
      ],
      "text/plain": [
       "Initialized MLflow to track repo \u001b[32m\"azizulhakim8291/text-summarization\"\u001b[0m\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-08-11 22:39:29,731: INFO: helpers: Initialized MLflow to track repo \"azizulhakim8291/text-summarization\"]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Repository azizulhakim8291/text-summarization initialized!\n",
       "</pre>\n"
      ],
      "text/plain": [
       "Repository azizulhakim8291/text-summarization initialized!\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-08-11 22:39:29,735: INFO: helpers: Repository azizulhakim8291/text-summarization initialized!]\n",
      "[2024-08-11 22:39:29,802: WARNING: connectionpool: Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'RemoteDisconnected('Remote end closed connection without response')': /azizulhakim8291/text-summarization.mlflow/api/2.0/mlflow/experiments/get-by-name?experiment_name=text-summarization-evaluation]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\datasets\\load.py:756: FutureWarning: The repository for rouge contains custom code which must be executed to correctly load the metric. You can inspect the repository content at https://raw.githubusercontent.com/huggingface/datasets/2.18.0/metrics/rouge/rouge.py\n",
      "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
      "Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.\n",
      "  warnings.warn(\n",
      "100%|██████████| 5/5 [00:17<00:00,  3.48s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-08-11 22:39:59,553: INFO: rouge_scorer: Using default tokenizer.]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "try:\n",
    "    config = ConfigurationManager()\n",
    "    model_evaluation_config = config.get_model_evaluation_config()\n",
    "    model_evaluation_config = ModelEvaluation(config=model_evaluation_config)\n",
    "    model_evaluation_config.evaluate()\n",
    "except Exception as e:\n",
    "    raise e"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}