{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "os.chdir('../')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "from dataclasses import dataclass\n", "from pathlib import Path\n", "@dataclass(frozen=True)\n", "class ModelEvaluationConfig:\n", " root_dir : Path\n", " data_path : Path\n", " model_path : Path\n", " all_params: dict\n", " tokenizer_path : Path\n", " metric_file_name : Path" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from textsummarizer.constants import *\n", "from textsummarizer.utils.common import read_yaml, create_directories, save_json, load_json\n", "\n", "class ConfigurationManager:\n", " def __init__(\n", " self,\n", " config_filepath = CONFIG_FILE_PATH,\n", " params_filepath = PARAMS_FILE_PATH):\n", "\n", " self.config = read_yaml(config_filepath)\n", " self.params = read_yaml(params_filepath)\n", "\n", " create_directories([self.config.artifacts_root])\n", "\n", "\n", " \n", " def get_model_evaluation_config(self) -> ModelEvaluationConfig:\n", " config = self.config.model_evaluation\n", " params = self.params.TrainingArguments\n", "\n", " create_directories([config.root_dir])\n", "\n", " model_evaluation_config = ModelEvaluationConfig(\n", " root_dir=config.root_dir,\n", " data_path=config.data_path,\n", " model_path = config.model_path,\n", " tokenizer_path = config.tokenizer_path,\n", " metric_file_name = config.metric_file_name,\n", " all_params = params\n", " \n", " )\n", "\n", " return model_evaluation_config" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-08-11 20:23:00,587: INFO: config: PyTorch version 2.2.2+cu121 available.]\n", "[2024-08-11 20:23:00,589: INFO: config: TensorFlow version 2.12.0 available.]\n" ] } ], "source": [ "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n", "from datasets import load_dataset, load_from_disk, load_metric\n", "import torch\n", "import pandas as pd\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import mlflow\n", "import dagshub\n", "import json\n", "\n", "class ModelEvaluation:\n", " def __init__(self, config: ModelEvaluationConfig):\n", " self.config = config\n", "\n", " def generate_batch_sized_chunks(self, list_of_elements, batch_size):\n", " \"\"\"split the dataset into smaller batches that we can process simultaneously\n", " Yield successive batch-sized chunks from list_of_elements.\"\"\"\n", " for i in range(0, len(list_of_elements), batch_size):\n", " yield list_of_elements[i : i + batch_size]\n", "\n", " def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer, \n", " batch_size=16, device=\"cuda\" if torch.cuda.is_available() else \"cpu\", \n", " column_text=\"article\", \n", " column_summary=\"highlights\"):\n", " article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))\n", " target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))\n", "\n", " for article_batch, target_batch in tqdm(\n", " zip(article_batches, target_batches), total=len(article_batches)):\n", " \n", " inputs = tokenizer(article_batch, max_length=1024, truncation=True, \n", " padding=\"max_length\", return_tensors=\"pt\")\n", " \n", " summaries = model.generate(input_ids=inputs[\"input_ids\"].to(device),\n", " 
attention_mask=inputs[\"attention_mask\"].to(device), \n", "                             length_penalty=0.8, num_beams=8, max_length=128)\n", "            \n", "            decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, \n", "                                                  clean_up_tokenization_spaces=True) \n", "                                 for s in summaries]\n", "            \n", "            # Pegasus marks line breaks with the \"<n>\" token; replace it with a space before scoring\n", "            decoded_summaries = [d.replace(\"<n>\", \" \") for d in decoded_summaries]\n", "            \n", "            metric.add_batch(predictions=decoded_summaries, references=target_batch)\n", "            \n", "        # Compute the aggregated ROUGE scores over all batches\n", "        score = metric.compute()\n", "        return score\n", "\n", "    def evaluate(self):\n", "        # Set up MLflow tracking on DagsHub\n", "        dagshub.init(repo_owner='azizulhakim8291', repo_name='text-summarization', mlflow=True)\n", "        mlflow.set_tracking_uri(\"https://dagshub.com/azizulhakim8291/text-summarization.mlflow\")\n", "        mlflow.set_experiment(\"text-summarization-evaluation\")\n", "\n", "        # Load the fine-tuned model, its tokenizer and the tokenised SAMSum dataset\n", "        device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)\n", "        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)\n", "\n", "        dataset_samsum_pt = load_from_disk(self.config.data_path)\n", "\n", "        rouge_names = [\"rouge1\", \"rouge2\", \"rougeL\", \"rougeLsum\"]\n", "        rouge_metric = load_metric('rouge')\n", "\n", "        with mlflow.start_run():\n", "            mlflow.log_param(\"model_name\", \"pegasus\")\n", "            mlflow.log_param(\"dataset\", \"samsum\")\n", "            mlflow.log_param('parameter name', 'value')\n", "\n", "            # Score a small slice of the test split to keep evaluation fast\n", "            score = self.calculate_metric_on_test_ds(\n", "                dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer,\n", "                batch_size=2, column_text='dialogue', column_summary='summary'\n", "            )\n", "\n", "            rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)\n", "            mlflow.log_params(self.config.all_params)\n", "\n", "            # Log metrics to MLflow\n", "            for rouge_name, rouge_score in rouge_dict.items():\n", "                mlflow.log_metric(rouge_name, rouge_score)\n", "\n", "            # Save results as JSON\n", "            with open(self.config.metric_file_name, 'w') as f:\n", "                json.dump(rouge_dict, f, indent=4)\n", "\n", "            # Log the JSON file as an artifact\n", "            mlflow.log_artifact(self.config.metric_file_name)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-08-11 22:39:28,983: INFO: common: yaml file: config\\config.yaml loaded successfully]\n", "[2024-08-11 22:39:28,986: INFO: common: yaml file: params.yaml loaded successfully]\n", "[2024-08-11 22:39:28,989: INFO: common: created directory at: artifacts]\n", "[2024-08-11 22:39:28,992: INFO: common: created directory at: artifacts/model_evaluation]\n", "[2024-08-11 22:39:29,723: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/repos/azizulhakim8291/text-summarization \"HTTP/1.1 200 OK\"]\n" ] }, { "data": { "text/html": [ "
Initialized MLflow to track repo \"azizulhakim8291/text-summarization\"\n" ], "text/plain": [ "Initialized MLflow to track repo \u001b[32m\"azizulhakim8291/text-summarization\"\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[2024-08-11 22:39:29,731: INFO: helpers: Initialized MLflow to track repo \"azizulhakim8291/text-summarization\"]\n" ] }, { "data": { "text/html": [ "
Repository azizulhakim8291/text-summarization initialized!\n" ], "text/plain": [ "Repository azizulhakim8291/text-summarization initialized!\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[2024-08-11 22:39:29,735: INFO: helpers: Repository azizulhakim8291/text-summarization initialized!]\n", "[2024-08-11 22:39:29,802: WARNING: connectionpool: Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'RemoteDisconnected('Remote end closed connection without response')': /azizulhakim8291/text-summarization.mlflow/api/2.0/mlflow/experiments/get-by-name?experiment_name=text-summarization-evaluation]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\datasets\\load.py:756: FutureWarning: The repository for rouge contains custom code which must be executed to correctly load the metric. You can inspect the repository content at https://raw.githubusercontent.com/huggingface/datasets/2.18.0/metrics/rouge/rouge.py\n", "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n", "Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.\n", " warnings.warn(\n", "100%|██████████| 5/5 [00:17<00:00, 3.48s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[2024-08-11 22:39:59,553: INFO: rouge_scorer: Using default tokenizer.]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "try:\n", "    config = ConfigurationManager()\n", "    model_evaluation_config = config.get_model_evaluation_config()\n", "    model_evaluation = ModelEvaluation(config=model_evaluation_config)\n", "    model_evaluation.evaluate()\n", "except Exception as e:\n", "    raise e" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" } }, "nbformat": 4, "nbformat_minor": 2 }