hakim committed
Commit · f68f6ad
1 Parent(s): cb7cafb

notebook added

Files changed:
- research/data_transformation.ipynb +104 -16
- research/data_validation.ipynb +110 -3
research/data_transformation.ipynb
CHANGED

@@ -107,7 +107,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -118,7 +118,7 @@
    " \n",
    " \n",
    "    def convert_examples_to_features(self, example_batch):\n",
-   "        input_encoding = self.tokenizer(example_batch['dialogue'],
+   "        input_encoding = self.tokenizer(example_batch['dialogue'], max_length = 1024, truncation = True)\n",
    " \n",
    "        with self.tokenizer.as_target_tokenizer():\n",
    "            target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True )\n",
@@ -137,30 +137,118 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[2024-08-11 18:13:
-      "[2024-08-11 18:13:
-      "[2024-08-11 18:13:
-      "[2024-08-11 18:13:
+      "[2024-08-11 18:13:44,678: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
+      "[2024-08-11 18:13:44,681: INFO: common: yaml file: params.yaml loaded successfully]\n",
+      "[2024-08-11 18:13:44,684: INFO: common: created directory at: artifacts]\n",
+      "[2024-08-11 18:13:44,686: INFO: common: created directory at: artifacts/data_transformation]\n"
      ]
     },
     {
-      "
-      "
-      "
-
-      "\
-      "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
-      "Cell \u001b[1;32mIn[7], line 7\u001b[0m\n\u001b[0;32m 5\u001b[0m data_transformation\u001b[38;5;241m.\u001b[39mconvert()\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n",
-      "Cell \u001b[1;32mIn[7], line 4\u001b[0m\n\u001b[0;32m 2\u001b[0m config \u001b[38;5;241m=\u001b[39m ConfigurationManager()\n\u001b[0;32m 3\u001b[0m data_transformation_config \u001b[38;5;241m=\u001b[39m config\u001b[38;5;241m.\u001b[39mget_data_transformation_config()\n\u001b[1;32m----> 4\u001b[0m data_transformation \u001b[38;5;241m=\u001b[39m \u001b[43mDataTransformation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_transformation_config\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 5\u001b[0m data_transformation\u001b[38;5;241m.\u001b[39mconvert()\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n",
-      "\u001b[1;31mTypeError\u001b[0m: DataTransformation() takes no arguments"
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "c:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
+       " warnings.warn(\n"
      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "bdedbcfbff63497081e37ad9b20a6c31",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Map: 0%| | 0/14732 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "c:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:4126: UserWarning: `as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your labels by using the argument `text_target` of the regular `__call__` method (either in the same call as your input texts if you use the same keyword arguments, or in a separate call.\n",
+       " warnings.warn(\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "3b8826d099004000a2a037e32bbdf1cc",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Map: 0%| | 0/819 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "4a5e1728a7d142d3b767f7b9c8f14c6f",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Map: 0%| | 0/818 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "54194e4ec3de42738a2107fa26673aef",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Saving the dataset (0/1 shards): 0%| | 0/14732 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "0209e20f794e4e3ab60ef282b98b8bb3",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Saving the dataset (0/1 shards): 0%| | 0/819 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "0a89fb4c0a96413782a55206d087a2a6",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Saving the dataset (0/1 shards): 0%| | 0/818 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
     }
    ],
    "source": [
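Note on the new outputs above: the captured stderr warns that `as_target_tokenizer` is deprecated in favor of the tokenizer's `text_target` argument, the `Map` and `Saving the dataset` progress bars correspond to the 14732-, 819- and 818-example splits, and the removed output shows the previous run failing with `TypeError: DataTransformation() takes no arguments`, which typically points to a missing or mis-indented `__init__`. The sketch below is not the committed code; it shows the same transformation as a standalone function that drops the deprecated context manager, assuming `example_batch` carries the 'dialogue' and 'summary' columns used in the notebook.

from transformers import AutoTokenizer

def convert_examples_to_features(example_batch, tokenizer):
    # Tokenize the source dialogues with the same 1024-token cap as the committed cell.
    input_encodings = tokenizer(
        example_batch["dialogue"], max_length=1024, truncation=True
    )

    # Tokenize the reference summaries in a separate call because they use a
    # different max_length (128); text_target replaces the deprecated
    # as_target_tokenizer() context manager flagged in the stderr output above.
    target_encodings = tokenizer(
        text_target=example_batch["summary"], max_length=128, truncation=True
    )

    # Assumed return shape for the datasets .map() call: model inputs plus labels.
    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"],
    }

# Hypothetical usage (the checkpoint name is an assumption, not read from params.yaml):
# tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
# dataset = dataset.map(lambda batch: convert_examples_to_features(batch, tokenizer), batched=True)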
research/data_validation.ipynb
CHANGED

@@ -32,7 +32,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -41,9 +41,116 @@
    "@dataclass(frozen=True)\n",
    "class DataValidationConfig:\n",
    "    root_dir : Path\n",
-   "    STATUS_FILE :
-   "    ALL_REQUIRED_FILES : "
+   "    STATUS_FILE : str\n",
+   "    ALL_REQUIRED_FILES : list"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 5,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "from textsummarizer.constants import *\n",
+   "from textsummarizer.utils.common import read_yaml, create_directories\n",
+   "\n",
+   "class ConfigurationManager:\n",
+   "    def __init__(\n",
+   "        self,\n",
+   "        config_filepath = CONFIG_FILE_PATH,\n",
+   "        params_filepath = PARAMS_FILE_PATH):\n",
+   "\n",
+   "        self.config = read_yaml(config_filepath)\n",
+   "        self.params = read_yaml(params_filepath)\n",
+   "\n",
+   "        create_directories([self.config.artifacts_root])\n",
+   "\n",
+   "\n",
+   "    \n",
+   "    def get_data_validation_config(self) -> DataValidationConfig:\n",
+   "        config = self.config.data_validation\n",
+   "\n",
+   "        create_directories([config.root_dir])\n",
+   "\n",
+   "        data_validation_config = DataValidationConfig(\n",
+   "            root_dir=config.root_dir,\n",
+   "            STATUS_FILE=config.STATUS_FILE,\n",
+   "            ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES,\n",
+   "        )\n",
+   "\n",
+   "        return data_validation_config"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 8,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "import os\n",
+   "from textsummarizer.logging import logger\n",
+   "\n",
+   "class DataValiadtion:\n",
+   "    def __init__(self, config: DataValidationConfig):\n",
+   "        self.config = config\n",
+   "\n",
+   "\n",
+   "    \n",
+   "    def validate_all_files_exist(self)-> bool:\n",
+   "        try:\n",
+   "            validation_status = None\n",
+   "\n",
+   "            all_files = os.listdir(os.path.join(\"artifacts\",\"data_ingestion\",\"samsum_dataset\"))\n",
+   "\n",
+   "            for file in all_files:\n",
+   "                if file not in self.config.ALL_REQUIRED_FILES:\n",
+   "                    validation_status = False\n",
+   "                    with open(self.config.STATUS_FILE, 'w') as f:\n",
+   "                        f.write(f\"Validation status: {validation_status}\")\n",
+   "                else:\n",
+   "                    validation_status = True\n",
+   "                    with open(self.config.STATUS_FILE, 'w') as f:\n",
+   "                        f.write(f\"Validation status: {validation_status}\")\n",
+   "\n",
+   "            return validation_status\n",
+   "        \n",
+   "        except Exception as e:\n",
+   "            raise e\n"
   ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 9,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "[2024-08-11 16:18:48,704: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
+     "[2024-08-11 16:18:48,706: INFO: common: yaml file: params.yaml loaded successfully]\n",
+     "[2024-08-11 16:18:48,707: INFO: common: created directory at: artifacts]\n",
+     "[2024-08-11 16:18:48,708: INFO: common: created directory at: artifacts/data_validation]\n"
+    ]
+   }
+  ],
+  "source": [
+   "try:\n",
+   "    config = ConfigurationManager()\n",
+   "    data_validataion_config = config.get_data_validation_config()\n",
+   "    data_validation = DataValiadtion(config=data_validataion_config)\n",
+   "    data_validation.validate_all_files_exist()\n",
+   "    \n",
+   "except Exception as e:\n",
+   "    raise e"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": []
  }
 ],
 "metadata": {
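Note on the added `validate_all_files_exist` cell: as written, the loop flags directory entries that are not listed in `ALL_REQUIRED_FILES` rather than confirming each required file is present, and it rewrites `STATUS_FILE` on every iteration, so the recorded status reflects only the last entry returned by `os.listdir`. Below is a minimal standalone sketch of a check that instead verifies every required file exists; the helper name, the example `ALL_REQUIRED_FILES` contents, and the status-file path are assumptions for illustration, not values taken from config.yaml.

import os
from pathlib import Path

def validate_required_files(data_dir: str, required_files: list, status_file: str) -> bool:
    # Collect what is actually present in the ingested dataset directory.
    present = set(os.listdir(data_dir))

    # Required entries that never showed up on disk.
    missing = [name for name in required_files if name not in present]
    validation_status = len(missing) == 0

    # Record the result once, after the whole check.
    Path(status_file).write_text(f"Validation status: {validation_status}")
    return validation_status

# Hypothetical usage mirroring the notebook's paths:
# validate_required_files(
#     os.path.join("artifacts", "data_ingestion", "samsum_dataset"),
#     ["train", "test", "validation"],
#     os.path.join("artifacts", "data_validation", "status.txt"),
# )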