hakim committed
Commit f68f6ad · 1 Parent(s): cb7cafb

notebook added

research/data_transformation.ipynb CHANGED
@@ -107,7 +107,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 6,
+"execution_count": 10,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -118,7 +118,7 @@
 " \n",
 " \n",
 " def convert_examples_to_features(self, example_batch):\n",
-" input_encoding = self.tokenizer(example_batch['dialogue'], max_lenght = 1024, truncation = True)\n",
+" input_encoding = self.tokenizer(example_batch['dialogue'], max_length = 1024, truncation = True)\n",
 " \n",
 " with self.tokenizer.as_target_tokenizer():\n",
 " target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True )\n",
@@ -137,30 +137,118 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": 11,
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[2024-08-11 18:13:05,753: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
-"[2024-08-11 18:13:05,757: INFO: common: yaml file: params.yaml loaded successfully]\n",
-"[2024-08-11 18:13:05,758: INFO: common: created directory at: artifacts]\n",
-"[2024-08-11 18:13:05,760: INFO: common: created directory at: artifacts/data_transformation]\n"
+"[2024-08-11 18:13:44,678: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
+"[2024-08-11 18:13:44,681: INFO: common: yaml file: params.yaml loaded successfully]\n",
+"[2024-08-11 18:13:44,684: INFO: common: created directory at: artifacts]\n",
+"[2024-08-11 18:13:44,686: INFO: common: created directory at: artifacts/data_transformation]\n"
 ]
 },
 {
-"ename": "TypeError",
-"evalue": "DataTransformation() takes no arguments",
-"output_type": "error",
-"traceback": [
-"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
-"Cell \u001b[1;32mIn[7], line 7\u001b[0m\n\u001b[0;32m 5\u001b[0m data_transformation\u001b[38;5;241m.\u001b[39mconvert()\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n",
-"Cell \u001b[1;32mIn[7], line 4\u001b[0m\n\u001b[0;32m 2\u001b[0m config \u001b[38;5;241m=\u001b[39m ConfigurationManager()\n\u001b[0;32m 3\u001b[0m data_transformation_config \u001b[38;5;241m=\u001b[39m config\u001b[38;5;241m.\u001b[39mget_data_transformation_config()\n\u001b[1;32m----> 4\u001b[0m data_transformation \u001b[38;5;241m=\u001b[39m \u001b[43mDataTransformation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_transformation_config\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 5\u001b[0m data_transformation\u001b[38;5;241m.\u001b[39mconvert()\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n",
-"\u001b[1;31mTypeError\u001b[0m: DataTransformation() takes no arguments"
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"c:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
+" warnings.warn(\n"
 ]
+},
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "bdedbcfbff63497081e37ad9b20a6c31",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+"Map: 0%| | 0/14732 [00:00<?, ? examples/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"c:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:4126: UserWarning: `as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your labels by using the argument `text_target` of the regular `__call__` method (either in the same call as your input texts if you use the same keyword arguments, or in a separate call.\n",
+" warnings.warn(\n"
+]
+},
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "3b8826d099004000a2a037e32bbdf1cc",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+"Map: 0%| | 0/819 [00:00<?, ? examples/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+},
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "4a5e1728a7d142d3b767f7b9c8f14c6f",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+"Map: 0%| | 0/818 [00:00<?, ? examples/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+},
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "54194e4ec3de42738a2107fa26673aef",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+"Saving the dataset (0/1 shards): 0%| | 0/14732 [00:00<?, ? examples/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+},
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "0209e20f794e4e3ab60ef282b98b8bb3",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+"Saving the dataset (0/1 shards): 0%| | 0/819 [00:00<?, ? examples/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+},
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "0a89fb4c0a96413782a55206d087a2a6",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+"Saving the dataset (0/1 shards): 0%| | 0/818 [00:00<?, ? examples/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
 }
 ],
 "source": [
 
research/data_validation.ipynb CHANGED
@@ -32,7 +32,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 3,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -41,9 +41,116 @@
 "@dataclass(frozen=True)\n",
 "class DataValidationConfig:\n",
 " root_dir : Path\n",
-" STATUS_FILE : Path\n",
-" ALL_REQUIRED_FILES : "
+" STATUS_FILE : str\n",
+" ALL_REQUIRED_FILES : list"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 5,
+"metadata": {},
+"outputs": [],
+"source": [
+"from textsummarizer.constants import *\n",
+"from textsummarizer.utils.common import read_yaml, create_directories\n",
+"\n",
+"class ConfigurationManager:\n",
+" def __init__(\n",
+" self,\n",
+" config_filepath = CONFIG_FILE_PATH,\n",
+" params_filepath = PARAMS_FILE_PATH):\n",
+"\n",
+" self.config = read_yaml(config_filepath)\n",
+" self.params = read_yaml(params_filepath)\n",
+"\n",
+" create_directories([self.config.artifacts_root])\n",
+"\n",
+"\n",
+" \n",
+" def get_data_validation_config(self) -> DataValidationConfig:\n",
+" config = self.config.data_validation\n",
+"\n",
+" create_directories([config.root_dir])\n",
+"\n",
+" data_validation_config = DataValidationConfig(\n",
+" root_dir=config.root_dir,\n",
+" STATUS_FILE=config.STATUS_FILE,\n",
+" ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES,\n",
+" )\n",
+"\n",
+" return data_validation_config"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 8,
+"metadata": {},
+"outputs": [],
+"source": [
+"import os\n",
+"from textsummarizer.logging import logger\n",
+"\n",
+"class DataValiadtion:\n",
+" def __init__(self, config: DataValidationConfig):\n",
+" self.config = config\n",
+"\n",
+"\n",
+" \n",
+" def validate_all_files_exist(self)-> bool:\n",
+" try:\n",
+" validation_status = None\n",
+"\n",
+" all_files = os.listdir(os.path.join(\"artifacts\",\"data_ingestion\",\"samsum_dataset\"))\n",
+"\n",
+" for file in all_files:\n",
+" if file not in self.config.ALL_REQUIRED_FILES:\n",
+" validation_status = False\n",
+" with open(self.config.STATUS_FILE, 'w') as f:\n",
+" f.write(f\"Validation status: {validation_status}\")\n",
+" else:\n",
+" validation_status = True\n",
+" with open(self.config.STATUS_FILE, 'w') as f:\n",
+" f.write(f\"Validation status: {validation_status}\")\n",
+"\n",
+" return validation_status\n",
+" \n",
+" except Exception as e:\n",
+" raise e\n"
 ]
+},
+{
+"cell_type": "code",
+"execution_count": 9,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-08-11 16:18:48,704: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
+"[2024-08-11 16:18:48,706: INFO: common: yaml file: params.yaml loaded successfully]\n",
+"[2024-08-11 16:18:48,707: INFO: common: created directory at: artifacts]\n",
+"[2024-08-11 16:18:48,708: INFO: common: created directory at: artifacts/data_validation]\n"
+]
+}
+],
+"source": [
+"try:\n",
+" config = ConfigurationManager()\n",
+" data_validataion_config = config.get_data_validation_config()\n",
+" data_validation = DataValiadtion(config=data_validataion_config)\n",
+" data_validation.validate_all_files_exist()\n",
+" \n",
+"except Exception as e:\n",
+" raise e"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": []
 }
 ],
 "metadata": {
 