hakim committed
Commit · f68f6ad
1 Parent(s): cb7cafb

notebook added

Files changed:
- research/data_transformation.ipynb +104 -16
- research/data_validation.ipynb +110 -3
research/data_transformation.ipynb
CHANGED

@@ -107,7 +107,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -118,7 +118,7 @@
    " \n",
    " \n",
    "    def convert_examples_to_features(self, example_batch):\n",
-   "        input_encoding = self.tokenizer(example_batch['dialogue'],
+   "        input_encoding = self.tokenizer(example_batch['dialogue'], max_length = 1024, truncation = True)\n",
    " \n",
    "        with self.tokenizer.as_target_tokenizer():\n",
    "            target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True )\n",
@@ -137,30 +137,118 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[2024-08-11 18:13:
-      "[2024-08-11 18:13:
-      "[2024-08-11 18:13:
-      "[2024-08-11 18:13:
+      "[2024-08-11 18:13:44,678: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
+      "[2024-08-11 18:13:44,681: INFO: common: yaml file: params.yaml loaded successfully]\n",
+      "[2024-08-11 18:13:44,684: INFO: common: created directory at: artifacts]\n",
+      "[2024-08-11 18:13:44,686: INFO: common: created directory at: artifacts/data_transformation]\n"
      ]
     },
     {
-      "
-      "
-      "
-
-      "\
-      "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
-      "Cell \u001b[1;32mIn[7], line 7\u001b[0m\n\u001b[0;32m 5\u001b[0m data_transformation\u001b[38;5;241m.\u001b[39mconvert()\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n",
-      "Cell \u001b[1;32mIn[7], line 4\u001b[0m\n\u001b[0;32m 2\u001b[0m config \u001b[38;5;241m=\u001b[39m ConfigurationManager()\n\u001b[0;32m 3\u001b[0m data_transformation_config \u001b[38;5;241m=\u001b[39m config\u001b[38;5;241m.\u001b[39mget_data_transformation_config()\n\u001b[1;32m----> 4\u001b[0m data_transformation \u001b[38;5;241m=\u001b[39m \u001b[43mDataTransformation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_transformation_config\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 5\u001b[0m data_transformation\u001b[38;5;241m.\u001b[39mconvert()\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n",
-      "\u001b[1;31mTypeError\u001b[0m: DataTransformation() takes no arguments"
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "c:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
+       " warnings.warn(\n"
      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "bdedbcfbff63497081e37ad9b20a6c31",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Map: 0%| | 0/14732 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "c:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:4126: UserWarning: `as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your labels by using the argument `text_target` of the regular `__call__` method (either in the same call as your input texts if you use the same keyword arguments, or in a separate call.\n",
+       " warnings.warn(\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "3b8826d099004000a2a037e32bbdf1cc",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Map: 0%| | 0/819 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "4a5e1728a7d142d3b767f7b9c8f14c6f",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Map: 0%| | 0/818 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "54194e4ec3de42738a2107fa26673aef",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Saving the dataset (0/1 shards): 0%| | 0/14732 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "0209e20f794e4e3ab60ef282b98b8bb3",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Saving the dataset (0/1 shards): 0%| | 0/819 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "0a89fb4c0a96413782a55206d087a2a6",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Saving the dataset (0/1 shards): 0%| | 0/818 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
     }
    ],
    "source": [
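Note on the new outputs above: the captured stderr warns that `as_target_tokenizer` is deprecated in favor of the tokenizer's `text_target` argument, the `Map` and `Saving the dataset` progress bars correspond to the 14732-, 819- and 818-example splits, and the removed output shows the previous run failing with `TypeError: DataTransformation() takes no arguments`, which typically points to a missing or mis-indented `__init__`. The sketch below is not the committed code; it shows the same transformation as a standalone function that drops the deprecated context manager, assuming `example_batch` carries the 'dialogue' and 'summary' columns used in the notebook.

from transformers import AutoTokenizer

def convert_examples_to_features(example_batch, tokenizer):
    # Tokenize the source dialogues with the same 1024-token cap as the committed cell.
    input_encodings = tokenizer(
        example_batch["dialogue"], max_length=1024, truncation=True
    )

    # Tokenize the reference summaries in a separate call because they use a
    # different max_length (128); text_target replaces the deprecated
    # as_target_tokenizer() context manager flagged in the stderr output above.
    target_encodings = tokenizer(
        text_target=example_batch["summary"], max_length=128, truncation=True
    )

    # Assumed return shape for the datasets .map() call: model inputs plus labels.
    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"],
    }

# Hypothetical usage (the checkpoint name is an assumption, not read from params.yaml):
# tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
# dataset = dataset.map(lambda batch: convert_examples_to_features(batch, tokenizer), batched=True)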
research/data_validation.ipynb
CHANGED

@@ -32,7 +32,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -41,9 +41,116 @@
    "@dataclass(frozen=True)\n",
    "class DataValidationConfig:\n",
    "    root_dir : Path\n",
-   "    STATUS_FILE :
-   "    ALL_REQUIRED_FILES : "
+   "    STATUS_FILE : str\n",
+   "    ALL_REQUIRED_FILES : list"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 5,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "from textsummarizer.constants import *\n",
+   "from textsummarizer.utils.common import read_yaml, create_directories\n",
+   "\n",
+   "class ConfigurationManager:\n",
+   "    def __init__(\n",
+   "        self,\n",
+   "        config_filepath = CONFIG_FILE_PATH,\n",
+   "        params_filepath = PARAMS_FILE_PATH):\n",
+   "\n",
+   "        self.config = read_yaml(config_filepath)\n",
+   "        self.params = read_yaml(params_filepath)\n",
+   "\n",
+   "        create_directories([self.config.artifacts_root])\n",
+   "\n",
+   "\n",
+   "    \n",
+   "    def get_data_validation_config(self) -> DataValidationConfig:\n",
+   "        config = self.config.data_validation\n",
+   "\n",
+   "        create_directories([config.root_dir])\n",
+   "\n",
+   "        data_validation_config = DataValidationConfig(\n",
+   "            root_dir=config.root_dir,\n",
+   "            STATUS_FILE=config.STATUS_FILE,\n",
+   "            ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES,\n",
+   "        )\n",
+   "\n",
+   "        return data_validation_config"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 8,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "import os\n",
+   "from textsummarizer.logging import logger\n",
+   "\n",
+   "class DataValiadtion:\n",
+   "    def __init__(self, config: DataValidationConfig):\n",
+   "        self.config = config\n",
+   "\n",
+   "\n",
+   "    \n",
+   "    def validate_all_files_exist(self)-> bool:\n",
+   "        try:\n",
+   "            validation_status = None\n",
+   "\n",
+   "            all_files = os.listdir(os.path.join(\"artifacts\",\"data_ingestion\",\"samsum_dataset\"))\n",
+   "\n",
+   "            for file in all_files:\n",
+   "                if file not in self.config.ALL_REQUIRED_FILES:\n",
+   "                    validation_status = False\n",
+   "                    with open(self.config.STATUS_FILE, 'w') as f:\n",
+   "                        f.write(f\"Validation status: {validation_status}\")\n",
+   "                else:\n",
+   "                    validation_status = True\n",
+   "                    with open(self.config.STATUS_FILE, 'w') as f:\n",
+   "                        f.write(f\"Validation status: {validation_status}\")\n",
+   "\n",
+   "            return validation_status\n",
+   "        \n",
+   "        except Exception as e:\n",
+   "            raise e\n"
   ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 9,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "[2024-08-11 16:18:48,704: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
+     "[2024-08-11 16:18:48,706: INFO: common: yaml file: params.yaml loaded successfully]\n",
+     "[2024-08-11 16:18:48,707: INFO: common: created directory at: artifacts]\n",
+     "[2024-08-11 16:18:48,708: INFO: common: created directory at: artifacts/data_validation]\n"
+    ]
+   }
+  ],
+  "source": [
+   "try:\n",
+   "    config = ConfigurationManager()\n",
+   "    data_validataion_config = config.get_data_validation_config()\n",
+   "    data_validation = DataValiadtion(config=data_validataion_config)\n",
+   "    data_validation.validate_all_files_exist()\n",
+   "    \n",
+   "except Exception as e:\n",
+   "    raise e"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": []
  }
 ],
 "metadata": {
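Note on the added `validate_all_files_exist` cell: as written, the loop flags directory entries that are not listed in `ALL_REQUIRED_FILES` rather than confirming each required file is present, and it rewrites `STATUS_FILE` on every iteration, so the recorded status reflects only the last entry returned by `os.listdir`. Below is a minimal standalone sketch of a check that instead verifies every required file exists; the helper name, the example `ALL_REQUIRED_FILES` contents, and the status-file path are assumptions for illustration, not values taken from config.yaml.

import os
from pathlib import Path

def validate_required_files(data_dir: str, required_files: list, status_file: str) -> bool:
    # Collect what is actually present in the ingested dataset directory.
    present = set(os.listdir(data_dir))

    # Required entries that never showed up on disk.
    missing = [name for name in required_files if name not in present]
    validation_status = len(missing) == 0

    # Record the result once, after the whole check.
    Path(status_file).write_text(f"Validation status: {validation_status}")
    return validation_status

# Hypothetical usage mirroring the notebook's paths:
# validate_required_files(
#     os.path.join("artifacts", "data_ingestion", "samsum_dataset"),
#     ["train", "test", "validation"],
#     os.path.join("artifacts", "data_validation", "status.txt"),
# )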