akoksal commited on
Commit
61cccae
·
1 Parent(s): 44fad9e

Delete Training Notebook (Simple NER v2).ipynb

Browse files
Training Notebook (Simple NER v2).ipynb DELETED
@@ -1,1374 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "id": "c88f989c",
7
- "metadata": {},
8
- "outputs": [],
9
- "source": [
10
- "import os\n",
11
- "os.environ['CUDA_VISIBLE_DEVICES']='7'"
12
- ]
13
- },
14
- {
15
- "cell_type": "code",
16
- "execution_count": 2,
17
- "id": "bfdbe247",
18
- "metadata": {
19
- "scrolled": true
20
- },
21
- "outputs": [
22
- {
23
- "name": "stderr",
24
- "output_type": "stream",
25
- "text": [
26
- "2023-02-26 02:35:07.275938: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
27
- "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
28
- "2023-02-26 02:35:07.472394: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
29
- "2023-02-26 02:35:07.472434: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n",
30
- "2023-02-26 02:35:07.503598: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
31
- "2023-02-26 02:35:08.603575: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n",
32
- "2023-02-26 02:35:08.603678: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n",
33
- "2023-02-26 02:35:08.603689: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n",
34
- "2023-02-26 02:35:15.326595: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
35
- "2023-02-26 02:35:15.326728: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory\n",
36
- "2023-02-26 02:35:15.326831: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory\n",
37
- "2023-02-26 02:35:15.327013: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory\n",
38
- "2023-02-26 02:35:15.327108: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusparse.so.11'; dlerror: libcusparse.so.11: cannot open shared object file: No such file or directory\n",
39
- "2023-02-26 02:35:15.327205: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory\n",
40
- "2023-02-26 02:35:15.327224: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.\n",
41
- "Skipping registering GPU devices...\n"
42
- ]
43
- }
44
- ],
45
- "source": [
46
- "from transformers import AutoTokenizer\n",
47
- "import re\n",
48
- "import numpy as np\n",
49
- "from random import Random\n",
50
- "import torch\n",
51
- "import pandas as pd\n",
52
- "import spacy\n",
53
- "import random\n",
54
- "from datasets import load_dataset\n",
55
- "from transformers import (\n",
56
- " AutoModelForTokenClassification,\n",
57
- " AutoTokenizer,\n",
58
- " DataCollatorForTokenClassification,\n",
59
- " TrainingArguments,\n",
60
- " Trainer,\n",
61
- " set_seed)\n",
62
- "import numpy as np\n",
63
- "import datasets\n",
64
- "from collections import defaultdict\n",
65
- "from datasets import load_metric"
66
- ]
67
- },
68
- {
69
- "cell_type": "code",
70
- "execution_count": 3,
71
- "id": "7a916e9f",
72
- "metadata": {},
73
- "outputs": [],
74
- "source": [
75
- "# !pip install seqeval"
76
- ]
77
- },
78
- {
79
- "cell_type": "code",
80
- "execution_count": 4,
81
- "id": "4b0590b7",
82
- "metadata": {},
83
- "outputs": [],
84
- "source": [
85
- "per_device_train_batch_size = 16\n",
86
- "per_device_eval_batch_size = 32\n",
87
- "num_train_epochs = 5\n",
88
- "weight_decay = 0.1\n",
89
- "warmup_ratio = 0.1\n",
90
- "learning_rate = 5e-5\n",
91
- "load_best_model_at_end = True\n",
92
- "output_dir = \"../akoksal/earthquake_ner_models/\"\n",
93
- "old_data_path = \"annotated_address_dataset_07022023_766train_192test/\"\n",
94
- "data_path = \"deprem-private/ner_v12\"\n",
95
- "cache_dir = \"../akoksal/hf_cache\"\n",
96
- "saved_models_path = \"../akoksal/earthquake_ner_models/\"\n",
97
- "device = \"cuda\"\n",
98
- "seed = 42\n",
99
- "model_names = [\"dbmdz/bert-base-turkish-cased\",\n",
100
- " \"dbmdz/electra-base-turkish-mc4-cased-discriminator\",\n",
101
- " \"dbmdz/bert-base-turkish-128k-cased\",\n",
102
- " \"dbmdz/convbert-base-turkish-cased\",\n",
103
- " \"bert-base-multilingual-cased\",\n",
104
- " \"xlm-roberta-base\"]\n",
105
- "model_name = model_names[2]"
106
- ]
107
- },
108
- {
109
- "cell_type": "code",
110
- "execution_count": 5,
111
- "id": "9aeb3dbe",
112
- "metadata": {},
113
- "outputs": [
114
- {
115
- "data": {
116
- "text/plain": [
117
- "'dbmdz/bert-base-turkish-128k-cased'"
118
- ]
119
- },
120
- "execution_count": 5,
121
- "metadata": {},
122
- "output_type": "execute_result"
123
- }
124
- ],
125
- "source": [
126
- "model_name"
127
- ]
128
- },
129
- {
130
- "cell_type": "code",
131
- "execution_count": 6,
132
- "id": "ffeb73e4",
133
- "metadata": {},
134
- "outputs": [],
135
- "source": [
136
- "set_seed(seed)"
137
- ]
138
- },
139
- {
140
- "cell_type": "code",
141
- "execution_count": 7,
142
- "id": "a876c516",
143
- "metadata": {},
144
- "outputs": [],
145
- "source": [
146
- "id2label = {\n",
147
- " 0: \"O\",\n",
148
- " 1: \"B-bina\",\n",
149
- " 2: \"I-bina\",\n",
150
- " 3: \"B-bulvar\",\n",
151
- " 4: \"I-bulvar\",\n",
152
- " 5: \"B-cadde\",\n",
153
- " 6: \"I-cadde\",\n",
154
- " 7: \"B-diskapino\",\n",
155
- " 8: \"I-diskapino\",\n",
156
- " 9: \"B-ilce\",\n",
157
- " 10: \"I-ilce\",\n",
158
- " 11: \"B-isim\",\n",
159
- " 12: \"I-isim\",\n",
160
- " 13: \"B-mahalle\",\n",
161
- " 14: \"I-mahalle\",\n",
162
- " 15: \"B-sehir\",\n",
163
- " 16: \"I-sehir\",\n",
164
- " 17: \"B-site\",\n",
165
- " 18: \"I-site\",\n",
166
- " 19: \"B-sokak\",\n",
167
- " 20: \"I-sokak\",\n",
168
- " 21: \"B-soyisim\",\n",
169
- " 22: \"I-soyisim\",\n",
170
- " 23: \"B-telefonno\",\n",
171
- " 24: \"I-telefonno\",\n",
172
- "}\n",
173
- "\n",
174
- "label2id = {label: idx for idx, label in id2label.items()}\n",
175
- "label_names = list(label2id.keys())"
176
- ]
177
- },
178
- {
179
- "cell_type": "code",
180
- "execution_count": 8,
181
- "id": "2e0caffc",
182
- "metadata": {},
183
- "outputs": [],
184
- "source": [
185
- "# from huggingface_hub import login\n",
186
- "# login()"
187
- ]
188
- },
189
- {
190
- "cell_type": "code",
191
- "execution_count": 9,
192
- "id": "c74850f9",
193
- "metadata": {},
194
- "outputs": [
195
- {
196
- "name": "stderr",
197
- "output_type": "stream",
198
- "text": [
199
- "Some weights of the model checkpoint at dbmdz/bert-base-turkish-128k-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n",
200
- "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
201
- "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
202
- "Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
203
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
204
- ]
205
- }
206
- ],
207
- "source": [
208
- "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
209
- "model = AutoModelForTokenClassification.from_pretrained(model_name,\n",
210
- " num_labels=len(label_names),\n",
211
- " id2label=id2label,\n",
212
- " cache_dir=cache_dir).to(device)"
213
- ]
214
- },
215
- {
216
- "cell_type": "code",
217
- "execution_count": 10,
218
- "id": "4c1fe653",
219
- "metadata": {},
220
- "outputs": [
221
- {
222
- "name": "stderr",
223
- "output_type": "stream",
224
- "text": [
225
- "Using custom data configuration deprem-private--ner_v12-e2f61c5a18a7a738\n",
226
- "Found cached dataset text (/mounts/Users/cisintern/akoksal/.cache/huggingface/datasets/deprem-private___text/deprem-private--ner_v12-e2f61c5a18a7a738/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)\n"
227
- ]
228
- },
229
- {
230
- "data": {
231
- "application/vnd.jupyter.widget-view+json": {
232
- "model_id": "22bc5f5f97204b41b2bc5dc3b71036e1",
233
- "version_major": 2,
234
- "version_minor": 0
235
- },
236
- "text/plain": [
237
- " 0%| | 0/3 [00:00<?, ?it/s]"
238
- ]
239
- },
240
- "metadata": {},
241
- "output_type": "display_data"
242
- }
243
- ],
244
- "source": [
245
- "raw_dataset = datasets.load_dataset(\"deprem-private/ner_v12\", use_auth_token=True)\n",
246
- "\n",
247
- "new_dataset_json = {}\n",
248
- "for split in [\"train\", \"validation\", \"test\"]:\n",
249
- " ids = []\n",
250
- " sentences = []\n",
251
- " labels = []\n",
252
- " ids = []\n",
253
- " cur_idx = 0\n",
254
- " unique_labels = set()\n",
255
- " temp_sent = []\n",
256
- " temp_labels = []\n",
257
- " for word in raw_dataset[split][\"text\"]:\n",
258
- " \n",
259
- " if word!=\"\":\n",
260
- " temp_sent.append((word.split()[0]))\n",
261
- " temp_labels.append(label2id[(word.split()[1])])\n",
262
- " else:\n",
263
- " sentences.append(temp_sent)\n",
264
- " labels.append(temp_labels)\n",
265
- " ids.append(cur_idx)\n",
266
- " cur_idx+=1\n",
267
- " temp_sent = []\n",
268
- " temp_labels = []\n",
269
- " new_dataset_json[split] = {\"tokens\":sentences, \"ner_tags\":labels, \"ids\":ids}\n",
270
- "\n",
271
- "dataset = datasets.DatasetDict()\n",
272
- "# using your `Dict` object\n",
273
- "for k,v in new_dataset_json.items():\n",
274
- " dataset[k] = datasets.Dataset.from_dict(v)"
275
- ]
276
- },
277
- {
278
- "cell_type": "code",
279
- "execution_count": 11,
280
- "id": "65a66af9",
281
- "metadata": {},
282
- "outputs": [
283
- {
284
- "data": {
285
- "application/vnd.jupyter.widget-view+json": {
286
- "model_id": "a403f5fadb3041f4b18acc7ec41a2d36",
287
- "version_major": 2,
288
- "version_minor": 0
289
- },
290
- "text/plain": [
291
- " 0%| | 0/1 [00:00<?, ?ba/s]"
292
- ]
293
- },
294
- "metadata": {},
295
- "output_type": "display_data"
296
- },
297
- {
298
- "data": {
299
- "application/vnd.jupyter.widget-view+json": {
300
- "model_id": "e2410f6106514cfd8207d8b42748c66d",
301
- "version_major": 2,
302
- "version_minor": 0
303
- },
304
- "text/plain": [
305
- " 0%| | 0/1 [00:00<?, ?ba/s]"
306
- ]
307
- },
308
- "metadata": {},
309
- "output_type": "display_data"
310
- },
311
- {
312
- "data": {
313
- "application/vnd.jupyter.widget-view+json": {
314
- "model_id": "227e163e07b2414da9abdbe11cb0c6bf",
315
- "version_major": 2,
316
- "version_minor": 0
317
- },
318
- "text/plain": [
319
- " 0%| | 0/1 [00:00<?, ?ba/s]"
320
- ]
321
- },
322
- "metadata": {},
323
- "output_type": "display_data"
324
- }
325
- ],
326
- "source": [
327
- "# dataset = datasets.load_from_disk(old_data_path)\n",
328
- "def tokenize_and_align_labels(examples):\n",
329
- " tokenized_inputs = tokenizer(examples[\"tokens\"], truncation=True, is_split_into_words=True)\n",
330
- "\n",
331
- " labels = []\n",
332
- " for i, label in enumerate(examples[f\"ner_tags\"]):\n",
333
- " word_ids = tokenized_inputs.word_ids(batch_index=i) # Map tokens to their respective word.\n",
334
- " previous_word_idx = None\n",
335
- " label_ids = []\n",
336
- " for word_idx in word_ids: # Set the special tokens to -100.\n",
337
- " if word_idx is None:\n",
338
- " label_ids.append(-100)\n",
339
- " elif word_idx != previous_word_idx: # Only label the first token of a given word.\n",
340
- " label_ids.append(label[word_idx])\n",
341
- " else:\n",
342
- " label_ids.append(-100)\n",
343
- " previous_word_idx = word_idx\n",
344
- " labels.append(label_ids)\n",
345
- "\n",
346
- " tokenized_inputs[\"labels\"] = labels\n",
347
- " return tokenized_inputs\n",
348
- "\n",
349
- "tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)"
350
- ]
351
- },
352
- {
353
- "cell_type": "code",
354
- "execution_count": 12,
355
- "id": "6b43934d",
356
- "metadata": {},
357
- "outputs": [],
358
- "source": [
359
- "data_collator = DataCollatorForTokenClassification(tokenizer)"
360
- ]
361
- },
362
- {
363
- "cell_type": "code",
364
- "execution_count": 13,
365
- "id": "c24f52db",
366
- "metadata": {},
367
- "outputs": [
368
- {
369
- "name": "stderr",
370
- "output_type": "stream",
371
- "text": [
372
- "/tmp/ipykernel_2652487/885599324.py:1: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n",
373
- " metric = load_metric(\"seqeval\")\n"
374
- ]
375
- }
376
- ],
377
- "source": [
378
- "metric = load_metric(\"seqeval\")\n",
379
- "def compute_metrics(p):\n",
380
- " predictions, labels = p\n",
381
- " predictions = np.argmax(predictions, axis=2)\n",
382
- "\n",
383
- " # Remove ignored index (special tokens)\n",
384
- " true_predictions = [\n",
385
- " [label_names[p] for (p, l) in zip(prediction, label) if l != -100]\n",
386
- " for prediction, label in zip(predictions, labels)\n",
387
- " ]\n",
388
- " true_labels = [\n",
389
- " [label_names[l] for (p, l) in zip(prediction, label) if l != -100]\n",
390
- " for prediction, label in zip(predictions, labels)\n",
391
- " ]\n",
392
- "\n",
393
- " results = metric.compute(predictions=true_predictions, references=true_labels)\n",
394
- " flattened_results = {\n",
395
- " \"overall_precision\": results[\"overall_precision\"],\n",
396
- " \"overall_recall\": results[\"overall_recall\"],\n",
397
- " \"overall_f1\": results[\"overall_f1\"],\n",
398
- " \"overall_accuracy\": results[\"overall_accuracy\"],\n",
399
- " }\n",
400
- " for k in results.keys():\n",
401
- " if(k not in flattened_results.keys()):\n",
402
- " flattened_results[k+\"_f1\"]=results[k][\"f1\"]\n",
403
- " flattened_results[k+\"_recall\"]=results[k][\"recall\"]\n",
404
- " flattened_results[k+\"_precision\"]=results[k][\"precision\"]\n",
405
- " flattened_results[k+\"_support\"]=results[k][\"number\"]\n",
406
- "\n",
407
- " return flattened_results"
408
- ]
409
- },
410
- {
411
- "cell_type": "code",
412
- "execution_count": 14,
413
- "id": "a955fd51",
414
- "metadata": {},
415
- "outputs": [],
416
- "source": [
417
- "training_args = TrainingArguments(\n",
418
- " output_dir=saved_models_path,\n",
419
- " evaluation_strategy=\"epoch\",\n",
420
- " learning_rate=learning_rate,\n",
421
- " per_device_train_batch_size=per_device_train_batch_size,\n",
422
- " per_device_eval_batch_size=per_device_eval_batch_size,\n",
423
- " num_train_epochs=num_train_epochs,\n",
424
- " warmup_ratio=warmup_ratio,\n",
425
- " weight_decay=weight_decay,\n",
426
- " run_name = \"turkish_ner\",\n",
427
- " save_strategy='epoch',\n",
428
- " logging_strategy=\"epoch\",\n",
429
- " save_total_limit=3,\n",
430
- " load_best_model_at_end=load_best_model_at_end,\n",
431
- " \n",
432
- ")\n",
433
- "trainer = Trainer(\n",
434
- " model=model,\n",
435
- " args=training_args,\n",
436
- " train_dataset=tokenized_dataset[\"train\"],\n",
437
- " eval_dataset=tokenized_dataset[\"validation\"],\n",
438
- " data_collator=data_collator,\n",
439
- " tokenizer=tokenizer,\n",
440
- " compute_metrics=compute_metrics\n",
441
- ")"
442
- ]
443
- },
444
- {
445
- "cell_type": "code",
446
- "execution_count": 15,
447
- "id": "9f78efdc",
448
- "metadata": {},
449
- "outputs": [
450
- {
451
- "name": "stderr",
452
- "output_type": "stream",
453
- "text": [
454
- "The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n",
455
- "/mounts/work/akoksal/anaconda3/envs/lmbias/lib/python3.9/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
456
- " warnings.warn(\n",
457
- "***** Running training *****\n",
458
- " Num examples = 799\n",
459
- " Num Epochs = 5\n",
460
- " Instantaneous batch size per device = 16\n",
461
- " Total train batch size (w. parallel, distributed & accumulation) = 16\n",
462
- " Gradient Accumulation steps = 1\n",
463
- " Total optimization steps = 250\n",
464
- " Number of trainable parameters = 183773977\n",
465
- "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
466
- ]
467
- },
468
- {
469
- "data": {
470
- "text/html": [
471
- "\n",
472
- " <div>\n",
473
- " \n",
474
- " <progress value='250' max='250' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
475
- " [250/250 01:12, Epoch 5/5]\n",
476
- " </div>\n",
477
- " <table border=\"1\" class=\"dataframe\">\n",
478
- " <thead>\n",
479
- " <tr style=\"text-align: left;\">\n",
480
- " <th>Epoch</th>\n",
481
- " <th>Training Loss</th>\n",
482
- " <th>Validation Loss</th>\n",
483
- " <th>Overall Precision</th>\n",
484
- " <th>Overall Recall</th>\n",
485
- " <th>Overall F1</th>\n",
486
- " <th>Overall Accuracy</th>\n",
487
- " <th>Bina F1</th>\n",
488
- " <th>Bina Recall</th>\n",
489
- " <th>Bina Precision</th>\n",
490
- " <th>Bina Support</th>\n",
491
- " <th>Bulvar F1</th>\n",
492
- " <th>Bulvar Recall</th>\n",
493
- " <th>Bulvar Precision</th>\n",
494
- " <th>Bulvar Support</th>\n",
495
- " <th>Cadde F1</th>\n",
496
- " <th>Cadde Recall</th>\n",
497
- " <th>Cadde Precision</th>\n",
498
- " <th>Cadde Support</th>\n",
499
- " <th>Diskapino F1</th>\n",
500
- " <th>Diskapino Recall</th>\n",
501
- " <th>Diskapino Precision</th>\n",
502
- " <th>Diskapino Support</th>\n",
503
- " <th>Ilce F1</th>\n",
504
- " <th>Ilce Recall</th>\n",
505
- " <th>Ilce Precision</th>\n",
506
- " <th>Ilce Support</th>\n",
507
- " <th>Isim F1</th>\n",
508
- " <th>Isim Recall</th>\n",
509
- " <th>Isim Precision</th>\n",
510
- " <th>Isim Support</th>\n",
511
- " <th>Mahalle F1</th>\n",
512
- " <th>Mahalle Recall</th>\n",
513
- " <th>Mahalle Precision</th>\n",
514
- " <th>Mahalle Support</th>\n",
515
- " <th>Sehir F1</th>\n",
516
- " <th>Sehir Recall</th>\n",
517
- " <th>Sehir Precision</th>\n",
518
- " <th>Sehir Support</th>\n",
519
- " <th>Site F1</th>\n",
520
- " <th>Site Recall</th>\n",
521
- " <th>Site Precision</th>\n",
522
- " <th>Site Support</th>\n",
523
- " <th>Sokak F1</th>\n",
524
- " <th>Sokak Recall</th>\n",
525
- " <th>Sokak Precision</th>\n",
526
- " <th>Sokak Support</th>\n",
527
- " <th>Soyisim F1</th>\n",
528
- " <th>Soyisim Recall</th>\n",
529
- " <th>Soyisim Precision</th>\n",
530
- " <th>Soyisim Support</th>\n",
531
- " <th>Telefonno F1</th>\n",
532
- " <th>Telefonno Recall</th>\n",
533
- " <th>Telefonno Precision</th>\n",
534
- " <th>Telefonno Support</th>\n",
535
- " </tr>\n",
536
- " </thead>\n",
537
- " <tbody>\n",
538
- " <tr>\n",
539
- " <td>1</td>\n",
540
- " <td>1.349500</td>\n",
541
- " <td>0.357321</td>\n",
542
- " <td>0.783270</td>\n",
543
- " <td>0.828974</td>\n",
544
- " <td>0.805474</td>\n",
545
- " <td>0.908936</td>\n",
546
- " <td>0.600000</td>\n",
547
- " <td>0.705882</td>\n",
548
- " <td>0.521739</td>\n",
549
- " <td>34</td>\n",
550
- " <td>0.000000</td>\n",
551
- " <td>0.000000</td>\n",
552
- " <td>0.000000</td>\n",
553
- " <td>5</td>\n",
554
- " <td>0.588235</td>\n",
555
- " <td>0.833333</td>\n",
556
- " <td>0.454545</td>\n",
557
- " <td>24</td>\n",
558
- " <td>0.769231</td>\n",
559
- " <td>0.892857</td>\n",
560
- " <td>0.675676</td>\n",
561
- " <td>28</td>\n",
562
- " <td>0.830508</td>\n",
563
- " <td>0.816667</td>\n",
564
- " <td>0.844828</td>\n",
565
- " <td>60</td>\n",
566
- " <td>0.888889</td>\n",
567
- " <td>0.926829</td>\n",
568
- " <td>0.853933</td>\n",
569
- " <td>82</td>\n",
570
- " <td>0.750000</td>\n",
571
- " <td>0.792453</td>\n",
572
- " <td>0.711864</td>\n",
573
- " <td>53</td>\n",
574
- " <td>0.867133</td>\n",
575
- " <td>0.861111</td>\n",
576
- " <td>0.873239</td>\n",
577
- " <td>72</td>\n",
578
- " <td>0.000000</td>\n",
579
- " <td>0.000000</td>\n",
580
- " <td>0.000000</td>\n",
581
- " <td>6</td>\n",
582
- " <td>0.750000</td>\n",
583
- " <td>0.620690</td>\n",
584
- " <td>0.947368</td>\n",
585
- " <td>29</td>\n",
586
- " <td>0.900000</td>\n",
587
- " <td>0.887324</td>\n",
588
- " <td>0.913043</td>\n",
589
- " <td>71</td>\n",
590
- " <td>0.985075</td>\n",
591
- " <td>1.000000</td>\n",
592
- " <td>0.970588</td>\n",
593
- " <td>33</td>\n",
594
- " </tr>\n",
595
- " <tr>\n",
596
- " <td>2</td>\n",
597
- " <td>0.264700</td>\n",
598
- " <td>0.220467</td>\n",
599
- " <td>0.885149</td>\n",
600
- " <td>0.899396</td>\n",
601
- " <td>0.892216</td>\n",
602
- " <td>0.944792</td>\n",
603
- " <td>0.782609</td>\n",
604
- " <td>0.794118</td>\n",
605
- " <td>0.771429</td>\n",
606
- " <td>34</td>\n",
607
- " <td>0.666667</td>\n",
608
- " <td>0.800000</td>\n",
609
- " <td>0.571429</td>\n",
610
- " <td>5</td>\n",
611
- " <td>0.875000</td>\n",
612
- " <td>0.875000</td>\n",
613
- " <td>0.875000</td>\n",
614
- " <td>24</td>\n",
615
- " <td>0.862069</td>\n",
616
- " <td>0.892857</td>\n",
617
- " <td>0.833333</td>\n",
618
- " <td>28</td>\n",
619
- " <td>0.894309</td>\n",
620
- " <td>0.916667</td>\n",
621
- " <td>0.873016</td>\n",
622
- " <td>60</td>\n",
623
- " <td>0.884848</td>\n",
624
- " <td>0.890244</td>\n",
625
- " <td>0.879518</td>\n",
626
- " <td>82</td>\n",
627
- " <td>0.897196</td>\n",
628
- " <td>0.905660</td>\n",
629
- " <td>0.888889</td>\n",
630
- " <td>53</td>\n",
631
- " <td>0.915493</td>\n",
632
- " <td>0.902778</td>\n",
633
- " <td>0.928571</td>\n",
634
- " <td>72</td>\n",
635
- " <td>0.181818</td>\n",
636
- " <td>0.166667</td>\n",
637
- " <td>0.200000</td>\n",
638
- " <td>6</td>\n",
639
- " <td>0.949153</td>\n",
640
- " <td>0.965517</td>\n",
641
- " <td>0.933333</td>\n",
642
- " <td>29</td>\n",
643
- " <td>0.950355</td>\n",
644
- " <td>0.943662</td>\n",
645
- " <td>0.957143</td>\n",
646
- " <td>71</td>\n",
647
- " <td>0.985075</td>\n",
648
- " <td>1.000000</td>\n",
649
- " <td>0.970588</td>\n",
650
- " <td>33</td>\n",
651
- " </tr>\n",
652
- " <tr>\n",
653
- " <td>3</td>\n",
654
- " <td>0.158700</td>\n",
655
- " <td>0.219565</td>\n",
656
- " <td>0.876768</td>\n",
657
- " <td>0.873239</td>\n",
658
- " <td>0.875000</td>\n",
659
- " <td>0.940808</td>\n",
660
- " <td>0.805556</td>\n",
661
- " <td>0.852941</td>\n",
662
- " <td>0.763158</td>\n",
663
- " <td>34</td>\n",
664
- " <td>0.666667</td>\n",
665
- " <td>1.000000</td>\n",
666
- " <td>0.500000</td>\n",
667
- " <td>5</td>\n",
668
- " <td>0.880000</td>\n",
669
- " <td>0.916667</td>\n",
670
- " <td>0.846154</td>\n",
671
- " <td>24</td>\n",
672
- " <td>0.827586</td>\n",
673
- " <td>0.857143</td>\n",
674
- " <td>0.800000</td>\n",
675
- " <td>28</td>\n",
676
- " <td>0.881356</td>\n",
677
- " <td>0.866667</td>\n",
678
- " <td>0.896552</td>\n",
679
- " <td>60</td>\n",
680
- " <td>0.822785</td>\n",
681
- " <td>0.792683</td>\n",
682
- " <td>0.855263</td>\n",
683
- " <td>82</td>\n",
684
- " <td>0.886792</td>\n",
685
- " <td>0.886792</td>\n",
686
- " <td>0.886792</td>\n",
687
- " <td>53</td>\n",
688
- " <td>0.892086</td>\n",
689
- " <td>0.861111</td>\n",
690
- " <td>0.925373</td>\n",
691
- " <td>72</td>\n",
692
- " <td>0.400000</td>\n",
693
- " <td>0.333333</td>\n",
694
- " <td>0.500000</td>\n",
695
- " <td>6</td>\n",
696
- " <td>0.881356</td>\n",
697
- " <td>0.896552</td>\n",
698
- " <td>0.866667</td>\n",
699
- " <td>29</td>\n",
700
- " <td>0.957143</td>\n",
701
- " <td>0.943662</td>\n",
702
- " <td>0.971014</td>\n",
703
- " <td>71</td>\n",
704
- " <td>0.985075</td>\n",
705
- " <td>1.000000</td>\n",
706
- " <td>0.970588</td>\n",
707
- " <td>33</td>\n",
708
- " </tr>\n",
709
- " <tr>\n",
710
- " <td>4</td>\n",
711
- " <td>0.115000</td>\n",
712
- " <td>0.215329</td>\n",
713
- " <td>0.897541</td>\n",
714
- " <td>0.881288</td>\n",
715
- " <td>0.889340</td>\n",
716
- " <td>0.946500</td>\n",
717
- " <td>0.857143</td>\n",
718
- " <td>0.882353</td>\n",
719
- " <td>0.833333</td>\n",
720
- " <td>34</td>\n",
721
- " <td>0.909091</td>\n",
722
- " <td>1.000000</td>\n",
723
- " <td>0.833333</td>\n",
724
- " <td>5</td>\n",
725
- " <td>0.897959</td>\n",
726
- " <td>0.916667</td>\n",
727
- " <td>0.880000</td>\n",
728
- " <td>24</td>\n",
729
- " <td>0.862069</td>\n",
730
- " <td>0.892857</td>\n",
731
- " <td>0.833333</td>\n",
732
- " <td>28</td>\n",
733
- " <td>0.881356</td>\n",
734
- " <td>0.866667</td>\n",
735
- " <td>0.896552</td>\n",
736
- " <td>60</td>\n",
737
- " <td>0.810127</td>\n",
738
- " <td>0.780488</td>\n",
739
- " <td>0.842105</td>\n",
740
- " <td>82</td>\n",
741
- " <td>0.886792</td>\n",
742
- " <td>0.886792</td>\n",
743
- " <td>0.886792</td>\n",
744
- " <td>53</td>\n",
745
- " <td>0.890511</td>\n",
746
- " <td>0.847222</td>\n",
747
- " <td>0.938462</td>\n",
748
- " <td>72</td>\n",
749
- " <td>0.727273</td>\n",
750
- " <td>0.666667</td>\n",
751
- " <td>0.800000</td>\n",
752
- " <td>6</td>\n",
753
- " <td>0.950820</td>\n",
754
- " <td>1.000000</td>\n",
755
- " <td>0.906250</td>\n",
756
- " <td>29</td>\n",
757
- " <td>0.949640</td>\n",
758
- " <td>0.929577</td>\n",
759
- " <td>0.970588</td>\n",
760
- " <td>71</td>\n",
761
- " <td>0.985075</td>\n",
762
- " <td>1.000000</td>\n",
763
- " <td>0.970588</td>\n",
764
- " <td>33</td>\n",
765
- " </tr>\n",
766
- " <tr>\n",
767
- " <td>5</td>\n",
768
- " <td>0.093800</td>\n",
769
- " <td>0.231558</td>\n",
770
- " <td>0.895492</td>\n",
771
- " <td>0.879276</td>\n",
772
- " <td>0.887310</td>\n",
773
- " <td>0.945361</td>\n",
774
- " <td>0.833333</td>\n",
775
- " <td>0.882353</td>\n",
776
- " <td>0.789474</td>\n",
777
- " <td>34</td>\n",
778
- " <td>0.909091</td>\n",
779
- " <td>1.000000</td>\n",
780
- " <td>0.833333</td>\n",
781
- " <td>5</td>\n",
782
- " <td>0.880000</td>\n",
783
- " <td>0.916667</td>\n",
784
- " <td>0.846154</td>\n",
785
- " <td>24</td>\n",
786
- " <td>0.813559</td>\n",
787
- " <td>0.857143</td>\n",
788
- " <td>0.774194</td>\n",
789
- " <td>28</td>\n",
790
- " <td>0.888889</td>\n",
791
- " <td>0.866667</td>\n",
792
- " <td>0.912281</td>\n",
793
- " <td>60</td>\n",
794
- " <td>0.833333</td>\n",
795
- " <td>0.792683</td>\n",
796
- " <td>0.878378</td>\n",
797
- " <td>82</td>\n",
798
- " <td>0.895238</td>\n",
799
- " <td>0.886792</td>\n",
800
- " <td>0.903846</td>\n",
801
- " <td>53</td>\n",
802
- " <td>0.898551</td>\n",
803
- " <td>0.861111</td>\n",
804
- " <td>0.939394</td>\n",
805
- " <td>72</td>\n",
806
- " <td>0.727273</td>\n",
807
- " <td>0.666667</td>\n",
808
- " <td>0.800000</td>\n",
809
- " <td>6</td>\n",
810
- " <td>0.881356</td>\n",
811
- " <td>0.896552</td>\n",
812
- " <td>0.866667</td>\n",
813
- " <td>29</td>\n",
814
- " <td>0.957143</td>\n",
815
- " <td>0.943662</td>\n",
816
- " <td>0.971014</td>\n",
817
- " <td>71</td>\n",
818
- " <td>0.985075</td>\n",
819
- " <td>1.000000</td>\n",
820
- " <td>0.970588</td>\n",
821
- " <td>33</td>\n",
822
- " </tr>\n",
823
- " </tbody>\n",
824
- "</table><p>"
825
- ],
826
- "text/plain": [
827
- "<IPython.core.display.HTML object>"
828
- ]
829
- },
830
- "metadata": {},
831
- "output_type": "display_data"
832
- },
833
- {
834
- "name": "stderr",
835
- "output_type": "stream",
836
- "text": [
837
- "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n",
838
- "***** Running Evaluation *****\n",
839
- " Num examples = 58\n",
840
- " Batch size = 32\n",
841
- "/mounts/work/akoksal/anaconda3/envs/lmbias/lib/python3.9/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
842
- " _warn_prf(average, modifier, msg_start, len(result))\n",
843
- "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-50\n",
844
- "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-50/config.json\n",
845
- "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-50/pytorch_model.bin\n",
846
- "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-50/tokenizer_config.json\n",
847
- "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-50/special_tokens_map.json\n",
848
- "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n",
849
- "***** Running Evaluation *****\n",
850
- " Num examples = 58\n",
851
- " Batch size = 32\n",
852
- "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-100\n",
853
- "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-100/config.json\n",
854
- "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-100/pytorch_model.bin\n",
855
- "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-100/tokenizer_config.json\n",
856
- "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-100/special_tokens_map.json\n",
857
- "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n",
858
- "***** Running Evaluation *****\n",
859
- " Num examples = 58\n",
860
- " Batch size = 32\n",
861
- "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-150\n",
862
- "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-150/config.json\n",
863
- "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-150/pytorch_model.bin\n",
864
- "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-150/tokenizer_config.json\n",
865
- "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-150/special_tokens_map.json\n",
866
- "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n",
867
- "***** Running Evaluation *****\n",
868
- " Num examples = 58\n",
869
- " Batch size = 32\n",
870
- "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-200\n",
871
- "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-200/config.json\n",
872
- "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-200/pytorch_model.bin\n",
873
- "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-200/tokenizer_config.json\n",
874
- "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-200/special_tokens_map.json\n",
875
- "Deleting older checkpoint [/mounts/work/akoksal/earthquake_ner_models/checkpoint-50] due to args.save_total_limit\n",
876
- "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n",
877
- "***** Running Evaluation *****\n",
878
- " Num examples = 58\n",
879
- " Batch size = 32\n",
880
- "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-250\n",
881
- "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-250/config.json\n",
882
- "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-250/pytorch_model.bin\n",
883
- "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-250/tokenizer_config.json\n",
884
- "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-250/special_tokens_map.json\n",
885
- "Deleting older checkpoint [/mounts/work/akoksal/earthquake_ner_models/checkpoint-100] due to args.save_total_limit\n",
886
- "\n",
887
- "\n",
888
- "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
889
- "\n",
890
- "\n",
891
- "Loading best model from /mounts/work/akoksal/earthquake_ner_models/checkpoint-200 (score: 0.21532948315143585).\n"
892
- ]
893
- },
894
- {
895
- "data": {
896
- "text/plain": [
897
- "TrainOutput(global_step=250, training_loss=0.3963502960205078, metrics={'train_runtime': 73.0701, 'train_samples_per_second': 54.674, 'train_steps_per_second': 3.421, 'total_flos': 129863927953500.0, 'train_loss': 0.3963502960205078, 'epoch': 5.0})"
898
- ]
899
- },
900
- "execution_count": 15,
901
- "metadata": {},
902
- "output_type": "execute_result"
903
- }
904
- ],
905
- "source": [
906
- "trainer.train()"
907
- ]
908
- },
909
- {
910
- "cell_type": "code",
911
- "execution_count": 16,
912
- "id": "4427c32d",
913
- "metadata": {},
914
- "outputs": [
915
- {
916
- "name": "stderr",
917
- "output_type": "stream",
918
- "text": [
919
- "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n",
920
- "***** Running Evaluation *****\n",
921
- " Num examples = 129\n",
922
- " Batch size = 32\n"
923
- ]
924
- },
925
- {
926
- "data": {
927
- "text/html": [
928
- "\n",
929
- " <div>\n",
930
- " \n",
931
- " <progress value='5' max='5' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
932
- " [5/5 00:00]\n",
933
- " </div>\n",
934
- " "
935
- ],
936
- "text/plain": [
937
- "<IPython.core.display.HTML object>"
938
- ]
939
- },
940
- "metadata": {},
941
- "output_type": "display_data"
942
- }
943
- ],
944
- "source": [
945
- "results = trainer.evaluate(tokenized_dataset[\"test\"])"
946
- ]
947
- },
948
- {
949
- "cell_type": "code",
950
- "execution_count": 24,
951
- "id": "aabbb977",
952
- "metadata": {},
953
- "outputs": [
954
- {
955
- "data": {
956
- "text/plain": [
957
- "{'eval_loss': 0.24822480976581573,\n",
958
- " 'eval_overall_precision': 0.8442211055276382,\n",
959
- " 'eval_overall_recall': 0.877742946708464,\n",
960
- " 'eval_overall_f1': 0.860655737704918,\n",
961
- " 'eval_overall_accuracy': 0.9401853411962932,\n",
962
- " 'eval_bina_f1': 0.7000000000000001,\n",
963
- " 'eval_bina_recall': 0.7424242424242424,\n",
964
- " 'eval_bina_precision': 0.6621621621621622,\n",
965
- " 'eval_bina_support': 66,\n",
966
- " 'eval_bulvar_f1': 0.9230769230769231,\n",
967
- " 'eval_bulvar_recall': 0.9230769230769231,\n",
968
- " 'eval_bulvar_precision': 0.9230769230769231,\n",
969
- " 'eval_bulvar_support': 13,\n",
970
- " 'eval_cadde_f1': 0.8067226890756302,\n",
971
- " 'eval_cadde_recall': 0.8421052631578947,\n",
972
- " 'eval_cadde_precision': 0.7741935483870968,\n",
973
- " 'eval_cadde_support': 57,\n",
974
- " 'eval_diskapino_f1': 0.7083333333333334,\n",
975
- " 'eval_diskapino_recall': 0.7285714285714285,\n",
976
- " 'eval_diskapino_precision': 0.6891891891891891,\n",
977
- " 'eval_diskapino_support': 70,\n",
978
- " 'eval_ilce_f1': 0.9218106995884773,\n",
979
- " 'eval_ilce_recall': 0.9572649572649573,\n",
980
- " 'eval_ilce_precision': 0.8888888888888888,\n",
981
- " 'eval_ilce_support': 117,\n",
982
- " 'eval_isim_f1': 0.8793103448275862,\n",
983
- " 'eval_isim_recall': 0.9026548672566371,\n",
984
- " 'eval_isim_precision': 0.8571428571428571,\n",
985
- " 'eval_isim_support': 113,\n",
986
- " 'eval_mahalle_f1': 0.7903225806451613,\n",
987
- " 'eval_mahalle_recall': 0.8166666666666667,\n",
988
- " 'eval_mahalle_precision': 0.765625,\n",
989
- " 'eval_mahalle_support': 120,\n",
990
- " 'eval_sehir_f1': 0.9724137931034483,\n",
991
- " 'eval_sehir_recall': 0.9657534246575342,\n",
992
- " 'eval_sehir_precision': 0.9791666666666666,\n",
993
- " 'eval_sehir_support': 146,\n",
994
- " 'eval_site_f1': 0.6875000000000001,\n",
995
- " 'eval_site_recall': 0.6111111111111112,\n",
996
- " 'eval_site_precision': 0.7857142857142857,\n",
997
- " 'eval_site_support': 18,\n",
998
- " 'eval_sokak_f1': 0.7301587301587302,\n",
999
- " 'eval_sokak_recall': 0.7419354838709677,\n",
1000
- " 'eval_sokak_precision': 0.71875,\n",
1001
- " 'eval_sokak_support': 62,\n",
1002
- " 'eval_soyisim_f1': 0.9441624365482234,\n",
1003
- " 'eval_soyisim_recall': 0.9489795918367347,\n",
1004
- " 'eval_soyisim_precision': 0.9393939393939394,\n",
1005
- " 'eval_soyisim_support': 98,\n",
1006
- " 'eval_telefonno_f1': 0.9935483870967742,\n",
1007
- " 'eval_telefonno_recall': 1.0,\n",
1008
- " 'eval_telefonno_precision': 0.9871794871794872,\n",
1009
- " 'eval_telefonno_support': 77,\n",
1010
- " 'eval_runtime': 0.3493,\n",
1011
- " 'eval_samples_per_second': 369.308,\n",
1012
- " 'eval_steps_per_second': 14.314,\n",
1013
- " 'epoch': 5.0}"
1014
- ]
1015
- },
1016
- "execution_count": 24,
1017
- "metadata": {},
1018
- "output_type": "execute_result"
1019
- }
1020
- ],
1021
- "source": [
1022
- "results"
1023
- ]
1024
- },
1025
- {
1026
- "cell_type": "code",
1027
- "execution_count": 18,
1028
- "id": "922a7237",
1029
- "metadata": {},
1030
- "outputs": [
1031
- {
1032
- "data": {
1033
- "text/html": [
1034
- "<div>\n",
1035
- "<style scoped>\n",
1036
- " .dataframe tbody tr th:only-of-type {\n",
1037
- " vertical-align: middle;\n",
1038
- " }\n",
1039
- "\n",
1040
- " .dataframe tbody tr th {\n",
1041
- " vertical-align: top;\n",
1042
- " }\n",
1043
- "\n",
1044
- " .dataframe thead th {\n",
1045
- " text-align: right;\n",
1046
- " }\n",
1047
- "</style>\n",
1048
- "<table border=\"1\" class=\"dataframe\">\n",
1049
- " <thead>\n",
1050
- " <tr style=\"text-align: right;\">\n",
1051
- " <th></th>\n",
1052
- " <th>support</th>\n",
1053
- " <th>precision</th>\n",
1054
- " <th>recall</th>\n",
1055
- " <th>f1</th>\n",
1056
- " <th>accuracy</th>\n",
1057
- " </tr>\n",
1058
- " </thead>\n",
1059
- " <tbody>\n",
1060
- " <tr>\n",
1061
- " <th>overall</th>\n",
1062
- " <td>957</td>\n",
1063
- " <td>0.84</td>\n",
1064
- " <td>0.88</td>\n",
1065
- " <td>0.86</td>\n",
1066
- " <td>0.94</td>\n",
1067
- " </tr>\n",
1068
- " <tr>\n",
1069
- " <th>bina</th>\n",
1070
- " <td>66</td>\n",
1071
- " <td>0.66</td>\n",
1072
- " <td>0.74</td>\n",
1073
- " <td>0.70</td>\n",
1074
- " <td>NaN</td>\n",
1075
- " </tr>\n",
1076
- " <tr>\n",
1077
- " <th>bulvar</th>\n",
1078
- " <td>13</td>\n",
1079
- " <td>0.92</td>\n",
1080
- " <td>0.92</td>\n",
1081
- " <td>0.92</td>\n",
1082
- " <td>NaN</td>\n",
1083
- " </tr>\n",
1084
- " <tr>\n",
1085
- " <th>cadde</th>\n",
1086
- " <td>57</td>\n",
1087
- " <td>0.77</td>\n",
1088
- " <td>0.84</td>\n",
1089
- " <td>0.81</td>\n",
1090
- " <td>NaN</td>\n",
1091
- " </tr>\n",
1092
- " <tr>\n",
1093
- " <th>diskapino</th>\n",
1094
- " <td>70</td>\n",
1095
- " <td>0.69</td>\n",
1096
- " <td>0.73</td>\n",
1097
- " <td>0.71</td>\n",
1098
- " <td>NaN</td>\n",
1099
- " </tr>\n",
1100
- " <tr>\n",
1101
- " <th>ilce</th>\n",
1102
- " <td>117</td>\n",
1103
- " <td>0.89</td>\n",
1104
- " <td>0.96</td>\n",
1105
- " <td>0.92</td>\n",
1106
- " <td>NaN</td>\n",
1107
- " </tr>\n",
1108
- " <tr>\n",
1109
- " <th>isim</th>\n",
1110
- " <td>113</td>\n",
1111
- " <td>0.86</td>\n",
1112
- " <td>0.90</td>\n",
1113
- " <td>0.88</td>\n",
1114
- " <td>NaN</td>\n",
1115
- " </tr>\n",
1116
- " <tr>\n",
1117
- " <th>mahalle</th>\n",
1118
- " <td>120</td>\n",
1119
- " <td>0.77</td>\n",
1120
- " <td>0.82</td>\n",
1121
- " <td>0.79</td>\n",
1122
- " <td>NaN</td>\n",
1123
- " </tr>\n",
1124
- " <tr>\n",
1125
- " <th>sehir</th>\n",
1126
- " <td>146</td>\n",
1127
- " <td>0.98</td>\n",
1128
- " <td>0.97</td>\n",
1129
- " <td>0.97</td>\n",
1130
- " <td>NaN</td>\n",
1131
- " </tr>\n",
1132
- " <tr>\n",
1133
- " <th>site</th>\n",
1134
- " <td>18</td>\n",
1135
- " <td>0.79</td>\n",
1136
- " <td>0.61</td>\n",
1137
- " <td>0.69</td>\n",
1138
- " <td>NaN</td>\n",
1139
- " </tr>\n",
1140
- " <tr>\n",
1141
- " <th>sokak</th>\n",
1142
- " <td>62</td>\n",
1143
- " <td>0.72</td>\n",
1144
- " <td>0.74</td>\n",
1145
- " <td>0.73</td>\n",
1146
- " <td>NaN</td>\n",
1147
- " </tr>\n",
1148
- " <tr>\n",
1149
- " <th>soyisim</th>\n",
1150
- " <td>98</td>\n",
1151
- " <td>0.94</td>\n",
1152
- " <td>0.95</td>\n",
1153
- " <td>0.94</td>\n",
1154
- " <td>NaN</td>\n",
1155
- " </tr>\n",
1156
- " <tr>\n",
1157
- " <th>telefonno</th>\n",
1158
- " <td>77</td>\n",
1159
- " <td>0.99</td>\n",
1160
- " <td>1.00</td>\n",
1161
- " <td>0.99</td>\n",
1162
- " <td>NaN</td>\n",
1163
- " </tr>\n",
1164
- " </tbody>\n",
1165
- "</table>\n",
1166
- "</div>"
1167
- ],
1168
- "text/plain": [
1169
- " support precision recall f1 accuracy\n",
1170
- "overall 957 0.84 0.88 0.86 0.94\n",
1171
- "bina 66 0.66 0.74 0.70 NaN\n",
1172
- "bulvar 13 0.92 0.92 0.92 NaN\n",
1173
- "cadde 57 0.77 0.84 0.81 NaN\n",
1174
- "diskapino 70 0.69 0.73 0.71 NaN\n",
1175
- "ilce 117 0.89 0.96 0.92 NaN\n",
1176
- "isim 113 0.86 0.90 0.88 NaN\n",
1177
- "mahalle 120 0.77 0.82 0.79 NaN\n",
1178
- "sehir 146 0.98 0.97 0.97 NaN\n",
1179
- "site 18 0.79 0.61 0.69 NaN\n",
1180
- "sokak 62 0.72 0.74 0.73 NaN\n",
1181
- "soyisim 98 0.94 0.95 0.94 NaN\n",
1182
- "telefonno 77 0.99 1.00 0.99 NaN"
1183
- ]
1184
- },
1185
- "execution_count": 18,
1186
- "metadata": {},
1187
- "output_type": "execute_result"
1188
- }
1189
- ],
1190
- "source": [
1191
- "structured_results = defaultdict(dict)\n",
1192
- "structured_results[\"overall\"][\"support\"]=0\n",
1193
- "for x, y in results.items():\n",
1194
- " if len(x.split(\"_\"))==3:\n",
1195
- " structured_results[x.split(\"_\")[1]][x.split(\"_\")[2]] = y\n",
1196
- " if x.split(\"_\")[2]==\"support\":\n",
1197
- " structured_results[\"overall\"][\"support\"]+=y\n",
1198
- "results_pd = pd.DataFrame(structured_results).T\n",
1199
- "results_pd.support = results_pd.support.astype(int)\n",
1200
- "results_pd.round(2)"
1201
- ]
1202
- },
1203
- {
1204
- "cell_type": "markdown",
1205
- "id": "3c3de283",
1206
- "metadata": {},
1207
- "source": [
1208
- "## Predictions"
1209
- ]
1210
- },
1211
- {
1212
- "cell_type": "code",
1213
- "execution_count": 19,
1214
- "id": "ed165edb",
1215
- "metadata": {},
1216
- "outputs": [],
1217
- "source": [
1218
- "from transformers import pipeline\n",
1219
- "nlp = pipeline(\"ner\", model=model.to(device), tokenizer=tokenizer, aggregation_strategy=\"first\", device=0 if device==\"cuda\" else -1)"
1220
- ]
1221
- },
1222
- {
1223
- "cell_type": "code",
1224
- "execution_count": 20,
1225
- "id": "0e350503",
1226
- "metadata": {},
1227
- "outputs": [],
1228
- "source": [
1229
- "# Source: https://www.thepythoncode.com/article/named-entity-recognition-using-transformers-and-spacy\n",
1230
- "def get_entities_html(text, ner_result, title=None):\n",
1231
- " \"\"\"Visualize NER with the help of SpaCy\"\"\"\n",
1232
- " ents = []\n",
1233
- " for ent in ner_result:\n",
1234
- " e = {}\n",
1235
- " # add the start and end positions of the entity\n",
1236
- " e[\"start\"] = ent[\"start\"]\n",
1237
- " e[\"end\"] = ent[\"end\"]\n",
1238
- " # add the score if you want in the label\n",
1239
- "        # e[\"label\"] = f\"{ent['entity_group']}-{ent['score']:.2f}\"\n",
1240
- " e[\"label\"] = ent[\"entity_group\"]\n",
1241
- " if ents and -1 <= ent[\"start\"] - ents[-1][\"end\"] <= 1 and ents[-1][\"label\"] == e[\"label\"]:\n",
1242
- "            # if the current entity is adjacent to (or overlaps) the previous one and has the same label,\n",
1243
- " # simply extend the entity end position instead of adding a new one\n",
1244
- " ents[-1][\"end\"] = e[\"end\"]\n",
1245
- " continue\n",
1246
- " ents.append(e)\n",
1247
- " # construct data required for displacy.render() method\n",
1248
- " render_data = [\n",
1249
- " {\n",
1250
- " \"text\": text,\n",
1251
- " \"ents\": ents,\n",
1252
- " \"title\": title,\n",
1253
- " }\n",
1254
- " ]\n",
1255
- " spacy.displacy.render(render_data, style=\"ent\", manual=True, jupyter=True)"
1256
- ]
1257
- },
1258
- {
1259
- "cell_type": "code",
1260
- "execution_count": 21,
1261
- "id": "f98a6902",
1262
- "metadata": {},
1263
- "outputs": [
1264
- {
1265
- "data": {
1266
- "text/html": [
1267
- "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">Lütfen yardım \n",
1268
- "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1269
- " Akevler\n",
1270
- " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">mahalle</span>\n",
1271
- "</mark>\n",
1272
- " mahallesi \n",
1273
- "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1274
- " Rüzgar\n",
1275
- " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">sokak</span>\n",
1276
- "</mark>\n",
1277
- " sokak \n",
1278
- "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1279
- " Tuncay\n",
1280
- " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">bina</span>\n",
1281
- "</mark>\n",
1282
- " apartmanı zemin kat \n",
1283
- "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1284
- " Antakya\n",
1285
- " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">ilce</span>\n",
1286
- "</mark>\n",
1287
- " akrabalarım göçük altında #hatay #Afad</div></span>"
1288
- ],
1289
- "text/plain": [
1290
- "<IPython.core.display.HTML object>"
1291
- ]
1292
- },
1293
- "metadata": {},
1294
- "output_type": "display_data"
1295
- }
1296
- ],
1297
- "source": [
1298
- "sentence = \"\"\"Lütfen yardım Akevler mahallesi Rüzgar sokak Tuncay apartmanı zemin kat Antakya akrabalarım göçük altında #hatay #Afad\"\"\"\n",
1299
- "\n",
1300
- "get_entities_html(sentence, nlp(sentence))"
1301
- ]
1302
- },
1303
- {
1304
- "cell_type": "code",
1305
- "execution_count": 22,
1306
- "id": "80b823ff",
1307
- "metadata": {},
1308
- "outputs": [
1309
- {
1310
- "data": {
1311
- "text/html": [
1312
- "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">\n",
1313
- "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1314
- " Kahramanmaraş\n",
1315
- " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">sehir</span>\n",
1316
- "</mark>\n",
1317
- " \n",
1318
- "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1319
- " merkez\n",
1320
- " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">ilce</span>\n",
1321
- "</mark>\n",
1322
- " \n",
1323
- "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1324
- " Şazibey\n",
1325
- " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">mahalle</span>\n",
1326
- "</mark>\n",
1327
- " Mahallesi \n",
1328
- "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1329
- " Ebrar\n",
1330
- " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">site</span>\n",
1331
- "</mark>\n",
1332
- " Sitesi \n",
1333
- "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1334
- " Z\n",
1335
- " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">bina</span>\n",
1336
- "</mark>\n",
1337
- " blok arka tarafı için acil en az 150 tonluk vinç lazım lütfen paylaşır mısınız</div></span>"
1338
- ],
1339
- "text/plain": [
1340
- "<IPython.core.display.HTML object>"
1341
- ]
1342
- },
1343
- "metadata": {},
1344
- "output_type": "display_data"
1345
- }
1346
- ],
1347
- "source": [
1348
- "sentence = \" \".join(dataset[\"train\"][433][\"tokens\"])\n",
1349
- "get_entities_html(sentence, nlp(sentence))"
1350
- ]
1351
- }
1352
- ],
1353
- "metadata": {
1354
- "kernelspec": {
1355
- "display_name": "Python 3 (ipykernel)",
1356
- "language": "python",
1357
- "name": "python3"
1358
- },
1359
- "language_info": {
1360
- "codemirror_mode": {
1361
- "name": "ipython",
1362
- "version": 3
1363
- },
1364
- "file_extension": ".py",
1365
- "mimetype": "text/x-python",
1366
- "name": "python",
1367
- "nbconvert_exporter": "python",
1368
- "pygments_lexer": "ipython3",
1369
- "version": "3.9.12"
1370
- }
1371
- },
1372
- "nbformat": 4,
1373
- "nbformat_minor": 5
1374
- }