Tanor committed on
Commit 2dcbbd2 · verified · 1 Parent(s): a9d514a

Update spaCy pipeline

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ sr_ner_tesla_bbmc-any-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
+ transformer/model filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,59 @@
+ ---
+ tags:
+ - spacy
+ - token-classification
+ language:
+ - sr
+ license: cc-by-sa-3.0
+ model-index:
+ - name: sr_ner_tesla_bbmc
+   results:
+   - task:
+       name: NER
+       type: token-classification
+     metrics:
+     - name: NER Precision
+       type: precision
+       value: 0.9463341916
+     - name: NER Recall
+       type: recall
+       value: 0.944797727
+     - name: NER F Score
+       type: f_score
+       value: 0.9455653351
+ ---
+ sr_ner_tesla_bbmc is a spaCy pipeline fine-tuned for Named Entity Recognition in Serbian-language texts. It adds a transformer layer based on bert-base-multilingual-cased and recognizes 7 entity categories: PERS (persons), ROLE (professions), DEMO (demonyms), ORG (organizations), LOC (locations), WORK (artworks), and EVENT (events); the label scheme and accuracy are summarized in the tables below. Development of this model was supported by the Science Fund of the Republic of Serbia under grant #7276, project 'Text Embeddings - Serbian Language Applications - TESLA'.
+
+ | Feature | Description |
+ | --- | --- |
+ | **Name** | `sr_ner_tesla_bbmc` |
+ | **Version** | `1.0.0` |
+ | **spaCy** | `>=3.7.2,<3.8.0` |
+ | **Default Pipeline** | `transformer`, `ner` |
+ | **Components** | `transformer`, `ner` |
+ | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
+ | **Sources** | n/a |
+ | **License** | `CC BY-SA 3.0` |
+ | **Author** | [Milica Ikonić Nešić, Saša Petalinkar, Mihailo Škorić, Ranka Stanković](https://tesla.rgf.bg.ac.rs/) |
+
+ ### Label Scheme
+
+ <details>
+
+ <summary>View label scheme (7 labels for 1 component)</summary>
+
+ | Component | Labels |
+ | --- | --- |
+ | **`ner`** | `DEMO`, `EVENT`, `LOC`, `ORG`, `PERS`, `ROLE`, `WORK` |
+
+ </details>
+
+ ### Accuracy
+
+ | Type | Score |
+ | --- | --- |
+ | `ENTS_F` | 94.56 |
+ | `ENTS_P` | 94.63 |
+ | `ENTS_R` | 94.48 |
+ | `TRANSFORMER_LOSS` | 140356.48 |
+ | `NER_LOSS` | 318152.41 |
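
For orientation, here is a minimal usage sketch (not part of the committed files). It assumes the packaged wheel shipped in this repo has been installed, so the pipeline can be loaded by its package name; the example sentence and printed labels are illustrative only.

```python
# Minimal usage sketch, assuming the wheel shipped in this repo was installed first:
#   pip install sr_ner_tesla_bbmc-any-py3-none-any.whl
import spacy

# Load the packaged pipeline by its registered name.
nlp = spacy.load("sr_ner_tesla_bbmc")

# Illustrative Serbian sentence ("Nikola Tesla was born in Smiljan and later worked in New York").
doc = nlp("Никола Тесла је рођен у Смиљану, а касније је радио у Њујорку.")

for ent in doc.ents:
    # Labels come from the 7-tag scheme above: PERS, ROLE, DEMO, ORG, LOC, WORK, EVENT.
    print(ent.text, ent.label_)
```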
config.cfg ADDED
@@ -0,0 +1,147 @@
+ [paths]
+ train = "./train.spacy"
+ dev = "./dev.spacy"
+ vectors = null
+ init_tok2vec = null
+
+ [system]
+ gpu_allocator = "pytorch"
+ seed = 0
+
+ [nlp]
+ lang = "sr"
+ pipeline = ["transformer","ner"]
+ batch_size = 128
+ disabled = []
+ before_creation = null
+ after_creation = null
+ after_pipeline_creation = null
+ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+ vectors = {"@vectors":"spacy.Vectors.v1"}
+
+ [components]
+
+ [components.ner]
+ factory = "ner"
+ incorrect_spans_key = null
+ moves = null
+ scorer = {"@scorers":"spacy.ner_scorer.v1"}
+ update_with_oracle_cut_size = 100
+
+ [components.ner.model]
+ @architectures = "spacy.TransitionBasedParser.v2"
+ state_type = "ner"
+ extra_state_tokens = false
+ hidden_width = 64
+ maxout_pieces = 2
+ use_upper = false
+ nO = null
+
+ [components.ner.model.tok2vec]
+ @architectures = "spacy-transformers.TransformerListener.v1"
+ grad_factor = 1.0
+ pooling = {"@layers":"reduce_mean.v1"}
+ upstream = "*"
+
+ [components.transformer]
+ factory = "transformer"
+ max_batch_items = 4096
+ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
+
+ [components.transformer.model]
+ @architectures = "spacy-transformers.TransformerModel.v3"
+ name = "bert-base-multilingual-uncased"
+ mixed_precision = false
+
+ [components.transformer.model.get_spans]
+ @span_getters = "spacy-transformers.strided_spans.v1"
+ window = 128
+ stride = 96
+
+ [components.transformer.model.grad_scaler_config]
+
+ [components.transformer.model.tokenizer_config]
+ use_fast = true
+
+ [components.transformer.model.transformer_config]
+
+ [corpora]
+
+ [corpora.dev]
+ @readers = "spacy.Corpus.v1"
+ path = ${paths.dev}
+ max_length = 0
+ gold_preproc = false
+ limit = 0
+ augmenter = null
+
+ [corpora.train]
+ @readers = "spacy.Corpus.v1"
+ path = ${paths.train}
+ max_length = 0
+ gold_preproc = false
+ limit = 0
+ augmenter = null
+
+ [training]
+ accumulate_gradient = 3
+ dev_corpus = "corpora.dev"
+ train_corpus = "corpora.train"
+ seed = ${system.seed}
+ gpu_allocator = ${system.gpu_allocator}
+ dropout = 0.1
+ patience = 1600
+ max_epochs = 0
+ max_steps = 20000
+ eval_frequency = 200
+ frozen_components = []
+ annotating_components = []
+ before_to_disk = null
+ before_update = null
+
+ [training.batcher]
+ @batchers = "spacy.batch_by_padded.v1"
+ discard_oversize = true
+ size = 2000
+ buffer = 256
+ get_length = null
+
+ [training.logger]
+ @loggers = "spacy.ConsoleLogger.v1"
+ progress_bar = false
+
+ [training.optimizer]
+ @optimizers = "Adam.v1"
+ beta1 = 0.9
+ beta2 = 0.999
+ L2_is_weight_decay = true
+ L2 = 0.01
+ grad_clip = 1.0
+ use_averages = false
+ eps = 0.00000001
+
+ [training.optimizer.learn_rate]
+ @schedules = "warmup_linear.v1"
+ warmup_steps = 250
+ total_steps = 20000
+ initial_rate = 0.00005
+
+ [training.score_weights]
+ ents_f = 1.0
+ ents_p = 0.0
+ ents_r = 0.0
+ ents_per_type = null
+
+ [pretraining]
+
+ [initialize]
+ vectors = ${paths.vectors}
+ init_tok2vec = ${paths.init_tok2vec}
+ vocab_data = null
+ lookups = null
+ before_init = null
+ after_init = null
+
+ [initialize.components]
+
+ [initialize.tokenizer]
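
The config above is a standard spaCy 3 training configuration: a `transformer` component (TransformerModel.v3) feeding a transition-based `ner` head through a TransformerListener, with padded batching and a warmup-linear Adam schedule. A hedged sketch of how such a config is typically used to retrain the pipeline follows; the `./train.spacy` and `./dev.spacy` corpora referenced in `[paths]` are not shipped in this repo, and the output directory name is arbitrary.

```python
# Retraining sketch, assuming spacy and spacy-transformers are installed and
# that ./train.spacy and ./dev.spacy exist as serialized DocBin corpora
# (these data files are not part of this repository).
from spacy.cli.train import train

train(
    "config.cfg",
    output_path="./output",   # model-best / model-last are written here
    use_gpu=0,                # the config sets gpu_allocator = "pytorch"
    overrides={
        "paths.train": "./train.spacy",
        "paths.dev": "./dev.spacy",
    },
)
```

The equivalent CLI call would be `python -m spacy train config.cfg --output ./output --gpu-id 0`.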
meta.json ADDED
@@ -0,0 +1,90 @@
+ {
+   "lang":"sr",
+   "name":"ner_tesla_bbmc",
+   "version":"1.0.0",
+   "description":"sr_ner_tesla_bbmc is a spaCy model meticulously fine-tuned for Named Entity Recognition in Serbian language texts. This advanced model incorporates a transformer layer based on bert-base-multilingual-cased, enhancing its analytical capabilities. It is proficient in identifying 7 distinct categories of entities: PERS (persons), ROLE (professions), DEMO (demonyms), ORG (organizations), LOC (locations), WORK (artworks), and EVENT (events). Detailed information about these categories is available in the accompanying table. The development of this model has been made possible through the support of the Science Fund of the Republic of Serbia, under grant #7276, for the project 'Text Embeddings - Serbian Language Applications - TESLA'.",
+   "author":"Milica Ikoni\u0107 Ne\u0161i\u0107, Sa\u0161a Petalinkar, Mihailo \u0160kori\u0107, Ranka Stankovi\u0107",
+   "email":"",
+   "url":"https://tesla.rgf.bg.ac.rs/",
+   "license":"CC BY-SA 3.0",
+   "spacy_version":">=3.7.2,<3.8.0",
+   "spacy_git_version":"a89eae928",
+   "vectors":{
+     "width":0,
+     "vectors":0,
+     "keys":0,
+     "name":null
+   },
+   "labels":{
+     "transformer":[
+
+     ],
+     "ner":[
+       "DEMO",
+       "EVENT",
+       "LOC",
+       "ORG",
+       "PERS",
+       "ROLE",
+       "WORK"
+     ]
+   },
+   "pipeline":[
+     "transformer",
+     "ner"
+   ],
+   "components":[
+     "transformer",
+     "ner"
+   ],
+   "disabled":[
+
+   ],
+   "performance":{
+     "ents_f":0.9455653351,
+     "ents_p":0.9463341916,
+     "ents_r":0.944797727,
+     "ents_per_type":{
+       "ROLE":{
+         "p":0.8529784537,
+         "r":0.8837820092,
+         "f":0.8681070622
+       },
+       "PERS":{
+         "p":0.9752672135,
+         "r":0.9814884303,
+         "f":0.9783679322
+       },
+       "LOC":{
+         "p":0.9577316881,
+         "r":0.9615181866,
+         "f":0.9596212022
+       },
+       "DEMO":{
+         "p":0.9162640902,
+         "r":0.8960629921,
+         "f":0.9060509554
+       },
+       "ORG":{
+         "p":0.7981790592,
+         "r":0.718579235,
+         "f":0.7562904385
+       },
+       "WORK":{
+         "p":0.72,
+         "r":0.2535211268,
+         "f":0.375
+       },
+       "EVENT":{
+         "p":0.6,
+         "r":0.375,
+         "f":0.4615384615
+       }
+     },
+     "transformer_loss":1403.5648144565,
+     "ner_loss":3181.5240740386
+   },
+   "requirements":[
+     "spacy-transformers>=1.3.4,<1.4.0"
+   ]
+ }
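
The `performance.ents_per_type` block shows that PERS and LOC are strong (F ≈ 0.98 and 0.96) while WORK and EVENT are much weaker (F ≈ 0.38 and 0.46). These scores travel with the pipeline and can be read back at runtime via `nlp.meta`; a small sketch, assuming the package is installed:

```python
# Sketch: print the packaged per-entity evaluation scores from meta.json,
# assuming the sr_ner_tesla_bbmc package is installed.
import spacy

nlp = spacy.load("sr_ner_tesla_bbmc")
per_type = nlp.meta["performance"]["ents_per_type"]

for label, scores in sorted(per_type.items()):
    print(f"{label:5s}  P={scores['p']:.3f}  R={scores['r']:.3f}  F={scores['f']:.3f}")
```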
ner/cfg ADDED
@@ -0,0 +1,13 @@
+ {
+   "moves":null,
+   "update_with_oracle_cut_size":100,
+   "multitasks":[
+
+   ],
+   "min_action_freq":1,
+   "learn_tokens":false,
+   "beam_width":1,
+   "beam_density":0.0,
+   "beam_update_prob":0.0,
+   "incorrect_spans_key":null
+ }
ner/model ADDED
Binary file (245 kB).
ner/moves ADDED
@@ -0,0 +1 @@
+ moves {"0":{},"1":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546},"2":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546},"3":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546},"4":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546,"":1},"5":{"":1}} cfg neg_key
sr_ner_tesla_bbmc-any-py3-none-any.whl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a7ae21f1cea6e318111607c04402eb50a570306c2a72dea2f1c3d6ab71c45460
+ size 623299186
tokenizer ADDED
Binary file (32.6 kB).
transformer/cfg ADDED
@@ -0,0 +1,3 @@
+ {
+   "max_batch_items":4096
+ }
transformer/model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:33c92e0c13afcbdcc7526b4f58a015beb8f4e9a34830258408e5807f3673cf63
+ size 672945736
vocab/key2row ADDED
@@ -0,0 +1 @@
+
vocab/lookups.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76be8b528d0075f7aae98d6fa57a6d3c83ae480a8469e668d7b0af968995ac71
+ size 1
vocab/strings.json ADDED
The diff for this file is too large to render.
vocab/vectors ADDED
Binary file (128 Bytes).
vocab/vectors.cfg ADDED
@@ -0,0 +1,3 @@
+ {
+   "mode":"default"
+ }