philipp-zettl committed on
Commit
9ca47ca
·
verified ·
1 Parent(s): f4ce594

Update spaCy pipeline

Browse files
Files changed (9) hide show
  1. .gitattributes +1 -0
  2. README.md +12 -80
  3. config.cfg +25 -36
  4. meta.json +3 -27
  5. ner/model +0 -0
  6. ner/moves +1 -1
  7. tok2vec/model +2 -2
  8. vocab/strings.json +0 -0
  9. xx_eb_ner-any-py3-none-any.whl +2 -2
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tok2vec/model filter=lfs diff=lfs merge=lfs -text
37
  xx_eb_ner-any-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
 
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tok2vec/model filter=lfs diff=lfs merge=lfs -text
37
  xx_eb_ner-any-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
38
+ ner/model filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -4,51 +4,18 @@ tags:
4
  - token-classification
5
  language:
6
  - multilingual
7
- widget:
8
- - text: I'm looking for courses in machine learning.
9
- example_title: Course example
10
- - text: Can you send me some sales jobs?
11
- example_title: Job example
12
- - text: I'm from Berlin, Germany. Can you recommend courses to me?
13
- example_title: Location example
14
- - text: >-
15
- Next month I will be moving to London. I'm looking for software development
16
- jobs there. And can you recommend any language courses so I can meet new
17
- people?
18
- example_title: Mixed example
19
- model-index:
20
- - name: xx_eb_ner
21
- results:
22
- - task:
23
- name: NER
24
- type: token-classification
25
- metrics:
26
- - name: NER Precision
27
- type: precision
28
- value: 0.99471974
29
- - name: NER Recall
30
- type: recall
31
- value: 0.9937070263
32
- - name: NER F Score
33
- type: f_score
34
- value: 0.9942131253
35
- license: mit
36
- library_name: spacy
37
  ---
38
-
39
-
40
- | Feature | Description |
41
- | --- | --- |
42
- | **Name** | `xx_eb_ner` |
43
- | **Version** | `0.2.0` |
44
- | **spaCy** | `>=3.6.1,<3.7.0` |
45
- | **Default Pipeline** | `tok2vec`, `ner` |
46
- | **Components** | `tok2vec`, `ner` |
47
- | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
48
- | **Sources** | n/a |
49
- | **License** | mit |
50
- | **Author** | [philipp-zettl](https://huggingface.co/philipp-zettl) |
51
-
52
 
53
  ### Label Scheme
54
 
@@ -60,39 +27,4 @@ library_name: spacy
60
  | --- | --- |
61
  | **`ner`** | `COURSE_NAME`, `JOB_TITLE`, `LOCATION` |
62
 
63
- </details>
64
-
65
- ### Accuracy
66
-
67
- | Type | Score |
68
- | --- | --- |
69
- | `ENTS_F` | 99.54 |
70
- | `ENTS_P` | 99.56 |
71
- | `ENTS_R` | 99.52 |
72
- | `TOK2VEC_LOSS` | 35345.94 |
73
- | `NER_LOSS` | 32265.61 |
74
-
75
-
76
-
77
- ### Usage
78
- Install the model via pip:
79
- ```shell
80
- pip install https://huggingface.co/philipp-zettl/xx_eb_ner/resolve/main/xx_eb_ner-any-py3-none-any.whl
81
- ```
82
-
83
- For specific versions, please use the commits provided in the [source repository](https://huggingface.co/philipp-zettl/xx_eb_ner).
84
- Example: version v0.2.0
85
- ```shell
86
- pip install https://huggingface.co/philipp-zettl/xx_eb_ner/resolve/v0.2.0/xx_eb_ner-any-py3-none-any.whl
87
- ```
88
-
89
- After installing the model with its dependencies, you can use it like any other spaCy model:
90
- ```python
91
- # Using spacy.load().
92
- import spacy
93
- nlp = spacy.load("xx_eb_ner")
94
-
95
- # Importing as module.
96
- import xx_eb_ner
97
- nlp = xx_eb_ner.load()
98
- ```
 
4
  - token-classification
5
  language:
6
  - multilingual
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  ---
8
+ | Feature | Description |
9
+ | --- | --- |
10
+ | **Name** | `xx_eb_ner` |
11
+ | **Version** | `0.2.1` |
12
+ | **spaCy** | `>=3.7.4,<3.8.0` |
13
+ | **Default Pipeline** | `tok2vec`, `ner` |
14
+ | **Components** | `tok2vec`, `ner` |
15
+ | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
16
+ | **Sources** | n/a |
17
+ | **License** | n/a |
18
+ | **Author** | n/a |
 
 
 
19
 
20
  ### Label Scheme
21
 
 
27
  | --- | --- |
28
  | **`ner`** | `COURSE_NAME`, `JOB_TITLE`, `LOCATION` |
29
 
30
+ </details>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config.cfg CHANGED
@@ -5,18 +5,19 @@ vectors = null
5
  init_tok2vec = null
6
 
7
  [system]
8
- gpu_allocator = null
9
  seed = 0
10
 
11
  [nlp]
12
  lang = "xx"
13
  pipeline = ["tok2vec","ner"]
14
- tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
15
  disabled = []
16
  before_creation = null
17
  after_creation = null
18
  after_pipeline_creation = null
19
  batch_size = 1000
 
 
20
 
21
  [components]
22
 
@@ -37,51 +38,47 @@ use_upper = true
37
  nO = null
38
 
39
  [components.ner.model.tok2vec]
40
- @architectures = "spacy.Tok2VecListener.v1"
41
- width = ${components.tok2vec.model.encode.width}
42
- upstream = "*"
 
 
 
 
 
43
 
44
  [components.tok2vec]
45
  factory = "tok2vec"
46
 
47
  [components.tok2vec.model]
48
- @architectures = "spacy.Tok2Vec.v2"
49
-
50
- [components.tok2vec.model.embed]
51
- @architectures = "spacy.MultiHashEmbed.v2"
52
- width = ${components.tok2vec.model.encode.width}
53
- attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
54
- rows = [5000,1000,2500,2500]
55
- include_static_vectors = true
56
-
57
- [components.tok2vec.model.encode]
58
- @architectures = "spacy.MaxoutWindowEncoder.v2"
59
- width = 256
60
- depth = 8
61
  window_size = 1
62
  maxout_pieces = 3
 
63
 
64
  [corpora]
65
 
66
  [corpora.dev]
67
  @readers = "spacy.Corpus.v1"
68
  path = ${paths.dev}
69
- max_length = 0
70
  gold_preproc = false
 
71
  limit = 0
72
  augmenter = null
73
 
74
  [corpora.train]
75
  @readers = "spacy.Corpus.v1"
76
  path = ${paths.train}
77
- max_length = 0
78
  gold_preproc = false
 
79
  limit = 0
80
  augmenter = null
81
 
82
  [training]
83
- dev_corpus = "corpora.dev"
84
- train_corpus = "corpora.train"
85
  seed = ${system.seed}
86
  gpu_allocator = ${system.gpu_allocator}
87
  dropout = 0.1
@@ -92,28 +89,25 @@ max_steps = 20000
92
  eval_frequency = 200
93
  frozen_components = []
94
  annotating_components = []
 
 
95
  before_to_disk = null
96
  before_update = null
97
 
98
  [training.batcher]
99
  @batchers = "spacy.batch_by_words.v1"
 
100
  discard_oversize = false
101
  tolerance = 0.2
102
  get_length = null
103
 
104
- [training.batcher.size]
105
- @schedules = "compounding.v1"
106
- start = 100
107
- stop = 1000
108
- compound = 1.001
109
- t = 0.0
110
-
111
  [training.logger]
112
  @loggers = "spacy.ConsoleLogger.v1"
113
  progress_bar = false
114
 
115
  [training.optimizer]
116
  @optimizers = "Adam.v1"
 
117
  beta1 = 0.9
118
  beta2 = 0.999
119
  L2_is_weight_decay = true
@@ -121,7 +115,6 @@ L2 = 0.01
121
  grad_clip = 1.0
122
  use_averages = false
123
  eps = 0.00000001
124
- learn_rate = 0.001
125
 
126
  [training.score_weights]
127
  ents_f = 1.0
@@ -132,17 +125,13 @@ ents_per_type = null
132
  [pretraining]
133
 
134
  [initialize]
135
- vectors = ${paths.vectors}
136
  init_tok2vec = ${paths.init_tok2vec}
137
  vocab_data = null
 
138
  before_init = null
139
  after_init = null
140
 
141
  [initialize.components]
142
 
143
- [initialize.lookups]
144
- @misc = "spacy.LookupsDataLoader.v1"
145
- lang = ${nlp.lang}
146
- tables = []
147
-
148
  [initialize.tokenizer]
 
5
  init_tok2vec = null
6
 
7
  [system]
8
+ # Use GPU memory management, if available
+ gpu_allocator = "pytorch"
9
  seed = 0
10
 
11
  [nlp]
12
  lang = "xx"
13
  pipeline = ["tok2vec","ner"]
 
14
  disabled = []
15
  before_creation = null
16
  after_creation = null
17
  after_pipeline_creation = null
18
  batch_size = 1000
19
+ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
20
+ vectors = {"@vectors":"spacy.Vectors.v1"}
21
 
22
  [components]
23
 
 
38
  nO = null
39
 
40
  [components.ner.model.tok2vec]
41
+ @architectures = "spacy.HashEmbedCNN.v2"
42
+ pretrained_vectors = null
43
+ width = 96
44
+ depth = 4
45
+ embed_size = 2000
46
+ window_size = 1
47
+ maxout_pieces = 3
48
+ subword_features = true
49
 
50
  [components.tok2vec]
51
  factory = "tok2vec"
52
 
53
  [components.tok2vec.model]
54
+ @architectures = "spacy.HashEmbedCNN.v2"
55
+ pretrained_vectors = null
56
+ width = 96
57
+ depth = 4
58
+ embed_size = 2000
 
 
 
 
 
 
 
 
59
  window_size = 1
60
  maxout_pieces = 3
61
+ subword_features = true
62
 
63
  [corpora]
64
 
65
  [corpora.dev]
66
  @readers = "spacy.Corpus.v1"
67
  path = ${paths.dev}
 
68
  gold_preproc = false
69
+ max_length = 0
70
  limit = 0
71
  augmenter = null
72
 
73
  [corpora.train]
74
  @readers = "spacy.Corpus.v1"
75
  path = ${paths.train}
 
76
  gold_preproc = false
77
+ max_length = 0
78
  limit = 0
79
  augmenter = null
80
 
81
  [training]
 
 
82
  seed = ${system.seed}
83
  gpu_allocator = ${system.gpu_allocator}
84
  dropout = 0.1
 
89
  eval_frequency = 200
90
  frozen_components = []
91
  annotating_components = []
92
+ dev_corpus = "corpora.dev"
93
+ train_corpus = "corpora.train"
94
  before_to_disk = null
95
  before_update = null
96
 
97
  [training.batcher]
98
  @batchers = "spacy.batch_by_words.v1"
99
+ size = 1000
100
  discard_oversize = false
101
  tolerance = 0.2
102
  get_length = null
103
 
 
 
 
 
 
 
 
104
  [training.logger]
105
  @loggers = "spacy.ConsoleLogger.v1"
106
  progress_bar = false
107
 
108
  [training.optimizer]
109
  @optimizers = "Adam.v1"
110
+ learn_rate = 0.001
111
  beta1 = 0.9
112
  beta2 = 0.999
113
  L2_is_weight_decay = true
 
115
  grad_clip = 1.0
116
  use_averages = false
117
  eps = 0.00000001
 
118
 
119
  [training.score_weights]
120
  ents_f = 1.0
 
125
  [pretraining]
126
 
127
  [initialize]
128
+ vectors = null
129
  init_tok2vec = ${paths.init_tok2vec}
130
  vocab_data = null
131
+ lookups = null
132
  before_init = null
133
  after_init = null
134
 
135
  [initialize.components]
136
 
 
 
 
 
 
137
  [initialize.tokenizer]
meta.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
  "lang":"xx",
3
  "name":"eb_ner",
4
- "version":"0.2.0",
5
  "description":"",
6
  "author":"",
7
  "email":"",
8
  "url":"",
9
  "license":"",
10
- "spacy_version":">=3.6.1,<3.7.0",
11
- "spacy_git_version":"458bc5f45",
12
  "vectors":{
13
  "width":0,
14
  "vectors":0,
@@ -36,30 +36,6 @@
36
  "disabled":[
37
 
38
  ],
39
- "performance":{
40
- "ents_f":0.9954006007,
41
- "ents_p":0.9956087463,
42
- "ents_r":0.9951925422,
43
- "ents_per_type":{
44
- "COURSE_NAME":{
45
- "p":0.997066476,
46
- "r":0.9973899387,
47
- "f":0.9972281811
48
- },
49
- "LOCATION":{
50
- "p":0.9958545643,
51
- "r":0.9980093474,
52
- "f":0.9969307915
53
- },
54
- "JOB_TITLE":{
55
- "p":0.9908607864,
56
- "r":0.9827357238,
57
- "f":0.9867815301
58
- }
59
- },
60
- "tok2vec_loss":353.4594166442,
61
- "ner_loss":322.656127662
62
- },
63
  "requirements":[
64
 
65
  ]
 
1
  {
2
  "lang":"xx",
3
  "name":"eb_ner",
4
+ "version":"0.2.1",
5
  "description":"",
6
  "author":"",
7
  "email":"",
8
  "url":"",
9
  "license":"",
10
+ "spacy_version":">=3.7.4,<3.8.0",
11
+ "spacy_git_version":"bff8725f4",
12
  "vectors":{
13
  "width":0,
14
  "vectors":0,
 
36
  "disabled":[
37
 
38
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  "requirements":[
40
 
41
  ]
ner/model CHANGED
Binary files a/ner/model and b/ner/model differ
 
ner/moves CHANGED
@@ -1 +1 @@
1
- ��moves�{"0":{},"1":{"COURSE_NAME":490453,"JOB_TITLE":219750,"LOCATION":200713},"2":{"COURSE_NAME":490453,"JOB_TITLE":219750,"LOCATION":200713},"3":{"COURSE_NAME":490453,"JOB_TITLE":219750,"LOCATION":200713},"4":{"COURSE_NAME":490453,"JOB_TITLE":219750,"LOCATION":200713,"":1},"5":{"":1}}�cfg��neg_key�
 
1
+ ��moves�${"0":{},"1":{"COURSE_NAME":1955153,"JOB_TITLE":1206960,"LOCATION":1154534},"2":{"COURSE_NAME":1955153,"JOB_TITLE":1206960,"LOCATION":1154534},"3":{"COURSE_NAME":1955153,"JOB_TITLE":1206960,"LOCATION":1154534},"4":{"COURSE_NAME":1955153,"JOB_TITLE":1206960,"LOCATION":1154534,"":1},"5":{"":1}}�cfg��neg_key�
tok2vec/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c120186477af10d01381b8c0de2eafbb21ceac1e75e1ce3fafcb07733588b02
3
- size 34126801
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c887bb91d8fb38fbfdf3dbf994d61cfb54b7ce900b89bec0e6c92b8f023f7ee7
3
+ size 3705091
vocab/strings.json CHANGED
The diff for this file is too large to render. See raw diff
 
xx_eb_ner-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:83e0f69bc0c92307bd6dcb402a148b47f2e94035351ea480fc878e0c1a351d45
3
- size 31846488
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38309fa0b6607c4d8b130659210e2cd400241ce588039e6fede3efbbd4ab1912
3
+ size 7796938