adrianeboyd committed on
Commit
360106f
1 Parent(s): cbd9cb3

Update spaCy pipeline

Browse files
README.md CHANGED
@@ -14,27 +14,27 @@ model-index:
14
  metrics:
15
  - name: NER Precision
16
  type: precision
17
- value: 0.8356510746
18
  - name: NER Recall
19
  type: recall
20
- value: 0.8314465409
21
  - name: NER F Score
22
  type: f_score
23
- value: 0.8335435057
24
  - task:
25
  name: TAG
26
  type: token-classification
27
  metrics:
28
  - name: TAG (XPOS) Accuracy
29
  type: accuracy
30
- value: 0.9713305562
31
  - task:
32
  name: POS
33
  type: token-classification
34
  metrics:
35
  - name: POS (UPOS) Accuracy
36
  type: accuracy
37
- value: 0.9798190675
38
  - task:
39
  name: MORPH
40
  type: token-classification
@@ -48,38 +48,38 @@ model-index:
48
  metrics:
49
  - name: Lemma Accuracy
50
  type: accuracy
51
- value: 0.9670526831
52
  - task:
53
  name: UNLABELED_DEPENDENCIES
54
  type: token-classification
55
  metrics:
56
  - name: Unlabeled Attachment Score (UAS)
57
  type: f_score
58
- value: 0.9311959654
59
  - task:
60
  name: LABELED_DEPENDENCIES
61
  type: token-classification
62
  metrics:
63
  - name: Labeled Attachment Score (LAS)
64
  type: f_score
65
- value: 0.9202934425
66
  - task:
67
  name: SENTS
68
  type: token-classification
69
  metrics:
70
  - name: Sentences F-Score
71
  type: f_score
72
- value: 0.9639727361
73
  ---
74
  ### Details: https://spacy.io/models/ja#ja_core_news_trf
75
 
76
- Japanese transformer pipeline (cl-tohoku/bert-base-japanese-char-v2). Components: transformer, morphologizer, parser, ner.
77
 
78
  | Feature | Description |
79
  | --- | --- |
80
  | **Name** | `ja_core_news_trf` |
81
- | **Version** | `3.6.1` |
82
- | **spaCy** | `>=3.6.0,<3.7.0` |
83
  | **Default Pipeline** | `transformer`, `morphologizer`, `parser`, `attribute_ruler`, `ner` |
84
  | **Components** | `transformer`, `morphologizer`, `parser`, `attribute_ruler`, `ner` |
85
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
@@ -106,21 +106,21 @@ Japanese transformer pipeline (cl-tohoku/bert-base-japanese-char-v2). Components
106
  | Type | Score |
107
  | --- | --- |
108
  | `TOKEN_ACC` | 99.37 |
109
- | `TOKEN_P` | 97.65 |
110
- | `TOKEN_R` | 97.90 |
111
- | `TOKEN_F` | 97.77 |
112
- | `POS_ACC` | 97.98 |
113
  | `MORPH_ACC` | 0.00 |
114
  | `MORPH_MICRO_P` | 34.01 |
115
  | `MORPH_MICRO_R` | 98.04 |
116
  | `MORPH_MICRO_F` | 50.51 |
117
- | `SENTS_P` | 95.19 |
118
- | `SENTS_R` | 97.63 |
119
- | `SENTS_F` | 96.40 |
120
- | `DEP_UAS` | 93.12 |
121
- | `DEP_LAS` | 92.03 |
122
  | `TAG_ACC` | 97.13 |
123
- | `LEMMA_ACC` | 96.71 |
124
- | `ENTS_P` | 83.57 |
125
- | `ENTS_R` | 83.14 |
126
- | `ENTS_F` | 83.35 |
 
14
  metrics:
15
  - name: NER Precision
16
  type: precision
17
+ value: 0.8227383863
18
  - name: NER Recall
19
  type: recall
20
+ value: 0.8465408805
21
  - name: NER F Score
22
  type: f_score
23
+ value: 0.8344699318
24
  - task:
25
  name: TAG
26
  type: token-classification
27
  metrics:
28
  - name: TAG (XPOS) Accuracy
29
  type: accuracy
30
+ value: 0.9713282143
31
  - task:
32
  name: POS
33
  type: token-classification
34
  metrics:
35
  - name: POS (UPOS) Accuracy
36
  type: accuracy
37
+ value: 0.979409718
38
  - task:
39
  name: MORPH
40
  type: token-classification
 
48
  metrics:
49
  - name: Lemma Accuracy
50
  type: accuracy
51
+ value: 0.9670499959
52
  - task:
53
  name: UNLABELED_DEPENDENCIES
54
  type: token-classification
55
  metrics:
56
  - name: Unlabeled Attachment Score (UAS)
57
  type: f_score
58
+ value: 0.9304880245
59
  - task:
60
  name: LABELED_DEPENDENCIES
61
  type: token-classification
62
  metrics:
63
  - name: Labeled Attachment Score (LAS)
64
  type: f_score
65
+ value: 0.9178365731
66
  - task:
67
  name: SENTS
68
  type: token-classification
69
  metrics:
70
  - name: Sentences F-Score
71
  type: f_score
72
+ value: 0.9507246377
73
  ---
74
  ### Details: https://spacy.io/models/ja#ja_core_news_trf
75
 
76
+ Japanese transformer pipeline (Transformer(name='cl-tohoku/bert-base-japanese-char-v2', piece_encoder='char', stride=160, type='bert', width=768, window=216, vocab_size=6144)). Components: transformer, morphologizer, parser, ner.
77
 
78
  | Feature | Description |
79
  | --- | --- |
80
  | **Name** | `ja_core_news_trf` |
81
+ | **Version** | `3.7.2` |
82
+ | **spaCy** | `>=3.7.0,<3.8.0` |
83
  | **Default Pipeline** | `transformer`, `morphologizer`, `parser`, `attribute_ruler`, `ner` |
84
  | **Components** | `transformer`, `morphologizer`, `parser`, `attribute_ruler`, `ner` |
85
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
 
106
  | Type | Score |
107
  | --- | --- |
108
  | `TOKEN_ACC` | 99.37 |
109
+ | `TOKEN_P` | 97.64 |
110
+ | `TOKEN_R` | 97.88 |
111
+ | `TOKEN_F` | 97.76 |
112
+ | `POS_ACC` | 97.94 |
113
  | `MORPH_ACC` | 0.00 |
114
  | `MORPH_MICRO_P` | 34.01 |
115
  | `MORPH_MICRO_R` | 98.04 |
116
  | `MORPH_MICRO_F` | 50.51 |
117
+ | `SENTS_P` | 93.18 |
118
+ | `SENTS_R` | 97.04 |
119
+ | `SENTS_F` | 95.07 |
120
+ | `DEP_UAS` | 93.05 |
121
+ | `DEP_LAS` | 91.78 |
122
  | `TAG_ACC` | 97.13 |
123
+ | `LEMMA_ACC` | 96.70 |
124
+ | `ENTS_P` | 82.27 |
125
+ | `ENTS_R` | 84.65 |
126
+ | `ENTS_F` | 83.45 |
accuracy.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "token_acc": 0.9937494927,
3
- "token_p": 0.9764591282,
4
- "token_r": 0.9790021974,
5
- "token_f": 0.9777290092,
6
- "pos_acc": 0.9798190675,
7
  "morph_acc": 0.0,
8
  "morph_micro_p": 0.3401360544,
9
  "morph_micro_r": 0.9803921569,
@@ -25,66 +25,66 @@
25
  "f": 0.0
26
  }
27
  },
28
- "sents_p": 0.9519230769,
29
- "sents_r": 0.9763313609,
30
- "sents_f": 0.9639727361,
31
- "dep_uas": 0.9311959654,
32
- "dep_las": 0.9202934425,
33
  "dep_las_per_type": {
34
  "cc": {
35
- "p": 0.8936170213,
36
- "r": 0.875,
37
- "f": 0.8842105263
38
  },
39
  "compound": {
40
- "p": 0.9544917258,
41
- "r": 0.9103720406,
42
- "f": 0.9319099827
43
  },
44
  "obl": {
45
- "p": 0.8436724566,
46
- "r": 0.8489388265,
47
- "f": 0.8462974487
48
  },
49
  "case": {
50
- "p": 0.9832317073,
51
- "r": 0.9802431611,
52
- "f": 0.9817351598
53
  },
54
  "dislocated": {
55
- "p": 0.6666666667,
56
- "r": 0.6153846154,
57
- "f": 0.64
58
  },
59
  "nsubj": {
60
- "p": 0.8618677043,
61
- "r": 0.8502879079,
62
- "f": 0.8560386473
63
  },
64
  "nmod": {
65
- "p": 0.9235588972,
66
- "r": 0.8619883041,
67
- "f": 0.8917120387
68
  },
69
  "root": {
70
- "p": 0.9359223301,
71
- "r": 0.9506903353,
72
- "f": 0.9432485323
73
  },
74
  "aux": {
75
- "p": 0.9765698219,
76
- "r": 0.9675023213,
77
- "f": 0.9720149254
78
  },
79
  "advcl": {
80
- "p": 0.7379310345,
81
- "r": 0.7213483146,
82
- "f": 0.7295454545
83
  },
84
  "mark": {
85
- "p": 0.9756592292,
86
- "r": 0.962,
87
- "f": 0.9687814703
88
  },
89
  "fixed": {
90
  "p": 0.9661921708,
@@ -92,14 +92,14 @@
92
  "f": 0.976618705
93
  },
94
  "acl": {
95
- "p": 0.8942731278,
96
- "r": 0.8923076923,
97
- "f": 0.8932893289
98
  },
99
  "obj": {
100
- "p": 0.9539877301,
101
- "r": 0.9395770393,
102
- "f": 0.9467275495
103
  },
104
  "nummod": {
105
  "p": 0.987012987,
@@ -107,131 +107,131 @@
107
  "f": 0.9411764706
108
  },
109
  "advmod": {
110
- "p": 0.7537313433,
111
- "r": 0.7214285714,
112
- "f": 0.7372262774
113
  },
114
  "amod": {
115
- "p": 0.9375,
116
  "r": 0.8108108108,
117
- "f": 0.8695652174
118
  },
119
  "cop": {
120
- "p": 0.9764705882,
121
- "r": 0.9651162791,
122
- "f": 0.9707602339
123
  },
124
  "ccomp": {
125
- "p": 0.9047619048,
126
- "r": 0.8636363636,
127
- "f": 0.8837209302
128
  },
129
  "det": {
130
  "p": 1.0,
131
- "r": 0.9811320755,
132
- "f": 0.9904761905
133
  },
134
  "csubj": {
135
- "p": 0.7333333333,
136
  "r": 0.9166666667,
137
- "f": 0.8148148148
138
  },
139
  "dep": {
140
- "p": 0.25,
141
  "r": 0.2857142857,
142
- "f": 0.2666666667
143
  }
144
  },
145
- "tag_acc": 0.9713305562,
146
- "lemma_acc": 0.9670526831,
147
- "ents_p": 0.8356510746,
148
- "ents_r": 0.8314465409,
149
- "ents_f": 0.8335435057,
150
  "ents_per_type": {
151
  "DATE": {
152
- "p": 0.9727272727,
153
- "r": 0.9816513761,
154
- "f": 0.9771689498
155
  },
156
  "ORG": {
157
- "p": 0.7906976744,
158
- "r": 0.7445255474,
159
- "f": 0.7669172932
160
  },
161
  "TITLE_AFFIX": {
162
- "p": 0.9230769231,
163
- "r": 0.8,
164
  "f": 0.8571428571
165
  },
166
  "PERSON": {
167
- "p": 0.8865248227,
168
- "r": 0.8992805755,
169
- "f": 0.8928571429
170
  },
171
  "GPE": {
172
- "p": 0.8404255319,
173
- "r": 0.8404255319,
174
- "f": 0.8404255319
175
  },
176
  "PRODUCT": {
177
- "p": 0.625,
178
- "r": 0.5952380952,
179
- "f": 0.6097560976
180
  },
181
  "TIME": {
182
- "p": 1.0,
183
  "r": 1.0,
184
- "f": 1.0
185
  },
186
  "QUANTITY": {
187
- "p": 0.875,
188
  "r": 0.9545454545,
189
- "f": 0.9130434783
190
  },
191
  "NORP": {
192
- "p": 0.6875,
193
- "r": 0.6875,
194
- "f": 0.6875
195
  },
196
  "ORDINAL": {
197
- "p": 0.64,
198
- "r": 0.7272727273,
199
- "f": 0.6808510638
200
  },
201
  "WORK_OF_ART": {
202
- "p": 0.8,
203
- "r": 0.7058823529,
204
- "f": 0.75
205
  },
206
  "CARDINAL": {
207
  "p": 1.0,
208
  "r": 0.5,
209
  "f": 0.6666666667
210
  },
211
- "LOC": {
212
- "p": 0.5714285714,
213
- "r": 0.8,
214
- "f": 0.6666666667
215
- },
216
  "PERCENT": {
217
  "p": 1.0,
218
- "r": 1.0,
219
- "f": 1.0
220
  },
221
  "EVENT": {
222
- "p": 0.9130434783,
223
- "r": 0.8076923077,
224
- "f": 0.8571428571
225
  },
226
  "FAC": {
227
- "p": 0.7179487179,
228
- "r": 0.7567567568,
229
- "f": 0.7368421053
 
 
 
 
 
230
  },
231
  "MOVEMENT": {
232
- "p": 0.6666666667,
233
- "r": 0.4,
234
- "f": 0.5
235
  },
236
  "LAW": {
237
  "p": 0.6666666667,
@@ -249,5 +249,5 @@
249
  "f": 1.0
250
  }
251
  },
252
- "speed": 668.4231187225
253
  }
 
1
  {
2
+ "token_acc": 0.9936678032,
3
+ "token_p": 0.9763760351,
4
+ "token_r": 0.9788394238,
5
+ "token_f": 0.9776061776,
6
+ "pos_acc": 0.979409718,
7
  "morph_acc": 0.0,
8
  "morph_micro_p": 0.3401360544,
9
  "morph_micro_r": 0.9803921569,
 
25
  "f": 0.0
26
  }
27
  },
28
+ "sents_p": 0.9318181818,
29
+ "sents_r": 0.9704142012,
30
+ "sents_f": 0.9507246377,
31
+ "dep_uas": 0.9304880245,
32
+ "dep_las": 0.9178365731,
33
  "dep_las_per_type": {
34
  "cc": {
35
+ "p": 0.8723404255,
36
+ "r": 0.8541666667,
37
+ "f": 0.8631578947
38
  },
39
  "compound": {
40
+ "p": 0.9507125891,
41
+ "r": 0.9024802706,
42
+ "f": 0.9259687681
43
  },
44
  "obl": {
45
+ "p": 0.8445273632,
46
+ "r": 0.847690387,
47
+ "f": 0.846105919
48
  },
49
  "case": {
50
+ "p": 0.9813048455,
51
+ "r": 0.9772036474,
52
+ "f": 0.9792499524
53
  },
54
  "dislocated": {
55
+ "p": 0.5454545455,
56
+ "r": 0.4615384615,
57
+ "f": 0.5
58
  },
59
  "nsubj": {
60
+ "p": 0.8700787402,
61
+ "r": 0.8483685221,
62
+ "f": 0.8590864917
63
  },
64
  "nmod": {
65
+ "p": 0.8964241677,
66
+ "r": 0.8502923977,
67
+ "f": 0.8727490996
68
  },
69
  "root": {
70
+ "p": 0.9300567108,
71
+ "r": 0.9704142012,
72
+ "f": 0.9498069498
73
  },
74
  "aux": {
75
+ "p": 0.9747663551,
76
+ "r": 0.9684308264,
77
+ "f": 0.9715882627
78
  },
79
  "advcl": {
80
+ "p": 0.7568807339,
81
+ "r": 0.7415730337,
82
+ "f": 0.7491486947
83
  },
84
  "mark": {
85
+ "p": 0.9757575758,
86
+ "r": 0.966,
87
+ "f": 0.9708542714
88
  },
89
  "fixed": {
90
  "p": 0.9661921708,
 
92
  "f": 0.976618705
93
  },
94
  "acl": {
95
+ "p": 0.8711790393,
96
+ "r": 0.8769230769,
97
+ "f": 0.874041621
98
  },
99
  "obj": {
100
+ "p": 0.9630769231,
101
+ "r": 0.9456193353,
102
+ "f": 0.9542682927
103
  },
104
  "nummod": {
105
  "p": 0.987012987,
 
107
  "f": 0.9411764706
108
  },
109
  "advmod": {
110
+ "p": 0.7352941176,
111
+ "r": 0.7142857143,
112
+ "f": 0.7246376812
113
  },
114
  "amod": {
115
+ "p": 0.9090909091,
116
  "r": 0.8108108108,
117
+ "f": 0.8571428571
118
  },
119
  "cop": {
120
+ "p": 0.9647058824,
121
+ "r": 0.9534883721,
122
+ "f": 0.9590643275
123
  },
124
  "ccomp": {
125
+ "p": 0.9,
126
+ "r": 0.8181818182,
127
+ "f": 0.8571428571
128
  },
129
  "det": {
130
  "p": 1.0,
131
+ "r": 0.9622641509,
132
+ "f": 0.9807692308
133
  },
134
  "csubj": {
135
+ "p": 0.7857142857,
136
  "r": 0.9166666667,
137
+ "f": 0.8461538462
138
  },
139
  "dep": {
140
+ "p": 0.2857142857,
141
  "r": 0.2857142857,
142
+ "f": 0.2857142857
143
  }
144
  },
145
+ "tag_acc": 0.9713282143,
146
+ "lemma_acc": 0.9670499959,
147
+ "ents_p": 0.8227383863,
148
+ "ents_r": 0.8465408805,
149
+ "ents_f": 0.8344699318,
150
  "ents_per_type": {
151
  "DATE": {
152
+ "p": 0.9464285714,
153
+ "r": 0.9724770642,
154
+ "f": 0.9592760181
155
  },
156
  "ORG": {
157
+ "p": 0.6918238994,
158
+ "r": 0.802919708,
159
+ "f": 0.7432432432
160
  },
161
  "TITLE_AFFIX": {
162
+ "p": 0.8181818182,
163
+ "r": 0.9,
164
  "f": 0.8571428571
165
  },
166
  "PERSON": {
167
+ "p": 0.9270072993,
168
+ "r": 0.9136690647,
169
+ "f": 0.9202898551
170
  },
171
  "GPE": {
172
+ "p": 0.84375,
173
+ "r": 0.8617021277,
174
+ "f": 0.8526315789
175
  },
176
  "PRODUCT": {
177
+ "p": 0.5869565217,
178
+ "r": 0.6428571429,
179
+ "f": 0.6136363636
180
  },
181
  "TIME": {
182
+ "p": 0.8,
183
  "r": 1.0,
184
+ "f": 0.8888888889
185
  },
186
  "QUANTITY": {
187
+ "p": 0.8630136986,
188
  "r": 0.9545454545,
189
+ "f": 0.9064748201
190
  },
191
  "NORP": {
192
+ "p": 0.6666666667,
193
+ "r": 0.625,
194
+ "f": 0.6451612903
195
  },
196
  "ORDINAL": {
197
+ "p": 0.6538461538,
198
+ "r": 0.7727272727,
199
+ "f": 0.7083333333
200
  },
201
  "WORK_OF_ART": {
202
+ "p": 0.8461538462,
203
+ "r": 0.6470588235,
204
+ "f": 0.7333333333
205
  },
206
  "CARDINAL": {
207
  "p": 1.0,
208
  "r": 0.5,
209
  "f": 0.6666666667
210
  },
 
 
 
 
 
211
  "PERCENT": {
212
  "p": 1.0,
213
+ "r": 0.8571428571,
214
+ "f": 0.9230769231
215
  },
216
  "EVENT": {
217
+ "p": 0.9583333333,
218
+ "r": 0.8846153846,
219
+ "f": 0.92
220
  },
221
  "FAC": {
222
+ "p": 0.9285714286,
223
+ "r": 0.7027027027,
224
+ "f": 0.8
225
+ },
226
+ "LOC": {
227
+ "p": 0.8,
228
+ "r": 0.8,
229
+ "f": 0.8
230
  },
231
  "MOVEMENT": {
232
+ "p": 0.3333333333,
233
+ "r": 0.2,
234
+ "f": 0.25
235
  },
236
  "LAW": {
237
  "p": 0.6666666667,
 
249
  "f": 1.0
250
  }
251
  },
252
+ "speed": 567.0833083438
253
  }
config.cfg CHANGED
@@ -16,6 +16,7 @@ before_creation = null
16
  after_creation = null
17
  after_pipeline_creation = null
18
  batch_size = 64
 
19
 
20
  [nlp.tokenizer]
21
  @tokenizers = "spacy.ja.JapaneseTokenizer"
@@ -40,10 +41,11 @@ nO = null
40
  normalize = false
41
 
42
  [components.morphologizer.model.tok2vec]
43
- @architectures = "spacy-transformers.TransformerListener.v1"
44
- grad_factor = 1.0
45
  upstream = "transformer"
46
  pooling = {"@layers":"reduce_mean.v1"}
 
47
 
48
  [components.ner]
49
  factory = "ner"
@@ -62,10 +64,11 @@ use_upper = false
62
  nO = null
63
 
64
  [components.ner.model.tok2vec]
65
- @architectures = "spacy-transformers.TransformerListener.v1"
66
- grad_factor = 1.0
67
  upstream = "transformer"
68
  pooling = {"@layers":"reduce_mean.v1"}
 
69
 
70
  [components.parser]
71
  factory = "parser"
@@ -85,35 +88,44 @@ use_upper = false
85
  nO = null
86
 
87
  [components.parser.model.tok2vec]
88
- @architectures = "spacy-transformers.TransformerListener.v1"
89
- grad_factor = 1.0
90
  upstream = "transformer"
91
  pooling = {"@layers":"reduce_mean.v1"}
 
92
 
93
  [components.transformer]
94
- factory = "transformer"
95
- max_batch_items = 4096
96
- set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
97
 
98
  [components.transformer.model]
99
- name = "cl-tohoku/bert-base-japanese-char-v2"
100
- @architectures = "spacy-transformers.TransformerModel.v3"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  mixed_precision = false
102
-
103
- [components.transformer.model.get_spans]
104
- @span_getters = "spacy-transformers.strided_spans.v1"
105
- window = 128
106
- stride = 96
107
 
108
  [components.transformer.model.grad_scaler_config]
109
 
110
- [components.transformer.model.tokenizer_config]
111
- use_fast = false
112
- word_tokenizer_type = "basic"
113
- subword_tokenizer_type = "character"
114
- model_max_length = 512
115
-
116
- [components.transformer.model.transformer_config]
117
 
118
  [corpora]
119
 
@@ -150,11 +162,11 @@ annotating_components = []
150
  before_update = null
151
 
152
  [training.batcher]
153
- @batchers = "spacy.batch_by_padded.v1"
154
- discard_oversize = true
155
- get_length = null
156
  size = 2000
157
- buffer = 256
 
158
 
159
  [training.logger]
160
  @loggers = "spacy.ConsoleLogger.v1"
@@ -225,6 +237,18 @@ require = false
225
  path = "corpus/labels/parser.json"
226
  require = false
227
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  [initialize.lookups]
229
  @misc = "spacy.LookupsDataLoader.v1"
230
  lang = ${nlp.lang}
 
16
  after_creation = null
17
  after_pipeline_creation = null
18
  batch_size = 64
19
+ vectors = {"@vectors":"spacy.Vectors.v1"}
20
 
21
  [nlp.tokenizer]
22
  @tokenizers = "spacy.ja.JapaneseTokenizer"
 
41
  normalize = false
42
 
43
  [components.morphologizer.model.tok2vec]
44
+ @architectures = "spacy-curated-transformers.LastTransformerLayerListener.v1"
45
+ width = ${components.transformer.model.hidden_width}
46
  upstream = "transformer"
47
  pooling = {"@layers":"reduce_mean.v1"}
48
+ grad_factor = 1.0
49
 
50
  [components.ner]
51
  factory = "ner"
 
64
  nO = null
65
 
66
  [components.ner.model.tok2vec]
67
+ @architectures = "spacy-curated-transformers.LastTransformerLayerListener.v1"
68
+ width = ${components.transformer.model.hidden_width}
69
  upstream = "transformer"
70
  pooling = {"@layers":"reduce_mean.v1"}
71
+ grad_factor = 1.0
72
 
73
  [components.parser]
74
  factory = "parser"
 
88
  nO = null
89
 
90
  [components.parser.model.tok2vec]
91
+ @architectures = "spacy-curated-transformers.LastTransformerLayerListener.v1"
92
+ width = ${components.transformer.model.hidden_width}
93
  upstream = "transformer"
94
  pooling = {"@layers":"reduce_mean.v1"}
95
+ grad_factor = 1.0
96
 
97
  [components.transformer]
98
+ factory = "curated_transformer"
99
+ all_layer_outputs = false
100
+ frozen = false
101
 
102
  [components.transformer.model]
103
+ @architectures = "spacy-curated-transformers.BertTransformer.v1"
104
+ vocab_size = 6144
105
+ hidden_width = 768
106
+ piece_encoder = {"@architectures":"spacy-curated-transformers.CharEncoder.v1"}
107
+ attention_probs_dropout_prob = 0.1
108
+ hidden_act = "gelu"
109
+ hidden_dropout_prob = 0.1
110
+ intermediate_width = 3072
111
+ layer_norm_eps = 0.0
112
+ max_position_embeddings = 512
113
+ model_max_length = 512
114
+ num_attention_heads = 12
115
+ num_hidden_layers = 12
116
+ padding_idx = 0
117
+ type_vocab_size = 2
118
+ torchscript = false
119
  mixed_precision = false
120
+ wrapped_listener = null
 
 
 
 
121
 
122
  [components.transformer.model.grad_scaler_config]
123
 
124
+ [components.transformer.model.with_spans]
125
+ @architectures = "spacy-curated-transformers.WithStridedSpans.v1"
126
+ stride = 160
127
+ window = 216
128
+ batch_size = 384
 
 
129
 
130
  [corpora]
131
 
 
162
  before_update = null
163
 
164
  [training.batcher]
165
+ @batchers = "spacy.batch_by_words.v1"
166
+ discard_oversize = false
 
167
  size = 2000
168
+ tolerance = 0.2
169
+ get_length = null
170
 
171
  [training.logger]
172
  @loggers = "spacy.ConsoleLogger.v1"
 
237
  path = "corpus/labels/parser.json"
238
  require = false
239
 
240
+ [initialize.components.transformer]
241
+
242
+ [initialize.components.transformer.encoder_loader]
243
+ @model_loaders = "spacy-curated-transformers.HFTransformerEncoderLoader.v1"
244
+ name = "cl-tohoku/bert-base-japanese-char-v2"
245
+ revision = "main"
246
+
247
+ [initialize.components.transformer.piecer_loader]
248
+ @model_loaders = "spacy-curated-transformers.HFPieceEncoderLoader.v1"
249
+ name = "cl-tohoku/bert-base-japanese-char-v2"
250
+ revision = "main"
251
+
252
  [initialize.lookups]
253
  @misc = "spacy.LookupsDataLoader.v1"
254
  lang = ${nlp.lang}
ja_core_news_trf-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf7e1ee38ed3b7191498980ae170fc9b3dc3f6a979dc82e7e9efa8cc0b746ff4
3
- size 337889759
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85fb7bdb04bb7308ff8b728f6ecbceda198f1def857e6a922bd87ae089933d31
3
+ size 335692251
meta.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
  "lang":"ja",
3
  "name":"core_news_trf",
4
- "version":"3.6.1",
5
- "description":"Japanese transformer pipeline (cl-tohoku/bert-base-japanese-char-v2). Components: transformer, morphologizer, parser, ner.",
6
  "author":"Explosion",
7
  "email":"contact@explosion.ai",
8
  "url":"https://explosion.ai",
9
  "license":"CC BY-SA 3.0",
10
- "spacy_version":">=3.6.0,<3.7.0",
11
- "spacy_git_version":"c067b5264",
12
  "vectors":{
13
  "width":0,
14
  "vectors":0,
@@ -111,11 +111,11 @@
111
 
112
  ],
113
  "performance":{
114
- "token_acc":0.9937494927,
115
- "token_p":0.9764591282,
116
- "token_r":0.9790021974,
117
- "token_f":0.9777290092,
118
- "pos_acc":0.9798190675,
119
  "morph_acc":0.0,
120
  "morph_micro_p":0.3401360544,
121
  "morph_micro_r":0.9803921569,
@@ -137,66 +137,66 @@
137
  "f":0.0
138
  }
139
  },
140
- "sents_p":0.9519230769,
141
- "sents_r":0.9763313609,
142
- "sents_f":0.9639727361,
143
- "dep_uas":0.9311959654,
144
- "dep_las":0.9202934425,
145
  "dep_las_per_type":{
146
  "cc":{
147
- "p":0.8936170213,
148
- "r":0.875,
149
- "f":0.8842105263
150
  },
151
  "compound":{
152
- "p":0.9544917258,
153
- "r":0.9103720406,
154
- "f":0.9319099827
155
  },
156
  "obl":{
157
- "p":0.8436724566,
158
- "r":0.8489388265,
159
- "f":0.8462974487
160
  },
161
  "case":{
162
- "p":0.9832317073,
163
- "r":0.9802431611,
164
- "f":0.9817351598
165
  },
166
  "dislocated":{
167
- "p":0.6666666667,
168
- "r":0.6153846154,
169
- "f":0.64
170
  },
171
  "nsubj":{
172
- "p":0.8618677043,
173
- "r":0.8502879079,
174
- "f":0.8560386473
175
  },
176
  "nmod":{
177
- "p":0.9235588972,
178
- "r":0.8619883041,
179
- "f":0.8917120387
180
  },
181
  "root":{
182
- "p":0.9359223301,
183
- "r":0.9506903353,
184
- "f":0.9432485323
185
  },
186
  "aux":{
187
- "p":0.9765698219,
188
- "r":0.9675023213,
189
- "f":0.9720149254
190
  },
191
  "advcl":{
192
- "p":0.7379310345,
193
- "r":0.7213483146,
194
- "f":0.7295454545
195
  },
196
  "mark":{
197
- "p":0.9756592292,
198
- "r":0.962,
199
- "f":0.9687814703
200
  },
201
  "fixed":{
202
  "p":0.9661921708,
@@ -204,14 +204,14 @@
204
  "f":0.976618705
205
  },
206
  "acl":{
207
- "p":0.8942731278,
208
- "r":0.8923076923,
209
- "f":0.8932893289
210
  },
211
  "obj":{
212
- "p":0.9539877301,
213
- "r":0.9395770393,
214
- "f":0.9467275495
215
  },
216
  "nummod":{
217
  "p":0.987012987,
@@ -219,131 +219,131 @@
219
  "f":0.9411764706
220
  },
221
  "advmod":{
222
- "p":0.7537313433,
223
- "r":0.7214285714,
224
- "f":0.7372262774
225
  },
226
  "amod":{
227
- "p":0.9375,
228
  "r":0.8108108108,
229
- "f":0.8695652174
230
  },
231
  "cop":{
232
- "p":0.9764705882,
233
- "r":0.9651162791,
234
- "f":0.9707602339
235
  },
236
  "ccomp":{
237
- "p":0.9047619048,
238
- "r":0.8636363636,
239
- "f":0.8837209302
240
  },
241
  "det":{
242
  "p":1.0,
243
- "r":0.9811320755,
244
- "f":0.9904761905
245
  },
246
  "csubj":{
247
- "p":0.7333333333,
248
  "r":0.9166666667,
249
- "f":0.8148148148
250
  },
251
  "dep":{
252
- "p":0.25,
253
  "r":0.2857142857,
254
- "f":0.2666666667
255
  }
256
  },
257
- "tag_acc":0.9713305562,
258
- "lemma_acc":0.9670526831,
259
- "ents_p":0.8356510746,
260
- "ents_r":0.8314465409,
261
- "ents_f":0.8335435057,
262
  "ents_per_type":{
263
  "DATE":{
264
- "p":0.9727272727,
265
- "r":0.9816513761,
266
- "f":0.9771689498
267
  },
268
  "ORG":{
269
- "p":0.7906976744,
270
- "r":0.7445255474,
271
- "f":0.7669172932
272
  },
273
  "TITLE_AFFIX":{
274
- "p":0.9230769231,
275
- "r":0.8,
276
  "f":0.8571428571
277
  },
278
  "PERSON":{
279
- "p":0.8865248227,
280
- "r":0.8992805755,
281
- "f":0.8928571429
282
  },
283
  "GPE":{
284
- "p":0.8404255319,
285
- "r":0.8404255319,
286
- "f":0.8404255319
287
  },
288
  "PRODUCT":{
289
- "p":0.625,
290
- "r":0.5952380952,
291
- "f":0.6097560976
292
  },
293
  "TIME":{
294
- "p":1.0,
295
  "r":1.0,
296
- "f":1.0
297
  },
298
  "QUANTITY":{
299
- "p":0.875,
300
  "r":0.9545454545,
301
- "f":0.9130434783
302
  },
303
  "NORP":{
304
- "p":0.6875,
305
- "r":0.6875,
306
- "f":0.6875
307
  },
308
  "ORDINAL":{
309
- "p":0.64,
310
- "r":0.7272727273,
311
- "f":0.6808510638
312
  },
313
  "WORK_OF_ART":{
314
- "p":0.8,
315
- "r":0.7058823529,
316
- "f":0.75
317
  },
318
  "CARDINAL":{
319
  "p":1.0,
320
  "r":0.5,
321
  "f":0.6666666667
322
  },
323
- "LOC":{
324
- "p":0.5714285714,
325
- "r":0.8,
326
- "f":0.6666666667
327
- },
328
  "PERCENT":{
329
  "p":1.0,
330
- "r":1.0,
331
- "f":1.0
332
  },
333
  "EVENT":{
334
- "p":0.9130434783,
335
- "r":0.8076923077,
336
- "f":0.8571428571
337
  },
338
  "FAC":{
339
- "p":0.7179487179,
340
- "r":0.7567567568,
341
- "f":0.7368421053
 
 
 
 
 
342
  },
343
  "MOVEMENT":{
344
- "p":0.6666666667,
345
- "r":0.4,
346
- "f":0.5
347
  },
348
  "LAW":{
349
  "p":0.6666666667,
@@ -361,7 +361,7 @@
361
  "f":1.0
362
  }
363
  },
364
- "speed":668.4231187225
365
  },
366
  "sources":[
367
  {
@@ -384,7 +384,7 @@
384
  }
385
  ],
386
  "requirements":[
387
- "spacy-transformers>=1.2.2,<1.3.0",
388
  "sudachipy>=0.5.2,!=0.6.1",
389
  "sudachidict-core>=20211220"
390
  ]
 
1
  {
2
  "lang":"ja",
3
  "name":"core_news_trf",
4
+ "version":"3.7.2",
5
+ "description":"Japanese transformer pipeline (Transformer(name='cl-tohoku/bert-base-japanese-char-v2', piece_encoder='char', stride=160, type='bert', width=768, window=216, vocab_size=6144)). Components: transformer, morphologizer, parser, ner.",
6
  "author":"Explosion",
7
  "email":"contact@explosion.ai",
8
  "url":"https://explosion.ai",
9
  "license":"CC BY-SA 3.0",
10
+ "spacy_version":">=3.7.0,<3.8.0",
11
+ "spacy_git_version":"6b4f77441",
12
  "vectors":{
13
  "width":0,
14
  "vectors":0,
 
111
 
112
  ],
113
  "performance":{
114
+ "token_acc":0.9936678032,
115
+ "token_p":0.9763760351,
116
+ "token_r":0.9788394238,
117
+ "token_f":0.9776061776,
118
+ "pos_acc":0.979409718,
119
  "morph_acc":0.0,
120
  "morph_micro_p":0.3401360544,
121
  "morph_micro_r":0.9803921569,
 
137
  "f":0.0
138
  }
139
  },
140
+ "sents_p":0.9318181818,
141
+ "sents_r":0.9704142012,
142
+ "sents_f":0.9507246377,
143
+ "dep_uas":0.9304880245,
144
+ "dep_las":0.9178365731,
145
  "dep_las_per_type":{
146
  "cc":{
147
+ "p":0.8723404255,
148
+ "r":0.8541666667,
149
+ "f":0.8631578947
150
  },
151
  "compound":{
152
+ "p":0.9507125891,
153
+ "r":0.9024802706,
154
+ "f":0.9259687681
155
  },
156
  "obl":{
157
+ "p":0.8445273632,
158
+ "r":0.847690387,
159
+ "f":0.846105919
160
  },
161
  "case":{
162
+ "p":0.9813048455,
163
+ "r":0.9772036474,
164
+ "f":0.9792499524
165
  },
166
  "dislocated":{
167
+ "p":0.5454545455,
168
+ "r":0.4615384615,
169
+ "f":0.5
170
  },
171
  "nsubj":{
172
+ "p":0.8700787402,
173
+ "r":0.8483685221,
174
+ "f":0.8590864917
175
  },
176
  "nmod":{
177
+ "p":0.8964241677,
178
+ "r":0.8502923977,
179
+ "f":0.8727490996
180
  },
181
  "root":{
182
+ "p":0.9300567108,
183
+ "r":0.9704142012,
184
+ "f":0.9498069498
185
  },
186
  "aux":{
187
+ "p":0.9747663551,
188
+ "r":0.9684308264,
189
+ "f":0.9715882627
190
  },
191
  "advcl":{
192
+ "p":0.7568807339,
193
+ "r":0.7415730337,
194
+ "f":0.7491486947
195
  },
196
  "mark":{
197
+ "p":0.9757575758,
198
+ "r":0.966,
199
+ "f":0.9708542714
200
  },
201
  "fixed":{
202
  "p":0.9661921708,
 
204
  "f":0.976618705
205
  },
206
  "acl":{
207
+ "p":0.8711790393,
208
+ "r":0.8769230769,
209
+ "f":0.874041621
210
  },
211
  "obj":{
212
+ "p":0.9630769231,
213
+ "r":0.9456193353,
214
+ "f":0.9542682927
215
  },
216
  "nummod":{
217
  "p":0.987012987,
 
219
  "f":0.9411764706
220
  },
221
  "advmod":{
222
+ "p":0.7352941176,
223
+ "r":0.7142857143,
224
+ "f":0.7246376812
225
  },
226
  "amod":{
227
+ "p":0.9090909091,
228
  "r":0.8108108108,
229
+ "f":0.8571428571
230
  },
231
  "cop":{
232
+ "p":0.9647058824,
233
+ "r":0.9534883721,
234
+ "f":0.9590643275
235
  },
236
  "ccomp":{
237
+ "p":0.9,
238
+ "r":0.8181818182,
239
+ "f":0.8571428571
240
  },
241
  "det":{
242
  "p":1.0,
243
+ "r":0.9622641509,
244
+ "f":0.9807692308
245
  },
246
  "csubj":{
247
+ "p":0.7857142857,
248
  "r":0.9166666667,
249
+ "f":0.8461538462
250
  },
251
  "dep":{
252
+ "p":0.2857142857,
253
  "r":0.2857142857,
254
+ "f":0.2857142857
255
  }
256
  },
257
+ "tag_acc":0.9713282143,
258
+ "lemma_acc":0.9670499959,
259
+ "ents_p":0.8227383863,
260
+ "ents_r":0.8465408805,
261
+ "ents_f":0.8344699318,
262
  "ents_per_type":{
263
  "DATE":{
264
+ "p":0.9464285714,
265
+ "r":0.9724770642,
266
+ "f":0.9592760181
267
  },
268
  "ORG":{
269
+ "p":0.6918238994,
270
+ "r":0.802919708,
271
+ "f":0.7432432432
272
  },
273
  "TITLE_AFFIX":{
274
+ "p":0.8181818182,
275
+ "r":0.9,
276
  "f":0.8571428571
277
  },
278
  "PERSON":{
279
+ "p":0.9270072993,
280
+ "r":0.9136690647,
281
+ "f":0.9202898551
282
  },
283
  "GPE":{
284
+ "p":0.84375,
285
+ "r":0.8617021277,
286
+ "f":0.8526315789
287
  },
288
  "PRODUCT":{
289
+ "p":0.5869565217,
290
+ "r":0.6428571429,
291
+ "f":0.6136363636
292
  },
293
  "TIME":{
294
+ "p":0.8,
295
  "r":1.0,
296
+ "f":0.8888888889
297
  },
298
  "QUANTITY":{
299
+ "p":0.8630136986,
300
  "r":0.9545454545,
301
+ "f":0.9064748201
302
  },
303
  "NORP":{
304
+ "p":0.6666666667,
305
+ "r":0.625,
306
+ "f":0.6451612903
307
  },
308
  "ORDINAL":{
309
+ "p":0.6538461538,
310
+ "r":0.7727272727,
311
+ "f":0.7083333333
312
  },
313
  "WORK_OF_ART":{
314
+ "p":0.8461538462,
315
+ "r":0.6470588235,
316
+ "f":0.7333333333
317
  },
318
  "CARDINAL":{
319
  "p":1.0,
320
  "r":0.5,
321
  "f":0.6666666667
322
  },
 
 
 
 
 
323
  "PERCENT":{
324
  "p":1.0,
325
+ "r":0.8571428571,
326
+ "f":0.9230769231
327
  },
328
  "EVENT":{
329
+ "p":0.9583333333,
330
+ "r":0.8846153846,
331
+ "f":0.92
332
  },
333
  "FAC":{
334
+ "p":0.9285714286,
335
+ "r":0.7027027027,
336
+ "f":0.8
337
+ },
338
+ "LOC":{
339
+ "p":0.8,
340
+ "r":0.8,
341
+ "f":0.8
342
  },
343
  "MOVEMENT":{
344
+ "p":0.3333333333,
345
+ "r":0.2,
346
+ "f":0.25
347
  },
348
  "LAW":{
349
  "p":0.6666666667,
 
361
  "f":1.0
362
  }
363
  },
364
+ "speed":567.0833083438
365
  },
366
  "sources":[
367
  {
 
384
  }
385
  ],
386
  "requirements":[
387
+ "spacy-curated-transformers>=0.2.0,<0.3.0",
388
  "sudachipy>=0.5.2,!=0.6.1",
389
  "sudachidict-core>=20211220"
390
  ]
morphologizer/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3a1049a275d37cece33da1393ca92b132ca87447073c704ebab7bc69d73efa1
3
- size 59084
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35f2a6535cbfde339dcea877a3ccdd29f7d2666c7f0968bc9074297e37245c25
3
+ size 59168
ner/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be17299903d639e7f1bb4379616a9c8f53cd53f758ba9533ea680e799e2eec53
3
- size 338861
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47f44450a5d4a4a8ed02d6e2ebba068c21a9a5e486e267120acdb39e210a4aba
3
+ size 338945
parser/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:596b2e4e3b94484870fdb6a818bd3d0e5cd5eb13a25ec17076d9f9595eb1ca54
3
- size 318612
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:523876b7059ce72c6931fc7db42b99f281e09aa475f61704bbd86b47f2f056f0
3
+ size 318696
transformer/cfg CHANGED
@@ -1,3 +1,3 @@
1
  {
2
- "max_batch_items":4096
3
  }
 
1
  {
2
+
3
  }
transformer/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5148b11902ffcdbd01171380ba2b6e44769aa5bbbdca877d36d3081dd629ff8
3
- size 363145763
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bf2df390b54e39c69fbaadb224bb8ba08f87204602187c000d043961456d6b7
3
+ size 360777959
vocab/strings.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a71f56abaa60650314209e675a9652a03240e9e7d8fff74f8aaed649519b2d6b
3
- size 1599987
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c4f6e5652ec2a3af30058be57ee1b11aae7744c983eb6a3fb228e9bfbbec069
3
+ size 1600200