KaleiNeely committed on
Commit
70edb73
1 Parent(s): 2ff3172

Delete tokenization_rwkv_world.py

Browse files
Files changed (1) hide show
  1. tokenization_rwkv_world.py +0 -549
tokenization_rwkv_world.py DELETED
@@ -1,549 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """Tokenization classes for RWKV5."""
16
-
17
- import json
18
- import os
19
- from typing import TYPE_CHECKING, List, Optional, Tuple, Union
20
-
21
- from transformers.tokenization_utils import PreTrainedTokenizer
22
- from transformers.tokenization_utils_base import (
23
- BatchEncoding,
24
- EncodedInput,
25
- TextInput,
26
- TruncationStrategy,
27
- )
28
- from transformers.utils import PaddingStrategy, TensorType, logging, to_py_obj
29
-
30
-
31
- if TYPE_CHECKING:
32
- from transformers.pipelines.conversational import Conversation
33
-
34
- logger = logging.get_logger(__name__)
35
-
36
# File name under which the vocabulary is stored inside a model directory.
VOCAB_FILES_NAMES = {
    "vocab_file": "rwkv_vocab_v20230424.txt",
}

# Download locations of the vocabulary for known checkpoints.
# The Hub serves raw file contents from `/resolve/<revision>/...`; the
# `/blob/<revision>/...` path is the HTML file viewer, so it must not be used
# as a download URL.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "RWKV/rwkv-5-world-169m": "https://huggingface.co/RWKV/rwkv-5-world-169m/resolve/main/rwkv_vocab_v20230424.txt",
    },
}
44
-
45
-
46
class TRIE:
    """Byte-keyed prefix tree used for greedy longest-prefix lookups.

    Every node owns 256 child slots (one per possible byte value), a set of
    values stored at that node, the byte that leads to it (`ch`) and a
    back-pointer to its parent (`front`).
    """

    __slots__ = ("ch", "to", "values", "front")
    to: list
    values: set

    def __init__(self, front=None, ch=None):
        """Create a node reached from `front` through byte `ch` (both `None` for the root)."""
        self.ch = ch
        self.to = [None] * 256
        self.values = set()
        self.front = front

    def __repr__(self):
        # Walk up through the parents to rebuild the byte path of this node.
        path = []
        node = self
        while node is not None:
            if node.ch is not None:
                path.append(node.ch)
            node = node.front
        return "<TRIE %s %s>" % (path[::-1], self.values)

    def add(self, key: bytes, idx: int = 0, val=None):
        """Insert `key[idx:]` below this node and store `val` at its final node.

        Args:
            key: Byte string to insert.
            idx: Offset in `key` at which insertion starts.
            val: Value to store; defaults to `key` itself.

        Returns:
            The node at which the value was stored.
        """
        if val is None:
            val = key
        node = self
        # Iterative descent: no recursion depth limit for long keys.
        for ch in key[idx:]:
            child = node.to[ch]
            if child is None:
                child = node.to[ch] = TRIE(front=node, ch=ch)
            node = child
        node.values.add(val)
        return node

    def find_longest(self, key: bytes, idx: int = 0):
        """Find the longest stored key that is a prefix of `key[idx:]`.

        Args:
            key: Byte string to scan.
            idx: Offset in `key` at which matching starts.

        Returns:
            A tuple `(end, node, values)` where `end` is the offset just past
            the match, `node` is the matching node and `values` its value set.

        Raises:
            ValueError: If no stored key is a prefix of `key[idx:]` (this
                includes `idx` pointing at or past the end of `key`).
        """
        start = idx
        end = len(key)
        node = self
        best = None
        while idx < end:
            node = node.to[key[idx]]
            if node is None:
                break
            idx += 1
            if node.values:
                # Remember the deepest node seen so far that holds a value.
                best = (idx, node, node.values)
        if best is None:
            raise ValueError(f"no trie entry is a prefix of the key at offset {start}")
        return best
90
-
91
-
92
class RWKVWorldTokenizer(PreTrainedTokenizer):
    """Byte-level tokenizer for the RWKV "world" vocabulary, using greedy longest-match on a trie.

    Note on naming: in this class `self.encoder` maps token id -> token bytes and
    `self.decoder` maps token bytes -> token id. Token id 0 is reported as the
    EOS, EOT and padding id, and batches are padded on the left with it.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self, vocab_file, errors="replace", pad_token="0", **kwargs):
        """Load the vocabulary and build the lookup trie.

        Args:
            vocab_file: Path to a text file with one `<id> <python literal> <byte length>`
                entry per line; the literal is a `str` or `bytes` literal.
            errors: Error handling scheme used when decoding bytes to text.
            pad_token: Accepted for interface compatibility; it is not forwarded to the
                base class.
            **kwargs: Forwarded to `PreTrainedTokenizer.__init__`.

        Raises:
            ValueError: If a vocabulary line is malformed or its declared byte length
                does not match the token.
        """
        # Local import so this block does not depend on the file-level import list.
        import ast

        self.add_bos_token = False
        self.encoder = {}
        with open(vocab_file, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                first_space = line.index(" ")
                last_space = line.rindex(" ")
                idx = int(line[:first_space])
                # NOTE: the token is a Python literal. It is parsed with
                # `ast.literal_eval` rather than `eval` so that a tampered
                # vocabulary file cannot execute arbitrary code.
                token = ast.literal_eval(line[first_space:last_space].strip())
                if isinstance(token, str):
                    token = token.encode("utf-8")
                if not isinstance(token, bytes):
                    raise ValueError(f"Vocabulary entry {idx} is not a str or bytes literal")
                if len(token) != int(line[last_space:]):
                    raise ValueError(f"Vocabulary entry {idx} does not match its declared byte length")
                self.encoder[idx] = token

        # Reverse mapping: token bytes -> id.
        self.decoder = {}
        for k, v in self.encoder.items():
            self.decoder[v] = int(k)

        self.trie = TRIE()
        for t, i in self.decoder.items():
            _ = self.trie.add(t, val=(t, i))
        self.errors = errors  # how to handle errors in decoding
        self.cache = {}
        # Length of the longest first sequence of the last encoded batch; `_decode`
        # uses it to strip left padding.
        self.first_max_length = 0
        super().__init__(
            errors=errors,
            **kwargs,
        )

    @property
    def eos_token_id(self) -> Optional[int]:
        return 0

    @property
    def eot_token_id(self) -> Optional[int]:
        return 0

    @property
    def pad_token_id(self) -> Optional[int]:
        return 0

    @property
    def vocab_size(self):
        """Number of entries loaded from the vocabulary file."""
        return len(self.encoder)

    def get_vocab(self):
        """Return the id -> bytes table merged with `added_tokens_encoder`."""
        # NOTE(review): the base table is keyed by id, the added tokens by token
        # string - confirm that callers expect this mixed mapping.
        return dict(self.encoder, **self.added_tokens_encoder)

    def add_tokens(self, new_tokens, special_tokens: bool = False):
        """Record each token in `added_tokens_decoder` under the id it converts to."""
        # NOTE(review): no new ids are allocated and nothing is returned; confirm
        # that this is the intended replacement for the base implementation.
        for token in new_tokens:
            token_id = self.convert_tokens_to_ids(token)
            self.added_tokens_decoder[token_id] = token

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """Convert one id or a list of ids to tokens; always returns a list."""
        if isinstance(ids, int):
            ids = [ids]
        tokens = []
        for id_ in ids:
            if id_ in self.added_tokens_decoder:
                tokens.append(self.added_tokens_decoder[id_])
            else:
                tokens.append(self._convert_id_to_token(id_))
        return tokens

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Concatenate the sequence(s), prefixing each with BOS when `add_bos_token` is set."""
        if self.add_bos_token:
            bos_token_ids = [self.bos_token_id]
        else:
            bos_token_ids = []

        output = bos_token_ids + token_ids_0

        if token_ids_1 is None:
            return output

        return output + bos_token_ids + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if not self.add_bos_token:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=False
            )

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0))
        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))

    def encodeBytes(self, src: bytes):
        """Greedily split `src` into the longest vocabulary entries and return their ids.

        Raises:
            ValueError: If the trie lookup does not consume any byte.
        """
        idx: int = 0
        tokens = []
        while idx < len(src):
            start: int = idx
            idx, _, values = self.trie.find_longest(src, idx)
            if idx == start:
                raise ValueError(f"Tokenizer made no progress at byte offset {start}")
            # Each trie value is a `(token_bytes, token_id)` pair.
            _, token = next(iter(values))
            tokens.append(token)
        return tokens

    def decodeBytes(self, tokens):
        """Concatenate the byte strings of the given token ids."""
        return b"".join(self.encoder[i] for i in tokens)

    def _tokenize(self, text, **kwargs):
        """Tokenize a string. Returns token ids, not token strings."""
        return self.encodeBytes(text.encode("utf-8"))

    def _decode_tokens(self, tokens):
        """Decode ids to text, returning U+FFFD if they do not form complete UTF-8."""
        try:
            return self.decodeBytes(tokens).decode("utf-8")
        except Exception:
            # Deliberately best-effort: `_decode` relies on this marker to wait
            # for the remaining bytes of a split multi-byte character.
            return "\ufffd"  # bad utf-8

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        **kwargs,
    ) -> str:
        """Decode a single id or a list of ids to text.

        For a list, zeros inside the first `self.first_max_length` positions (the left
        padding added by batch encoding) are dropped, and decoding stops at the first
        remaining 0. Tokens are decoded incrementally so that a character whose bytes
        are split over several tokens is emitted once it is complete; a trailing
        incomplete character is dropped.
        """
        # Convert inputs to python lists
        token_ids = to_py_obj(token_ids)

        if isinstance(token_ids, int):
            # Handled before any slicing: an int cannot be sliced.
            if skip_special_tokens and token_ids in self.all_special_ids:
                return ""
            raw = self.encoder.get(token_ids)
            if raw is None:
                return self.unk_token
            return raw.decode("utf-8", errors=self.errors)

        if isinstance(token_ids, list):
            pad_span = self.first_max_length
            token_ids = [tok for tok in token_ids[:pad_span] if tok != 0] + token_ids[pad_span:]

            pieces = []
            pending_start = 0  # index of the first token not yet emitted as text
            seen = []
            for i, token in enumerate(token_ids):
                if token == 0:
                    break
                seen.append(token)
                text = self._decode_tokens(seen[pending_start:])
                if "\ufffd" not in text:
                    pieces.append(text)
                    pending_start = i + 1
            return "".join(pieces)

        return token_ids

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # NOTE(review): `self.encoder` is keyed by id, not by token - confirm
        # whether `self.decoder` was intended here.
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # NOTE(review): `self.decoder` is keyed by token bytes, not by id - confirm
        # whether `self.encoder` was intended here.
        return self.decoder.get(index)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Write the vocabulary as `<id> <bytes literal> <byte length>` lines.

        Args:
            save_directory: Target directory; created (with parents) if missing.
            filename_prefix: Optional prefix, joined to the file name with "-".

        Returns:
            A one-element tuple with the written path, or `None` (after logging an
            error) when `save_directory` exists but is not a directory.
        """
        if not os.path.isdir(save_directory):
            if os.path.exists(save_directory):
                logger.error(f"Vocabulary path ({save_directory}) should be a directory")
                return
            os.makedirs(save_directory, exist_ok=True)
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            for idx, x in self.encoder.items():
                if isinstance(x, str):
                    # Entries are stored as bytes; normalise a stray str so the
                    # written length is a byte length, as the loader expects.
                    x = x.encode("utf-8")
                f.write(f"{idx} {repr(x)} {len(x)}\n")

        return (vocab_file,)

    def prepare_for_tokenization(self, text, **kwargs):
        """Return the text and keyword arguments unchanged."""
        return (text, kwargs)

    def _get_padding_truncation_strategies(
        self, padding=False, truncation=None, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
    ):
        """Ignore the requested options: always pad to the longest sequence and never truncate."""
        return PaddingStrategy.LONGEST, TruncationStrategy.DO_NOT_TRUNCATE, -1, kwargs

    def _rwkv_get_input_ids(self, text, max_length=None, pad_token_id=0):
        """Turn one input into token ids, left-padded to `max_length` when given.

        Accepts a string, a non-empty list of strings (returns one padded id list per
        string) or a non-empty list/tuple of ints (padded only if shorter than
        `max_length`).

        Raises:
            ValueError: For any other input.
        """

        def left_pad(seq, target_len):
            # `list(seq)` also makes tuple inputs concatenable.
            return [pad_token_id] * (target_len - len(seq)) + list(seq)

        if isinstance(text, str):
            tokens = self._tokenize(text)
            if max_length is not None:
                tokens = left_pad(tokens, max_length)
            return tokens

        if isinstance(text, (list, tuple)) and len(text) > 0:
            if isinstance(text, list) and isinstance(text[0], str):
                tokenized_texts = [self._tokenize(t) for t in text]
                if max_length is None:
                    max_length = max(len(t) for t in tokenized_texts)
                return [left_pad(t, max_length) for t in tokenized_texts]

            if isinstance(text[0], int):
                if max_length is not None and len(text) < max_length:
                    return left_pad(text, max_length)
                return text

        raise ValueError(
            "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
        )

    def _encode_plus(
        self,
        text: Union[TextInput, EncodedInput],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """Encode a single input and hand the ids to `prepare_for_model`.

        Raises:
            NotImplementedError: If `return_offsets_mapping` is requested.
            ValueError: If `text` has an unsupported type.
        """
        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast. "
                "More information on available tokenizers at "
                "https://github.com/huggingface/transformers/pull/2674"
            )

        first_ids = self._rwkv_get_input_ids(text)

        return self.prepare_for_model(
            first_ids,
            pair_ids=None,
            add_special_tokens=add_special_tokens,
            padding=padding_strategy.value,
            truncation=truncation_strategy.value,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            prepend_batch_axis=True,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            verbose=verbose,
        )

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[EncodedInput],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """Encode a batch, left-padding every sequence with 0 to the batch maximum.

        The longest first-sequence length is stored in `self.first_max_length` so that
        `_decode` can later strip that padding.

        Raises:
            NotImplementedError: If `return_offsets_mapping` is requested.
            ValueError: If an input has an unsupported type.
        """
        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast."
            )

        def split_pair(item):
            # A list/tuple item is treated as a (first, second) pair.
            if isinstance(item, (list, tuple)):
                ids, pair_ids = item
                return ids, pair_ids
            return item, None

        # First pass: find the longest first and second sequences.
        first_max_length = 0
        second_max_length = 0
        for ids_or_pair_ids in batch_text_or_text_pairs:
            ids, pair_ids = split_pair(ids_or_pair_ids)
            first_max_length = max(first_max_length, len(self._rwkv_get_input_ids(ids)))
            if pair_ids is not None:
                second_max_length = max(second_max_length, len(self._rwkv_get_input_ids(pair_ids)))

        self.first_max_length = first_max_length

        # Second pass: encode again, this time left-padded to those lengths.
        input_ids = []
        for ids_or_pair_ids in batch_text_or_text_pairs:
            ids, pair_ids = split_pair(ids_or_pair_ids)
            first_ids = self._rwkv_get_input_ids(ids, max_length=first_max_length)
            second_ids = (
                self._rwkv_get_input_ids(pair_ids, max_length=second_max_length) if pair_ids is not None else None
            )
            input_ids.append((first_ids, second_ids))

        batch_outputs = self._batch_prepare_for_model(
            input_ids,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            return_tensors=return_tensors,
            verbose=verbose,
        )

        return BatchEncoding(batch_outputs)

    def decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        **kwargs,
    ) -> str:
        """
        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces`.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
        """
        # Convert inputs to python lists
        return self._decode(
            token_ids=token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def batch_decode(
        self,
        sequences: Union[List[int], List[List[int]]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        **kwargs,
    ) -> List[str]:
        """
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces`.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `List[str]`: The list of decoded sentences.
        """
        return [
            self.decode(
                seq,
                skip_special_tokens=skip_special_tokens,
                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                **kwargs,
            )
            for seq in sequences
        ]

    def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
        """Encode every turn followed by the EOS id, keeping only the last `model_max_length` ids."""
        input_ids = []
        for is_user, text in conversation.iter_texts():
            input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])
        if len(input_ids) > self.model_max_length:
            input_ids = input_ids[-self.model_max_length :]
        return input_ids