diff options
author | Sang-Kil Park <sang.park@42dot.ai> | 2024-01-29 18:24:19 +0900 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-29 11:24:19 +0200 |
commit | e76627bcce9f77adb6034cb127b7ec93d4287b69 (patch) | |
tree | 2db53beb80ef991426980a0b134c080dad68717e | |
parent | fbe7dfa53caff0a7e830b676e6e949917a5c71b4 (diff) |
py : improve BPE tokenizer support (#5189)
-rwxr-xr-x | convert.py | 5 |
1 file changed, 4 insertions, 1 deletion
```diff
@@ -334,7 +334,10 @@ class Params:
 class BpeVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
-        self.vocab = self.bpe_tokenizer["model"]["vocab"]
+        try:
+            self.vocab = self.bpe_tokenizer["model"]["vocab"]
+        except:
+            self.vocab = self.bpe_tokenizer
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:
             # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
```