# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""CodeGeeX tokenizers."""
from abc import ABC
from abc import abstractmethod
from .gpt2_tokenization import GPT2Tokenizer
from transformers import AutoTokenizer
def encode_whitespaces(text, start_extra_id: int, max_len: int):
    """Encode whitespaces to extra tokens in GPT-J.

    >>> encode_whitespaces('a\\n  b\\n   c', 10, 10)
    'a\\n<|extratoken_10|>b\\n<|extratoken_11|>c'
    """

    def push_acc_space(acc_len: int, text: str):
        if acc_len == 0:
            return text
        if acc_len == 1:
            return text + " "
        assert (
            acc_len <= max_len
        ), f"Max whitespace run length {max_len}, but found {acc_len}"
        extra_id = start_extra_id - 2 + acc_len
        extra_token = f"<|extratoken_{extra_id}|>"
        return text + extra_token

    acc_len = 0
    res = ""
    for ch in text:
        if ch == " ":
            acc_len += 1
            if acc_len == max_len:
                res = push_acc_space(acc_len, res)
                acc_len = 0
        else:
            res = push_acc_space(acc_len, res)
            acc_len = 0
            res = res + ch
    res = push_acc_space(acc_len, res)
    return res
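
# Worked example (illustrative, assuming start_extra_id=10 and max_len=10): a run
# of two spaces becomes <|extratoken_10|>, three spaces <|extratoken_11|>, and so
# on up to ten spaces (<|extratoken_18|>). A single space is kept as a literal
# space, and runs longer than max_len are split into max_len-sized chunks plus an
# encoded remainder.
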
def decode_whitespaces(text: str, start_extra_id: int, max_len: int):
    """Decode the whitespace-encoded strings produced by encode_whitespaces.

    >>> text = 'a\\n  b\\n   c'
    >>> s, l = 10, 10
    >>> text == decode_whitespaces(encode_whitespaces(text, s, l), s, l)
    True
    """
    for l in range(2, max_len + 1):
        token_id = start_extra_id - 2 + l
        token = f"<|extratoken_{token_id}|>"
        text = text.replace(token, " " * l)
    return text
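
# Decoding substitutes each extra token back with its run of spaces, e.g. with
# start_extra_id=10, "<|extratoken_12|>return" -> "    return". Only token ids in
# [start_extra_id, start_extra_id + max_len - 2] are touched; all other text is
# left unchanged.
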
def build_hgf_tokenizer(args):
    """Initialize tokenizer."""
    tokenizer_path = args.tokenizer_path
    if args.rank == 0:
        print(f"> building huggingface tokenizer from {tokenizer_path} ...", flush=True)

    assert tokenizer_path is not None, "Tokenizer path must be provided."
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    if args.rank == 0:
        print(f" > eos_token = {tokenizer.eos_token}", flush=True)

    ws_start_id = args.ws_encoding_start_id if "ws_encoding_start_id" in args else None
    ws_len = args.ws_encoding_length if "ws_encoding_length" in args else None

    return HgfTokenizerWrapper(tokenizer, ws_start=ws_start_id, ws_len=ws_len)
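
# Illustrative call (the Namespace below is a stand-in for the real Megatron args
# object; the path is a placeholder):
#   args = argparse.Namespace(tokenizer_path="/path/to/tokenizer", rank=0,
#                             ws_encoding_start_id=10, ws_encoding_length=10)
#   tokenizer = build_hgf_tokenizer(args)
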
def build_tokenizer(args):
    """Initialize tokenizer."""
    if "tokenizer_path" in args and args.tokenizer_path is not None:
        # Build a Hugging Face tokenizer.
        tokenizer = build_hgf_tokenizer(args)
        args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args)
        return tokenizer

    if args.rank == 0:
        print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True)

    # Select and instantiate the tokenizer.
    assert args.vocab_file is not None
    if args.tokenizer_type == "GPT2BPETokenizer":
        assert args.merge_file is not None
        tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
    else:
        raise NotImplementedError(
            "{} tokenizer is not implemented.".format(args.tokenizer_type)
        )

    # Add vocab size.
    args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args)
    return tokenizer
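
# Illustrative call for the GPT-2 BPE path (the Namespace stands in for the real
# Megatron args object; file paths are placeholders):
#   args = argparse.Namespace(tokenizer_type="GPT2BPETokenizer", rank=0,
#                             vocab_file="vocab.json", merge_file="merges.txt",
#                             make_vocab_size_divisible_by=128,
#                             tensor_model_parallel_size=1)
#   tokenizer = build_tokenizer(args)
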
def _vocab_size_with_padding(orig_vocab_size, args):
    """Pad the vocab size so that it is divisible by the model parallel size
    and still has a GPU-friendly size."""
    after = orig_vocab_size
    if args.make_vocab_size_divisible_by > orig_vocab_size:
        multiple = args.make_vocab_size_divisible_by
    else:
        multiple = args.make_vocab_size_divisible_by * args.tensor_model_parallel_size
    while (after % multiple) != 0:
        after += 1
    if args.rank == 0:
        print(
            " > padded vocab (size: {}) with {} dummy tokens "
            "(new size: {})".format(orig_vocab_size, after - orig_vocab_size, after),
            flush=True,
        )
    return after
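
# Example of the padding arithmetic (illustrative numbers): with a GPT-2 vocab of
# 50257 tokens, make_vocab_size_divisible_by=128 and tensor_model_parallel_size=4,
# the multiple is 128 * 4 = 512 and the vocab is padded to 50688 (431 dummy tokens).
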
class AbstractTokenizer(ABC):
    """Abstract class for tokenizer."""

    def __init__(self, name):
        self.name = name
        super().__init__()

    @property
    @abstractmethod
    def vocab_size(self):
        pass

    @property
    @abstractmethod
    def vocab(self):
        """Dictionary from vocab text token to id token."""
        pass

    @property
    @abstractmethod
    def inv_vocab(self):
        """Dictionary from vocab id token to text token."""
        pass

    @abstractmethod
    def tokenize(self, text):
        pass

    def detokenize(self, token_ids):
        raise NotImplementedError(
            "detokenizer is not implemented for {} tokenizer".format(self.name)
        )

    @property
    def cls(self):
        raise NotImplementedError(
            "CLS is not provided for {} tokenizer".format(self.name)
        )

    @property
    def sep(self):
        raise NotImplementedError(
            "SEP is not provided for {} tokenizer".format(self.name)
        )

    @property
    def pad(self):
        raise NotImplementedError(
            "PAD is not provided for {} tokenizer".format(self.name)
        )

    @property
    def eod(self):
        raise NotImplementedError(
            "EOD is not provided for {} tokenizer".format(self.name)
        )

    @property
    def mask(self):
        raise NotImplementedError(
            "MASK is not provided for {} tokenizer".format(self.name)
        )


class _GPT2BPETokenizer(AbstractTokenizer):
    """Original GPT2 BPE tokenizer."""

    def __init__(self, vocab_file, merge_file):
        name = "GPT2 BPE"
        super().__init__(name)

        self.tokenizer = GPT2Tokenizer(
            vocab_file, merge_file, errors="replace", special_tokens=[], max_len=None
        )
        self.eod_id = self.tokenizer.encoder["<|endoftext|>"]

    @property
    def vocab_size(self):
        return len(self.tokenizer.encoder)

    @property
    def vocab(self):
        return self.tokenizer.encoder

    @property
    def inv_vocab(self):
        return self.tokenizer.decoder

    def tokenize(self, text):
        return self.tokenizer.encode(text)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id


class HgfTokenizerWrapper(AbstractTokenizer):
    """Wrapper for Hugging Face tokenizer."""

    def __init__(
        self,
        tokenizer,
        ws_start: int = None,
        ws_len: int = None,
    ):
        super(HgfTokenizerWrapper, self).__init__(tokenizer.__class__.__name__)
        self.tokenizer = tokenizer
        self.ws_start = ws_start
        self.ws_len = ws_len

    def tokenize(self, text):
        if self.ws_start:
            text = encode_whitespaces(text, self.ws_start, self.ws_len)
        input_ids = self.tokenizer(text, is_split_into_words=False).input_ids
        return input_ids

    def detokenize(self, token_ids):
        text = self.tokenizer.decode(token_ids, skip_special_tokens=False)
        if self.ws_start:
            text = decode_whitespaces(text, self.ws_start, self.ws_len)
        return text

    @property
    def eod(self):
        return self.tokenizer.eos_token_id

    @property
    def inv_vocab(self):
        # Inverse mapping (id -> token), as required by AbstractTokenizer.
        return {token_id: token for token, token_id in self.tokenizer.vocab.items()}

    @property
    def vocab(self):
        return self.tokenizer.vocab

    @property
    def vocab_size(self):
        return len(self.vocab)
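

if __name__ == "__main__":
    # Minimal smoke test (illustrative only): "gpt2" is a placeholder checkpoint,
    # not the tokenizer CodeGeeX ships with, and loading it requires the
    # `transformers` package plus network access or a local model cache.
    hf_tok = AutoTokenizer.from_pretrained("gpt2")
    wrapper = HgfTokenizerWrapper(hf_tok, ws_start=10, ws_len=10)
    ids = wrapper.tokenize("def add(a, b):\n    return a + b")
    print(ids)
    print(wrapper.detokenize(ids))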