# Mirror of https://github.com/THUDM/CodeGeeX.git
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# copied from fairseq/fairseq/data/indexed_dataset.py
# Removed IndexedRawTextDataset since it relied on Fairseq dictionary
# other slight modifications to remove fairseq dependencies
# Added document index to index file and made it accessible.
# An empty sentence no longer separates documents.

# copied from Megatron
# Used for building mmap datasets.
|
import shutil
import struct
from functools import lru_cache

import numpy as np
|
# Only the builder factory is part of this module's public API.
__all__ = ["make_mmap_builder"]
|
|
|
|
# Mapping from the 1-byte dtype code persisted in .idx index files to the
# corresponding numpy/Python scalar type. These codes are part of the
# on-disk format and must never be renumbered.
dtypes = {
    1: np.uint8,
    2: np.int8,
    3: np.int16,
    4: np.int32,
    5: np.int64,
    # np.float was an alias for the builtin float (i.e. float64) and was
    # removed in NumPy 1.24; use the builtin directly so code 6 keeps
    # working with identical semantics.
    6: float,
    7: np.double,
    8: np.uint16,
}
|
|
|
|
|
|
def __best_fitting_dtype(vocab_size=None):
    """Return the narrowest integer dtype able to hold token ids below *vocab_size*.

    Falls back to ``np.int32`` when the vocabulary size is unknown.
    """
    if vocab_size is None:
        return np.int32
    # 65500 leaves a little headroom below the uint16 maximum of 65535.
    return np.uint16 if vocab_size < 65500 else np.int32
|
|
|
|
|
|
def make_mmap_builder(out_file, vocab_size=None):
    """Create an ``MMapIndexedDatasetBuilder`` writing to *out_file*.

    The token dtype is chosen as the smallest type that can represent
    *vocab_size* distinct ids (``np.int32`` when unspecified).
    """
    best_dtype = __best_fitting_dtype(vocab_size)
    return MMapIndexedDatasetBuilder(out_file, dtype=best_dtype)
|
|
|
|
|
|
def code(dtype):
    """Return the on-disk integer code registered for *dtype* in ``dtypes``.

    Raises ``ValueError`` when *dtype* has no registered code.
    """
    for dtype_code, registered in dtypes.items():
        if registered == dtype:
            return dtype_code
    raise ValueError(dtype)
|
|
|
|
|
|
def index_file_path(prefix_path):
    """Return the path of the index (.idx) file for dataset *prefix_path*."""
    return "{}.idx".format(prefix_path)
|
|
|
|
|
|
def data_file_path(prefix_path):
    """Return the path of the raw data (.bin) file for dataset *prefix_path*."""
    return "{}.bin".format(prefix_path)
|
|
|
|
|
|
class MMapIndexedDataset:
    """Namespace for the memory-mapped dataset index format.

    In this trimmed module only the nested ``Index`` class is present; it
    reads and writes the ``.idx`` companion file of a ``.bin`` data file.
    """

    class Index(object):
        """Reader/writer for the binary index file.

        On-disk layout (all integers little-endian):
          - 9-byte magic ``MMIDIDX\\x00\\x00``
          - uint64 format version (always 1)
          - uint8  dtype code (see module-level ``dtypes``)
          - uint64 number of sequences
          - uint64 number of document-index entries
          - int32[len]   per-sequence sizes (token counts)
          - int64[len]   per-sequence byte offsets into the .bin file
          - int64[docs]  document boundaries as indices into the size array
        """

        _HDR_MAGIC = b"MMIDIDX\x00\x00"

        @classmethod
        def writer(cls, path, dtype):
            """Return a context manager that writes an index file to *path*.

            *dtype* is the token dtype of the companion .bin file; it is
            recorded in the header and used to convert sizes to byte offsets.
            """

            class _Writer(object):
                def __enter__(self):
                    self._file = open(path, "wb")

                    # Header: magic, version, and the 1-byte dtype code.
                    self._file.write(cls._HDR_MAGIC)
                    self._file.write(struct.pack("<Q", 1))
                    self._file.write(struct.pack("<B", code(dtype)))

                    return self

                @staticmethod
                def _get_pointers(sizes):
                    # Convert per-sequence token counts into cumulative byte
                    # offsets into the .bin file; `dtype` is captured from
                    # the enclosing writer() call.
                    dtype_size = dtype().itemsize
                    address = 0
                    pointers = []

                    for size in sizes:
                        pointers.append(address)
                        address += size * dtype_size

                    return pointers

                def write(self, sizes, doc_idx):
                    """Write counts, sizes, pointers, and doc index, in that
                    exact order (the reader relies on it)."""
                    pointers = self._get_pointers(sizes)

                    self._file.write(struct.pack("<Q", len(sizes)))
                    self._file.write(struct.pack("<Q", len(doc_idx)))

                    sizes = np.array(sizes, dtype=np.int32)
                    self._file.write(sizes.tobytes(order="C"))
                    del sizes

                    pointers = np.array(pointers, dtype=np.int64)
                    self._file.write(pointers.tobytes(order="C"))
                    del pointers

                    doc_idx = np.array(doc_idx, dtype=np.int64)
                    self._file.write(doc_idx.tobytes(order="C"))

                def __exit__(self, exc_type, exc_val, exc_tb):
                    self._file.close()

            return _Writer()

        def __init__(self, path, skip_warmup=False):
            # NOTE(review): `skip_warmup` is unused here; the upstream
            # version used it to pre-touch the mmap pages. Kept for
            # signature compatibility — confirm callers before removing.
            with open(path, "rb") as stream:
                magic_test = stream.read(9)
                assert self._HDR_MAGIC == magic_test, (
                    "Index file doesn't match expected format. "
                    "Make sure that --dataset-impl is configured properly."
                )
                # struct.unpack returns a tuple, hence the (1,) comparison.
                version = struct.unpack("<Q", stream.read(8))
                assert (1,) == version

                (dtype_code,) = struct.unpack("<B", stream.read(1))
                self._dtype = dtypes[dtype_code]
                self._dtype_size = self._dtype().itemsize

                self._len = struct.unpack("<Q", stream.read(8))[0]
                self._doc_count = struct.unpack("<Q", stream.read(8))[0]
                # Byte offset where the sizes array begins.
                offset = stream.tell()

            # Map the whole index file and expose the three arrays as
            # zero-copy views at their respective offsets.
            self._bin_buffer_mmap = np.memmap(path, mode="r", order="C")
            self._bin_buffer = memoryview(self._bin_buffer_mmap)
            self._sizes = np.frombuffer(
                self._bin_buffer, dtype=np.int32, count=self._len, offset=offset
            )
            self._pointers = np.frombuffer(
                self._bin_buffer,
                dtype=np.int64,
                count=self._len,
                offset=offset + self._sizes.nbytes,
            )
            self._doc_idx = np.frombuffer(
                self._bin_buffer,
                dtype=np.int64,
                count=self._doc_count,
                offset=offset + self._sizes.nbytes + self._pointers.nbytes,
            )

        def __del__(self):
            # Close the underlying mmap explicitly; dropping the reference
            # alone would leave the file handle to the GC's discretion.
            self._bin_buffer_mmap._mmap.close()
            del self._bin_buffer_mmap

        @property
        def dtype(self):
            # Token dtype of the companion .bin file.
            return self._dtype

        @property
        def sizes(self):
            # int32 array of per-sequence token counts.
            return self._sizes

        @property
        def doc_idx(self):
            # int64 array of document boundaries (indices into `sizes`).
            return self._doc_idx

        # NOTE(review): lru_cache on an instance method keys on `self` and
        # keeps Index instances alive for the cache lifetime (ruff B019);
        # kept as-is to preserve behavior.
        @lru_cache(maxsize=8)
        def __getitem__(self, i):
            # (byte offset into .bin, token count) of sequence i.
            return self._pointers[i], self._sizes[i]

        def __len__(self):
            # Number of sequences in the dataset.
            return self._len
|
|
|
|
|
|
class MMapIndexedDatasetBuilder(object):
    """Incrementally builds a memory-mapped indexed dataset.

    Token arrays are streamed into a raw ``.bin`` file as they arrive;
    the companion ``.idx`` file is written once in :meth:`finalize`.
    """

    def __init__(self, out_file, dtype=np.int64):
        # Raw token data, written back-to-back with no framing.
        self._data_file = open(out_file, "wb")
        self._dtype = dtype
        # Per-sequence token counts.
        self._sizes = []
        # Document boundaries as indices into self._sizes; the leading 0 is
        # the implicit boundary before the first document.
        self._doc_idx = [0]

    def add_item(self, tensor):
        """Append one sequence (a torch tensor of token ids) to the data file."""
        np_array = np.array(tensor.numpy(), dtype=self._dtype)
        self._data_file.write(np_array.tobytes(order="C"))
        self._sizes.append(np_array.size)

    def end_document(self):
        """Record that the sequences added so far complete a document."""
        self._doc_idx.append(len(self._sizes))

    def merge_file_(self, another_file):
        """Append the dataset at path prefix *another_file* to this builder.

        The other dataset must have been written with the same token dtype.
        """
        # Concatenate index
        index = MMapIndexedDataset.Index(index_file_path(another_file))
        assert index.dtype == self._dtype

        # Bug fix: previously the merged file's document index was dropped,
        # losing all document boundaries of the appended dataset. Carry it
        # over, shifted by the number of sequences already written, and skip
        # the incoming leading 0 sentinel (we already have one).
        sequence_offset = len(self._sizes)
        for size in index.sizes:
            self._sizes.append(size)
        for doc_offset in index.doc_idx[1:]:
            self._doc_idx.append(sequence_offset + int(doc_offset))

        # Concatenate data
        with open(data_file_path(another_file), "rb") as f:
            shutil.copyfileobj(f, self._data_file)

    def finalize(self, index_file):
        """Close the data file and write the companion index file."""
        self._data_file.close()

        with MMapIndexedDataset.Index.writer(index_file, self._dtype) as index:
            index.write(self._sizes, self._doc_idx)
|