# Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. # copied from fairseq/fairseq/data/indexed_dataset.py # Removed IndexedRawTextDataset since it relied on Fairseq dictionary # other slight modifications to remove fairseq dependencies # Added document index to index file and made it accessible. # An empty sentence no longer separates documents. # copied from Megatron # Used for building mmap datasets. from functools import lru_cache import struct import numpy as np import shutil __all__ = ["make_mmap_builder"] dtypes = { 1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32, 5: np.int64, 6: np.float, 7: np.double, 8: np.uint16, } def __best_fitting_dtype(vocab_size=None): if vocab_size is not None and vocab_size < 65500: return np.uint16 else: return np.int32 def make_mmap_builder(out_file, vocab_size=None): return MMapIndexedDatasetBuilder( out_file, dtype=__best_fitting_dtype(vocab_size) ) def code(dtype): for k in dtypes.keys(): if dtypes[k] == dtype: return k raise ValueError(dtype) def index_file_path(prefix_path): return prefix_path + ".idx" def data_file_path(prefix_path): return prefix_path + ".bin" class MMapIndexedDataset: class Index(object): _HDR_MAGIC = b"MMIDIDX\x00\x00" @classmethod def writer(cls, path, dtype): class _Writer(object): def __enter__(self): self._file = open(path, "wb") self._file.write(cls._HDR_MAGIC) self._file.write(struct.pack("