# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Megatron arguments."""
|
|
|
|
|
|
import argparse
|
|
|
import os
|
|
|
|
|
|
import torch
|
|
|
import deepspeed
|
|
|
|
|
|
|
|
|
def parse_args(extra_args_provider=None, defaults=None, ignore_unknown_args=False):
    """Parse all arguments."""
    # Use None as the default to avoid a shared mutable default argument.
    if defaults is None:
        defaults = {}
    parser = argparse.ArgumentParser(
        description="Megatron-LM Arguments", allow_abbrev=False
    )

    # Standard arguments.
    parser = _add_network_size_args(parser)
    parser = _add_regularization_args(parser)
    parser = _add_training_args(parser)
    parser = _add_initialization_args(parser)
    parser = _add_learning_rate_args(parser)
    parser = _add_checkpointing_args(parser)
    parser = _add_mixed_precision_args(parser)
    parser = _add_distributed_args(parser)
    parser = _add_validation_args(parser)
    parser = _add_data_args(parser)
    parser = _add_autoresume_args(parser)
    parser = _add_biencoder_args(parser)
    parser = _add_vit_args(parser)
    parser = _add_logging_args(parser)
    parser = _add_zero_args(parser)
    parser = _add_memoryopt_args(parser)
    parser = _add_activation_checkpoint_args(parser)
    parser = _add_inference_args(parser)

    # Custom arguments.
    if extra_args_provider is not None:
        parser = extra_args_provider(parser)

    parser = deepspeed.add_config_arguments(parser)

    # Parse.
    if ignore_unknown_args:
        args, _ = parser.parse_known_args()
    else:
        args = parser.parse_args()

    # Helper flag recording whether DeepSpeed pipeline parallelism is enabled.
    args.ds_pipeline_enabled = not args.no_pipeline_parallel

    # Distributed args.
    args.rank = int(os.getenv("RANK", "0"))
    args.world_size = int(os.getenv("WORLD_SIZE", "1"))
    # Tensor model parallel size.
    args.tensor_model_parallel_size = min(
        args.tensor_model_parallel_size, args.world_size
    )
    assert args.world_size % args.tensor_model_parallel_size == 0, (
        "world size ({}) is not divisible by tensor model parallel size ({})".format(
            args.world_size, args.tensor_model_parallel_size
        )
    )
    # Pipeline model parallel size.
    args.pipeline_model_parallel_size = min(
        args.pipeline_model_parallel_size,
        (args.world_size // args.tensor_model_parallel_size),
    )
    # Checks.
    if args.no_pipeline_parallel:
        assert (
            args.pipeline_model_parallel_size == 1
        ), "pipeline_model_parallel_size must be 1 if pipeline parallel is disabled"
    model_parallel_size = (
        args.pipeline_model_parallel_size * args.tensor_model_parallel_size
    )
    assert args.world_size % model_parallel_size == 0, (
        "world size ({}) is not divisible by tensor parallel size ({}) times "
        "pipeline parallel size ({})".format(
            args.world_size,
            args.tensor_model_parallel_size,
            args.pipeline_model_parallel_size,
        )
    )
    args.data_parallel_size = args.world_size // model_parallel_size
    if args.rank == 0:
        print(
            "using world size: {}, data-parallel-size: {}, "
            "tensor-model-parallel size: {}, "
            "pipeline-model-parallel size: {}".format(
                args.world_size,
                args.data_parallel_size,
                args.tensor_model_parallel_size,
                args.pipeline_model_parallel_size,
            ),
            flush=True,
        )
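    # Illustrative arithmetic (hypothetical values): with world_size=16,
    # tensor_model_parallel_size=2, and pipeline_model_parallel_size=4,
    # model_parallel_size is 2 * 4 = 8 and data_parallel_size is 16 // 8 = 2.
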
    # Deprecated arguments.
    assert args.batch_size is None, (
        "--batch-size argument is no longer valid, use --micro-batch-size instead"
    )
    del args.batch_size
    assert args.warmup is None, (
        "--warmup argument is no longer valid, use --lr-warmup-fraction instead"
    )
    del args.warmup
    assert args.model_parallel_size is None, (
        "--model-parallel-size is no longer valid, "
        "use --tensor-model-parallel-size instead"
    )
    del args.model_parallel_size

    # Set input defaults.
    for key in defaults:
        # For a default to be valid, it should not be provided in the
        # arguments passed to the program. We check this by ensuring
        # the arg is set to None.
        if getattr(args, key) is not None:
            if args.force_default:
                print(
                    "WARNING: overriding arguments for {key}:{v2} "
                    "with default {key}:{v}".format(
                        key=key, v=defaults[key], v2=getattr(args, key)
                    ),
                    flush=True,
                )
                setattr(args, key, defaults[key])
            else:
                if args.rank == 0:
                    print(
                        "WARNING: overriding default arguments for {key}:{v} "
                        "with {key}:{v2}".format(
                            key=key, v=defaults[key], v2=getattr(args, key)
                        ),
                        flush=True,
                    )
        else:
            setattr(args, key, defaults[key])
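    # Illustration (hypothetical values): with defaults={"seq_length": 2048} and
    # a user-supplied --seq-length 1024, --force-default resets args.seq_length
    # to 2048; otherwise the user's 1024 is kept and only a warning is printed.
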
    # Batch size.
    assert args.micro_batch_size is not None
    assert args.micro_batch_size > 0
    if args.global_batch_size is None:
        args.global_batch_size = args.micro_batch_size * args.data_parallel_size
        if args.rank == 0:
            print(
                "setting global batch size to {}".format(args.global_batch_size),
                flush=True,
            )
    assert args.global_batch_size > 0
    if args.num_layers_per_virtual_pipeline_stage is not None:
        assert args.pipeline_model_parallel_size > 2, (
            "pipeline-model-parallel size should be greater than 2 with "
            "interleaved schedule"
        )
        assert args.num_layers % args.num_layers_per_virtual_pipeline_stage == 0, (
            "number of layers is not divisible by number of layers per virtual "
            "pipeline stage"
        )
        args.virtual_pipeline_model_parallel_size = (
            args.num_layers // args.pipeline_model_parallel_size
        ) // args.num_layers_per_virtual_pipeline_stage
    else:
        args.virtual_pipeline_model_parallel_size = None
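    # Illustrative arithmetic (hypothetical values): with num_layers=24,
    # pipeline_model_parallel_size=4, and num_layers_per_virtual_pipeline_stage=3,
    # each pipeline stage holds 24 // 4 = 6 layers, giving 6 // 3 = 2 virtual stages.
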
    # Parameters dtype.
    args.params_dtype = torch.float
    if args.fp16:
        assert not args.bf16
        args.params_dtype = torch.half
    if args.bf16:
        assert not args.fp16
        args.params_dtype = torch.bfloat16
        # bfloat16 requires gradient accumulation and all-reduce to
        # be done in fp32.
        if not args.accumulate_allreduce_grads_in_fp32:
            args.accumulate_allreduce_grads_in_fp32 = True
            if args.rank == 0:
                print(
                    "accumulate and all-reduce gradients in fp32 for "
                    "bfloat16 data type.",
                    flush=True,
                )

    if args.rank == 0:
        print("using {} for parameters ...".format(args.params_dtype), flush=True)

    # If we accumulate and all-reduce gradients in fp32, we need local DDP
    # and we should use contiguous buffers in DDP.
    if args.accumulate_allreduce_grads_in_fp32:
        assert args.DDP_impl == "local"
        args.use_contiguous_buffers_in_ddp = True

    if args.dataloader_type is None:
        args.dataloader_type = "single"

    # Consumed samples and tokens.
    args.consumed_train_samples = 0
    args.consumed_valid_samples = 0
    args.consumed_train_tokens = 0

    # Iteration-based training.
    if args.train_iters:
        # If we use iteration-based training, make sure the
        # sample-based options are off.
        assert args.train_samples is None, "expected iteration-based training"
        assert (
            args.lr_decay_samples is None
        ), "expected iteration-based learning rate decay"
        assert (
            args.lr_warmup_samples == 0
        ), "expected iteration-based learning rate warmup"
        assert (
            args.rampup_batch_size is None
        ), "expected no batch-size rampup for iteration-based training"
        if args.lr_warmup_fraction is not None:
            assert (
                args.lr_warmup_iters == 0
            ), "can only specify one of lr-warmup-fraction and lr-warmup-iters"

    # Sample-based training.
    if args.train_samples:
        # If we use sample-based training, make sure the
        # iteration-based options are off.
        assert args.train_iters is None, "expected sample-based training"
        assert args.lr_decay_iters is None, "expected sample-based learning rate decay"
        assert args.lr_warmup_iters == 0, "expected sample-based learning rate warmup"
        if args.lr_warmup_fraction is not None:
            assert args.lr_warmup_samples == 0, (
                "can only specify one of lr-warmup-fraction and lr-warmup-samples"
            )

    # Check required arguments.
    required_args = [
        "num_layers",
        "hidden_size",
        "num_attention_heads",
        "max_position_embeddings",
    ]
    for req_arg in required_args:
        _check_arg_is_not_none(args, req_arg)

    # args.learned_position_embeddings = args.learned_position_embeddings > 0

    # Checks.
    if args.ffn_hidden_size is None:
        args.ffn_hidden_size = 4 * args.hidden_size

    if args.kv_channels is None:
        assert args.hidden_size % args.num_attention_heads == 0
        args.kv_channels = args.hidden_size // args.num_attention_heads

    if args.seq_length is not None:
        assert args.encoder_seq_length is None
        args.encoder_seq_length = args.seq_length
    else:
        assert args.encoder_seq_length is not None
        args.seq_length = args.encoder_seq_length

    if args.seq_length is not None:
        assert args.max_position_embeddings >= args.seq_length
    if args.decoder_seq_length is not None:
        assert args.max_position_embeddings >= args.decoder_seq_length
    if args.lr is not None:
        assert args.min_lr <= args.lr
    if args.save is not None:
        assert args.save_interval is not None
    # Mixed precision checks.
    if args.fp16_lm_cross_entropy:
        assert args.fp16, "lm cross entropy in fp16 is only supported in fp16 mode."
    if args.fp32_residual_connection:
        assert (
            args.fp16 or args.bf16
        ), "residual connection in fp32 is only supported when using fp16 or bf16."
    # Activation checkpointing.
    if args.distribute_checkpointed_activations:
        assert args.checkpoint_activations, (
            "for distribute-checkpointed-activations to work you "
            "need to enable checkpoint-activations"
        )

    _print_args(args)
    return args

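# Example usage (an illustrative sketch, not part of the module): a caller can
# extend the parser and seed defaults before parsing. The provider function,
# the --my-flag option, and the seq_length default below are hypothetical.
#
#     def my_extra_args(parser):
#         group = parser.add_argument_group(title="my tool")
#         group.add_argument("--my-flag", action="store_true")
#         return parser
#
#     args = parse_args(extra_args_provider=my_extra_args,
#                       defaults={"seq_length": 2048})
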
def _print_args(args):
    """Print arguments."""
    if args.rank == 0:
        print("------------------------ arguments ------------------------", flush=True)
        str_list = []
        for arg in vars(args):
            dots = "." * (48 - len(arg))
            str_list.append(" {} {} {}".format(arg, dots, getattr(args, arg)))
        for arg in sorted(str_list, key=lambda x: x.lower()):
            print(arg, flush=True)
        print("-------------------- end of arguments ---------------------", flush=True)


def _check_arg_is_not_none(args, arg):
    assert getattr(args, arg) is not None, "{} argument is None".format(arg)


def _add_network_size_args(parser):
    group = parser.add_argument_group(title="network size")

    group.add_argument(
        "--num-layers",
        type=int,
        default=None,
        help="Number of transformer layers.",
    )
    group.add_argument(
        "--hidden-size",
        type=int,
        default=None,
        help="Transformer hidden size.",
    )
    group.add_argument(
        "--reward-growth",
        type=str,
        default="constant",
        choices=["constant", "linear", "quadratic"],
        help="Reward growth function.",
    )
    group.add_argument(
        "--ffn-hidden-size",
        type=int,
        default=None,
        help="Transformer Feed-Forward Network hidden size. "
        "This is set to 4*hidden-size if not provided.",
    )
    group.add_argument(
        "--num-attention-heads",
        type=int,
        default=None,
        help="Number of transformer attention heads.",
    )
    group.add_argument(
        "--kv-channels",
        type=int,
        default=None,
        help="Projection weights dimension in multi-head attention. "
        "This is set to hidden-size // num-attention-heads "
        "if not provided.",
    )
    group.add_argument(
        "--scale-embeddings",
        action="store_true",
        help="Scale embeddings by sqrt(d_model).",
    )
    group.add_argument(
        "--max-position-embeddings",
        type=int,
        default=None,
        help="Maximum number of position embeddings to use. "
        "This is the size of the position embedding.",
    )
    group.add_argument(
        "--no-learned-position-embeddings",
        action="store_true",
        help="Do not learn position embeddings.",
    )
    group.add_argument(
        "--make-vocab-size-divisible-by",
        type=int,
        default=128,
        help="Pad the vocab size to be divisible by this value. "
        "This is added for computational efficiency reasons.",
    )
    group.add_argument(
        "--layernorm-epsilon", type=float, default=1e-5, help="Layer norm epsilon."
    )
    group.add_argument(
        "--apply-residual-connection-post-layernorm",
        action="store_true",
        help="If set, use the original BERT residual connection ordering.",
    )
    group.add_argument(
        "--scaled-upper-triang-masked-softmax-fusion",
        action="store_true",
        help="Enable fusion of query_key_value scaling, "
        "time (upper diagonal) masking, and softmax.",
    )
    group.add_argument(
        "--openai-gelu",
        action="store_true",
        help="Use OpenAI's GeLU implementation. This option "
        "should not be used unless for backward compatibility "
        "reasons.",
    )
    group.add_argument(
        "--onnx-safe",
        type=bool,
        required=False,
        help="Use workarounds for known problems with the Torch ONNX exporter.",
    )
    group.add_argument(
        "--bert-no-binary-head",
        action="store_false",
        help="Disable BERT binary head.",
        dest="bert_binary_head",
    )

    return parser


def _add_logging_args(parser):
    group = parser.add_argument_group(title="logging")

    group.add_argument(
        "--log-params-norm",
        action="store_true",
        help="If set, calculate and log parameters norm.",
    )
    group.add_argument(
        "--log-num-zeros-in-grad",
        action="store_true",
        help="If set, calculate and log the number of zeros in gradient.",
    )
    group.add_argument(
        "--tensorboard-log-interval",
        type=int,
        default=1,
        help="Report to tensorboard interval.",
    )
    group.add_argument(
        "--tensorboard-queue-size",
        type=int,
        default=1000,
        help="Size of the tensorboard queue for pending events "
        "and summaries before one of the 'add' calls forces a "
        "flush to disk.",
    )
    group.add_argument(
        "--log-timers-to-tensorboard",
        action="store_true",
        help="If set, write timers to tensorboard.",
    )
    group.add_argument(
        "--log-batch-size-to-tensorboard",
        action="store_true",
        help="If set, write batch-size to tensorboard.",
    )
    group.add_argument(
        "--no-log-learnig-rate-to-tensorboard",
        action="store_false",
        help="Disable learning rate logging to tensorboard.",
        dest="log_learning_rate_to_tensorboard",
    )
    group.add_argument(
        "--no-log-loss-scale-to-tensorboard",
        action="store_false",
        help="Disable loss-scale logging to tensorboard.",
        dest="log_loss_scale_to_tensorboard",
    )
    group.add_argument(
        "--log-validation-ppl-to-tensorboard",
        action="store_true",
        help="If set, write validation perplexity to tensorboard.",
    )
    group.add_argument(
        "--wandb-logging",
        action="store_true",
        help="If set, log training progress to wandb.",
    )
    group.add_argument(
        "--wandb-log-interval",
        type=int,
        default=1,
        help="Log to wandb every N steps.",
    )

    return parser


def _add_regularization_args(parser):
    group = parser.add_argument_group(title="regularization")

    group.add_argument(
        "--attention-dropout",
        type=float,
        default=0.1,
        help="Post attention dropout probability.",
    )
    group.add_argument(
        "--hidden-dropout",
        type=float,
        default=0.1,
        help="Dropout probability for hidden state transformer.",
    )
    group.add_argument(
        "--weight-decay",
        type=float,
        default=0.01,
        help="Weight decay coefficient for L2 regularization.",
    )
    group.add_argument(
        "--tempering",
        type=float,
        default=None,
        help="Tempering coefficient for the model.",
    )
    group.add_argument(
        "--gold",
        action="store_true",
        help="If set, use GOLD regularization.",
    )
    group.add_argument(
        "--gold-beta",
        type=float,
        default=0.05,
        help="Beta for GOLD tempering.",
    )
    group.add_argument(
        "--play-tau",
        type=float,
        default=2.0,
    )
    group.add_argument(
        "--clip-grad",
        type=float,
        default=1.0,
        help="Gradient clipping based on global L2 norm.",
    )
    group.add_argument(
        "--adam-beta1",
        type=float,
        default=0.9,
        help="First coefficient for computing running averages "
        "of gradient and its square.",
    )
    group.add_argument(
        "--adam-beta2",
        type=float,
        default=0.999,
        help="Second coefficient for computing running averages "
        "of gradient and its square.",
    )
    group.add_argument(
        "--adam-eps",
        type=float,
        default=1e-08,
        help="Term added to the denominator to improve numerical stability.",
    )
    group.add_argument(
        "--sgd-momentum", type=float, default=0.9, help="Momentum factor for sgd."
    )

    return parser


def _add_training_args(parser):
    group = parser.add_argument_group(title="training")

    group.add_argument(
        "--micro-batch-size",
        type=int,
        default=None,
        help="Batch size per model instance (local batch size). "
        "Global batch size is local batch size times data "
        "parallel size times number of micro batches.",
    )
    group.add_argument(
        "--batch-size",
        type=int,
        default=None,
        help="Old batch size parameter, do not use. Use --micro-batch-size instead.",
    )
    group.add_argument(
        "--global-batch-size",
        type=int,
        default=None,
        help="Training batch size. If set, it should be a "
        "multiple of micro-batch-size times data-parallel-size. "
        "If this value is None, then "
        "micro-batch-size * data-parallel-size is used as the "
        "global batch size. This choice will result in 1 for "
        "number of micro-batches.",
    )
    group.add_argument(
        "--rampup-batch-size",
        nargs="*",
        default=None,
        help="Batch size ramp up with the following values: "
        "--rampup-batch-size <start batch size> "
        "<batch size increment> <ramp-up samples>. "
        "For example: "
        "--rampup-batch-size 16 8 300000 --global-batch-size 1024 "
        "will start with global batch size 16 and over "
        "(1024 - 16) / 8 = 126 intervals will increase "
        "the batch size linearly to 1024. In each interval "
        "we will use approximately 300000 / 126 = 2380 samples.",
    )
    group.add_argument(
        "--checkpoint-activations",
        action="store_true",
        help="Checkpoint activations to allow for training "
        "with larger models, sequences, and batch sizes.",
    )
    group.add_argument(
        "--distribute-checkpointed-activations",
        action="store_true",
        help="If set, distribute checkpointed activations "
        "across the model parallel group.",
    )
    group.add_argument(
        "--checkpoint-num-layers",
        type=int,
        default=1,
        help="Chunk size (number of layers) for checkpointing.",
    )
    group.add_argument(
        "--train-iters",
        type=int,
        default=None,
        help="Total number of iterations to train over all "
        "training runs. Note that either train-iters or "
        "train-samples should be provided.",
    )
    group.add_argument(
        "--train-samples",
        type=int,
        default=None,
        help="Total number of samples to train over all "
        "training runs. Note that either train-iters or "
        "train-samples should be provided.",
    )
    group.add_argument(
        "--train-tokens",
        type=int,
        default=None,
        help="Total number of tokens to train over all training runs.",
    )
    group.add_argument(
        "--log-interval", type=int, default=100, help="Report loss and timing interval."
    )
    group.add_argument(
        "--exit-interval",
        type=int,
        default=None,
        help="Exit the program after the iteration is divisible by this value.",
    )
    group.add_argument(
        "--exit-duration-in-mins",
        type=int,
        default=None,
        help="Exit the program after this many minutes.",
    )
    group.add_argument(
        "--tensorboard-dir",
        type=str,
        default=None,
        help="Write TensorBoard logs to this directory.",
    )
    group.add_argument(
        "--no-masked-softmax-fusion",
        action="store_false",
        help="Disable fusion of query_key_value scaling, masking, and softmax.",
        dest="masked_softmax_fusion",
    )
    group.add_argument(
        "--no-bias-gelu-fusion",
        action="store_false",
        help="Disable bias and gelu fusion.",
        dest="bias_gelu_fusion",
    )
    group.add_argument(
        "--no-bias-dropout-fusion",
        action="store_false",
        help="Disable bias and dropout fusion.",
        dest="bias_dropout_fusion",
    )
    group.add_argument(
        "--optimizer",
        type=str,
        default="adam",
        choices=["adam", "sgd"],
        help="Optimizer function.",
    )
    group.add_argument(
        "--dataloader-type",
        type=str,
        default=None,
        choices=["single", "cyclic"],
        help="Single pass vs multiple pass data loader.",
    )
    group.add_argument(
        "--cpu-optimizer", action="store_true", help="Run optimizer on CPU."
    )
    group.add_argument(
        "--cpu_torch_adam",
        action="store_true",
        help="Use Torch Adam as optimizer on CPU.",
    )
    group.add_argument(
        "--no-pipeline-parallel",
        action="store_true",
        help="Disable pipeline parallelism.",
    )
    group.add_argument(
        "--ms-model",
        action="store_true",
        help="Use a model converted from MindSpore.",
    )

    return parser


def _add_initialization_args(parser):
    group = parser.add_argument_group(title="initialization")

    group.add_argument(
        "--seed",
        type=int,
        default=1234,
        help="Random seed used for python, numpy, pytorch, and cuda.",
    )
    group.add_argument(
        "--init-method-std",
        type=float,
        default=0.02,
        help="Standard deviation of the zero mean normal "
        "distribution used for weight initialization.",
    )
    group.add_argument(
        "--init-method-xavier-uniform",
        action="store_true",
        help="Enable Xavier uniform parameter initialization.",
    )

    return parser


def _add_inference_args(parser):
    group = parser.add_argument_group(title="inference")

    group.add_argument(
        "--beam-warmup",
        action="store_true",
    )
    group.add_argument(
        "--beam-warmup-length",
        type=int,
        default=0,
    )
    group.add_argument(
        "--beam-search",
        action="store_true",
    )
    group.add_argument(
        "--beam-search-nucleus",
        action="store_true",
    )
    group.add_argument(
        "--num-beams",
        type=int,
        default=4,
    )

    return parser


def _add_learning_rate_args(parser):
    group = parser.add_argument_group(title="learning rate")

    group.add_argument(
        "--lr",
        type=float,
        default=None,
        help="Initial learning rate. Depending on decay style "
        "and initial warmup, the learning rate at each "
        "iteration would be different.",
    )
    group.add_argument(
        "--lr-decay-style",
        type=str,
        default="linear",
        choices=["constant", "linear", "cosine"],
        help="Learning rate decay function.",
    )
    group.add_argument(
        "--lr-decay-iters",
        type=int,
        default=None,
        help="Number of iterations to decay learning rate over. "
        "If None, defaults to `--train-iters`.",
    )
    group.add_argument(
        "--lr-decay-samples",
        type=int,
        default=None,
        help="Number of samples to decay learning rate over. "
        "If None, defaults to `--train-samples`.",
    )
    group.add_argument(
        "--lr-decay-tokens",
        type=int,
        default=None,
        help="Number of tokens to decay learning rate over. "
        "If not None, overrides iter/sample-based decay.",
    )
    group.add_argument(
        "--lr-warmup-fraction",
        type=float,
        default=None,
        help="Fraction of lr-warmup-(iters/samples) to use for warmup (as a float).",
    )
    group.add_argument(
        "--lr-warmup-iters",
        type=int,
        default=0,
        help="Number of iterations to linearly warm up the learning rate over.",
    )
    group.add_argument(
        "--lr-warmup-samples",
        type=int,
        default=0,
        help="Number of samples to linearly warm up the learning rate over.",
    )
    group.add_argument(
        "--warmup",
        type=int,
        default=None,
        help="Old lr warmup argument, do not use. Use one of the "
        "--lr-warmup-* arguments above.",
    )
    group.add_argument(
        "--min-lr",
        type=float,
        default=0.0,
        help="Minimum value for learning rate. The scheduler "
        "clips values below this threshold.",
    )
    group.add_argument(
        "--override-lr-scheduler",
        action="store_true",
        help="Reset the values of the scheduler (learning rate, "
        "warmup iterations, minimum learning rate, maximum "
        "number of iterations, and decay style) from input "
        "arguments and ignore values from checkpoints. Note "
        "that all the above values will be reset.",
    )
    group.add_argument(
        "--use-checkpoint-lr-scheduler",
        action="store_true",
        help="Use checkpoint to set the values of the scheduler "
        "(learning rate, warmup iterations, minimum learning "
        "rate, maximum number of iterations, and decay style) "
        "from checkpoint and ignore input arguments.",
    )

    return parser


def _add_checkpointing_args(parser):
    group = parser.add_argument_group(title="checkpointing")

    group.add_argument(
        "--save",
        type=str,
        default=None,
        help="Output directory to save checkpoints to.",
    )
    group.add_argument(
        "--save-interval",
        type=int,
        default=None,
        help="Number of iterations between checkpoint saves.",
    )
    group.add_argument(
        "--no-save-optim",
        action="store_true",
        default=None,
        help="Do not save current optimizer.",
    )
    group.add_argument(
        "--no-save-rng",
        action="store_true",
        default=None,
        help="Do not save current rng state.",
    )
    group.add_argument(
        "--load",
        type=str,
        default=None,
        help="Directory containing a model checkpoint.",
    )
    group.add_argument(
        "--low-memory-load",
        action="store_true",
        default=None,
        help="Load the model checkpoint in low memory mode. "
        "On each machine, workers load the checkpoint one at a time.",
    )
    group.add_argument(
        "--dist-timeout",
        type=int,
        default=30,
        help="Timeout for the PyTorch distributed backend (in minutes).",
    )
    group.add_argument(
        "--load-state",
        type=str,
        default=None,
        help="Start training from an existing model state.",
    )
    group.add_argument(
        "--no-load-optim",
        action="store_true",
        default=None,
        help="Do not load optimizer when loading checkpoint.",
    )
    group.add_argument(
        "--no-load-rng",
        action="store_true",
        default=None,
        help="Do not load rng state when loading checkpoint.",
    )
    group.add_argument(
        "--finetune",
        action="store_true",
        help="Load model for finetuning. Do not load optimizer "
        "or rng state from checkpoint and set iteration to 0. "
        "Assumed when loading a release checkpoint.",
    )

    return parser


def _add_mixed_precision_args(parser):
    group = parser.add_argument_group(title="mixed precision")

    group.add_argument("--fp16", action="store_true", help="Run model in fp16 mode.")
    group.add_argument(
        "--ln-fp16", action="store_true", help="Run layernorm in fp16 mode."
    )
    group.add_argument(
        "--bf16", action="store_true", help="Run model in bfloat16 mode."
    )
    group.add_argument(
        "--loss-scale",
        type=float,
        default=None,
        help="Static loss scaling; positive power-of-2 "
        "values can improve fp16 convergence. If None, dynamic "
        "loss scaling is used.",
    )
    group.add_argument(
        "--initial-loss-scale",
        type=float,
        default=2 ** 32,
        help="Initial loss-scale for dynamic loss scaling.",
    )
    group.add_argument(
        "--min-loss-scale",
        type=float,
        default=1.0,
        help="Minimum loss scale for dynamic loss scaling.",
    )
    group.add_argument(
        "--loss-scale-window",
        type=float,
        default=1000,
        help="Window over which to raise/lower dynamic scale.",
    )
    group.add_argument(
        "--hysteresis", type=int, default=2, help="Hysteresis for dynamic loss scaling."
    )
    group.add_argument(
        "--fp32-residual-connection",
        action="store_true",
        help="Move residual connections to fp32.",
    )
    group.add_argument(
        "--apply-query-key-layer-scaling",
        action="store_true",
        help="Scale Q * K^T by 1 / layer-number. If this flag "
        "is set, then it will automatically set "
        "attention-softmax-in-fp32 to true.",
    )
    group.add_argument(
        "--attention-softmax-in-fp32",
        action="store_true",
        help="Run attention masking and softmax in fp32. "
        "This flag is ignored unless "
        "--no-query-key-layer-scaling is specified.",
    )
    group.add_argument(
        "--accumulate-allreduce-grads-in-fp32",
        action="store_true",
        help="Gradient accumulation and all-reduce in fp32.",
    )
    group.add_argument(
        "--fp16-lm-cross-entropy",
        action="store_true",
        help="Move the cross entropy unreduced loss calculation "
        "for lm head to fp16.",
    )

    return parser

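
# Example invocation (an illustrative sketch; the values below are hypothetical):
# fp16 training with dynamic loss scaling could be tuned via
#
#     --fp16 --initial-loss-scale 65536 --min-loss-scale 1 --loss-scale-window 500
#
# Leaving --loss-scale unset keeps dynamic loss scaling enabled.
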
def _add_distributed_args(parser):
    group = parser.add_argument_group(title="distributed")

    group.add_argument(
        "--tensor-model-parallel-size",
        type=int,
        default=1,
        help="Degree of tensor model parallelism.",
    )
    group.add_argument(
        "--pipeline-model-parallel-size",
        type=int,
        default=1,
        help="Degree of pipeline model parallelism.",
    )
    group.add_argument(
        "--model-parallel-size",
        type=int,
        default=None,
        help="Old model parallel argument, do not use. Use "
        "--tensor-model-parallel-size instead.",
    )
    group.add_argument(
        "--num-layers-per-virtual-pipeline-stage",
        type=int,
        default=None,
        help="Number of layers per virtual pipeline stage.",
    )
    group.add_argument(
        "--distributed-backend",
        default="nccl",
        choices=["nccl", "gloo"],
        help="Which backend to use for distributed training.",
    )
    group.add_argument(
        "--DDP-impl",
        default="local",
        choices=["local", "torch"],
        help="Which DistributedDataParallel implementation to use.",
    )
    group.add_argument(
        "--use-contiguous-buffers-in-ddp",
        action="store_true",
        help="If set, use contiguous buffers in DDP. Note that "
        "this option only works with local DDP.",
    )
    group.add_argument(
        "--no-scatter-gather-tensors-in-pipeline",
        action="store_false",
        help="Do not use scatter/gather to optimize communication "
        "of tensors in pipeline.",
        dest="scatter_gather_tensors_in_pipeline",
    )
    group.add_argument(
        "--local_rank",
        type=int,
        default=None,
        help="Local rank passed from distributed launcher.",
    )
    group.add_argument(
        "--lazy-mpu-init",
        type=bool,
        required=False,
        help="If set to True, initialize_megatron() "
        "skips DDP initialization and returns a function to "
        "complete it instead. Also turns on the "
        "--use-cpu-initialization flag. This is for "
        "external DDP managers.",
    )
    group.add_argument(
        "--use-cpu-initialization",
        action="store_true",
        default=None,
        help="If set, affine parallel weights initialization uses CPU.",
    )
    group.add_argument(
        "--force-device",
        type=int,
        default=None,
        help="Force the model to run on a particular GPU.",
    )
    group.add_argument(
        "--force-default",
        action="store_true",
        help="Force setting default arguments for distributed training.",
    )
    return parser


def _add_validation_args(parser):
    group = parser.add_argument_group(title="validation")

    group.add_argument(
        "--eval-iters",
        type=int,
        default=100,
        help="Number of iterations to run for evaluation on the validation/test sets.",
    )
    group.add_argument(
        "--eval-interval",
        type=int,
        default=1000,
        help="Interval between running evaluation on the validation set.",
    )
    group.add_argument(
        "--co-evaluation",
        action="store_true",
        help="If set, run evaluation on each part of the validation set separately.",
    )

    return parser


def _add_data_args(parser):
    group = parser.add_argument_group(title="data and dataloader")

    group.add_argument(
        "--data-path",
        nargs="*",
        default=None,
        help="Path to the training dataset. Accepted formats: "
        "1) a single data path, 2) multiple datasets in the "
        "form: dataset1-weight dataset1-path dataset2-weight "
        "dataset2-path ...",
    )
    group.add_argument(
        "--valid-data-path",
        nargs="*",
        default=None,
        help="Path to the validation dataset. Accepted formats: "
        "1) a single data path, 2) multiple datasets in the "
        "form: dataset1-weight dataset1-path dataset2-weight "
        "dataset2-path ...; "
        "when co-evaluation is enabled, the form is dataset1-tag dataset1-path ...",
    )
    group.add_argument(
        "--index-cache-dir", type=str, default=None, help="Path to the index cache."
    )
    group.add_argument(
        "--test-data-path",
        nargs="*",
        default=None,
        help="Path to the test dataset. Accepted formats: "
        "1) a single data path, 2) multiple datasets in the "
        "form: dataset1-tag dataset1-path dataset2-tag "
        "dataset2-path ...",
    )
    group.add_argument(
        "--split",
        type=str,
        default="969, 30, 1",
        help="Comma-separated list of proportions for training, "
        "validation, and test split. For example the split "
        "`90,5,5` will use 90%% of data for training, 5%% for "
        "validation and 5%% for test.",
    )
    group.add_argument(
        "--vocab-file",
        type=str,
        default=None,
        help="Path to the vocab file.",
    )
    group.add_argument(
        "--merge-file",
        type=str,
        default=None,
        help="Path to the BPE merge file.",
    )
    group.add_argument(
        "--tokenizer-path",
        type=str,
        default=None,
        help="Path to the tokenizer dir.",
    )
    group.add_argument(
        "--vocab-extra-ids",
        type=int,
        default=0,
        help="Number of additional vocabulary tokens. "
        "They are used for span masking in the T5 model.",
    )
    group.add_argument(
        "--seq-length",
        type=int,
        default=None,
        help="Maximum sequence length to process.",
    )
    group.add_argument(
        "--encoder-seq-length",
        type=int,
        default=None,
        help="Maximum encoder sequence length to process. "
        "This should be exclusive of --seq-length.",
    )
    group.add_argument(
        "--decoder-seq-length",
        type=int,
        default=None,
        help="Maximum decoder sequence length to process.",
    )
    group.add_argument(
        "--retriever-seq-length",
        type=int,
        default=256,
        help="Maximum sequence length for the biencoder model for retriever.",
    )
    group.add_argument(
        "--sample-rate",
        type=float,
        default=1.0,
        help="Sample rate for training data. Supposed to be 0 < sample_rate < 1.",
    )
    group.add_argument(
        "--mask-prob",
        type=float,
        default=0.15,
        help="Probability of replacing a token with mask.",
    )
    group.add_argument(
        "--short-seq-prob",
        type=float,
        default=0.1,
        help="Probability of producing a short sequence.",
    )
    group.add_argument("--mmap-warmup", action="store_true", help="Warm up mmap files.")
    group.add_argument(
        "--num-workers", type=int, default=2, help="Dataloader number of workers."
    )
    group.add_argument(
        "--tokenizer-type",
        type=str,
        default=None,
        choices=["BertWordPieceLowerCase", "BertWordPieceCase", "GPT2BPETokenizer"],
        help="What type of tokenizer to use.",
    )
    group.add_argument(
        "--data-impl",
        type=str,
        default="infer",
        choices=["lazy", "cached", "mmap", "infer"],
        help="Implementation of indexed datasets.",
    )
    group.add_argument(
        "--reset-position-ids",
        action="store_true",
        help="Reset position ids after end-of-document token.",
    )
    group.add_argument(
        "--reset-attention-mask",
        action="store_true",
        help="Reset self attention masks after end-of-document token.",
    )
    group.add_argument(
        "--eod-mask-loss",
        action="store_true",
        help="Mask loss for the end of document tokens.",
    )

    return parser

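
# Example (illustrative; the weights and paths below are hypothetical): a
# weighted two-dataset mixture can be passed as
#
#     --data-path 0.7 /data/web_text_document 0.3 /data/books_text_document
#
# where each weight precedes its dataset path prefix.
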
def _add_autoresume_args(parser):
    group = parser.add_argument_group(title="autoresume")

    group.add_argument(
        "--adlr-autoresume",
        action="store_true",
        help="Enable autoresume on adlr cluster.",
    )
    group.add_argument(
        "--adlr-autoresume-interval",
        type=int,
        default=1000,
        help="Interval over which to check for the autoresume termination signal.",
    )

    return parser


def _add_biencoder_args(parser):
    group = parser.add_argument_group(title="biencoder")

    # network size
    group.add_argument(
        "--ict-head-size",
        type=int,
        default=None,
        help="Size of block embeddings to be used in ICT and "
        "REALM (paper default: 128).",
    )
    group.add_argument(
        "--biencoder-projection-dim",
        type=int,
        default=0,
        help="Size of projection head used in biencoder (paper default: 128).",
    )
    group.add_argument(
        "--biencoder-shared-query-context-model",
        action="store_true",
        help="Whether to share the parameters of the query and context models or not.",
    )

    # checkpointing
    group.add_argument(
        "--ict-load",
        type=str,
        default=None,
        help="Directory containing an ICTBertModel checkpoint.",
    )
    group.add_argument(
        "--bert-load",
        type=str,
        default=None,
        help="Directory containing a BertModel checkpoint "
        "(needed to start ICT and REALM).",
    )

    # data
    group.add_argument(
        "--titles-data-path",
        type=str,
        default=None,
        help="Path to titles dataset used for ICT.",
    )
    group.add_argument(
        "--query-in-block-prob",
        type=float,
        default=0.1,
        help="Probability of keeping query in block for ICT dataset.",
    )
    group.add_argument(
        "--use-one-sent-docs",
        action="store_true",
        help="Whether to use one-sentence documents in ICT.",
    )
    group.add_argument(
        "--evidence-data-path",
        type=str,
        default=None,
        help="Path to Wikipedia Evidence from the DPR paper.",
    )

    # training
    group.add_argument(
        "--retriever-report-topk-accuracies",
        nargs="+",
        type=int,
        default=[],
        help="Which top-k accuracies to report (e.g. '1 5 20').",
    )
    group.add_argument(
        "--retriever-score-scaling",
        action="store_true",
        help="Whether to scale retriever scores by the inverse "
        "square root of the hidden size.",
    )

    # faiss index
    group.add_argument(
        "--block-data-path",
        type=str,
        default=None,
        help="Where to save/load BlockData to/from.",
    )
    group.add_argument(
        "--embedding-path",
        type=str,
        default=None,
        help="Where to save/load Open-Retrieval Embedding data to/from.",
    )

    # indexer
    group.add_argument(
        "--indexer-batch-size",
        type=int,
        default=128,
        help="How large of batches to use when doing indexing jobs.",
    )
    group.add_argument(
        "--indexer-log-interval",
        type=int,
        default=1000,
        help="After how many batches the indexer should report progress.",
    )
    return parser


def _add_vit_args(parser):
    group = parser.add_argument_group(title="vit")

    group.add_argument(
        "--num-classes",
        type=int,
        default=1000,
        help="Number of classes in the vision classification task.",
    )
    group.add_argument(
        "--img-dim",
        type=int,
        default=224,
        help="Image size for the vision classification task.",
    )
    group.add_argument(
        "--num-channels",
        type=int,
        default=3,
        help="Number of channels in input image data.",
    )
    group.add_argument(
        "--patch-dim", type=int, default=16, help="Patch dimension used in ViT."
    )

    return parser


def _add_zero_args(parser):
    """ZeRO configuration arguments."""

    group = parser.add_argument_group("ZeRO configurations", "configurations")
    group.add_argument("--zero-stage", type=int, default=1)
    group.add_argument(
        "--zero-reduce-scatter",
        action="store_true",
        help="Use reduce scatter if specified.",
    )
    group.add_argument(
        "--zero-contigious-gradients",
        action="store_true",
        help="Use contiguous memory optimization if specified.",
    )
    group.add_argument("--zero-reduce-bucket-size", type=int, default=0)
    group.add_argument("--zero-allgather-bucket-size", type=int, default=0)
    group.add_argument(
        "--remote-device",
        type=str,
        default="none",
        choices=["none", "cpu", "nvme"],
        help="Remote device for ZeRO-3 initialized parameters.",
    )
    group.add_argument(
        "--use-pin-memory",
        action="store_true",
        help="Use pinned CPU memory for ZeRO-3 initialized model parameters.",
    )
    return parser

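
# Example invocation (an illustrative sketch): ZeRO stage 2 with reduce-scatter
# enabled would be requested via
#
#     --zero-stage 2 --zero-reduce-scatter
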
def _add_memoryopt_args(parser):
    """Memory optimization arguments."""

    group = parser.add_argument_group("Memory optimizations", "configurations")
    group.add_argument(
        "--scattered-embeddings",
        action="store_true",
        help="Save memory by scattering embedding activations. "
        "Introduces dropout differences across MP configurations.",
    )
    group.add_argument(
        "--split-transformers",
        action="store_true",
        help="Save memory by splitting transformer layers into two parts, "
        "allowing for more frequent activation checkpoint savings.",
    )
    group.add_argument(
        "--memory-centric-tiled-linear",
        action="store_true",
        help="Save memory by tiling with deepspeed.zero.TiledLinear.",
    )
    group.add_argument(
        "--tile-factor",
        type=int,
        default=1,
        help="Make all linear layers the same size of [hidden/tile_factor, hidden/tile_factor]. "
        "Must be enabled with --memory-centric-tiled-linear. "
        "Example A: if tile_factor=1, the qkv layer [hidden, 3*hidden] would be converted into [1, 3] tiles of size [hidden, hidden]. "
        "Example B: if tile_factor=2, the intermediate layer [4*hidden, hidden] will be converted into [8, 2] tiles of size [hidden/2, hidden/2]. "
        "Default is 1.",
    )

    return parser


def _add_activation_checkpoint_args(parser):
    group = parser.add_argument_group(
        "Activation Checkpointing", "Checkpointing Configurations"
    )
    group.add_argument(
        "--deepspeed-activation-checkpointing",
        action="store_true",
        help="Use activation checkpointing from DeepSpeed.",
    )
    group.add_argument(
        "--partition-activations",
        action="store_true",
        help="Partition activations across GPUs before checkpointing.",
    )
    group.add_argument(
        "--contigious-checkpointing",
        action="store_true",
        help="Contiguous memory checkpointing for activations.",
    )
    group.add_argument(
        "--checkpoint-in-cpu",
        action="store_true",
        help="Move the activation checkpoints to CPU.",
    )
    group.add_argument(
        "--synchronize-each-layer",
        action="store_true",
        help="Do a synchronize at the beginning and end of each checkpointed layer.",
    )
    group.add_argument(
        "--profile-backward",
        action="store_true",
        help="Enables backward-pass profiling for checkpointed layers.",
    )
    return parser