# coding=utf-8 # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Megatron arguments.""" import argparse import os import torch import deepspeed def parse_args(extra_args_provider=None, defaults={}, ignore_unknown_args=False): """Parse all arguments.""" parser = argparse.ArgumentParser( description="Megatron-LM Arguments", allow_abbrev=False ) # Standard arguments. parser = _add_network_size_args(parser) parser = _add_regularization_args(parser) parser = _add_training_args(parser) parser = _add_initialization_args(parser) parser = _add_learning_rate_args(parser) parser = _add_checkpointing_args(parser) parser = _add_mixed_precision_args(parser) parser = _add_distributed_args(parser) parser = _add_validation_args(parser) parser = _add_data_args(parser) parser = _add_autoresume_args(parser) parser = _add_biencoder_args(parser) parser = _add_vit_args(parser) parser = _add_logging_args(parser) parser = _add_zero_args(parser) parser = _add_memoryopt_args(parser) parser = _add_activation_checkpoint_args(parser) parser = _add_inference_args(parser) # Custom arguments. if extra_args_provider is not None: parser = extra_args_provider(parser) parser = deepspeed.add_config_arguments(parser) # Parse. if ignore_unknown_args: args, _ = parser.parse_known_args() else: args = parser.parse_args() # helper argument to set deepspeed pipeline parallel or not args.ds_pipeline_enabled = not args.no_pipeline_parallel # Distributed args. args.rank = int(os.getenv("RANK", "0")) args.world_size = int(os.getenv("WORLD_SIZE", "1")) # Tensor model parallel size. args.tensor_model_parallel_size = min( args.tensor_model_parallel_size, args.world_size ) assert ( args.world_size % args.tensor_model_parallel_size == 0 ), "world size" " ({}) is not divisible by tensor model parallel size ({})".format( args.world_size, args.tensor_model_parallel_size ) # Pipeline model parallel size. args.pipeline_model_parallel_size = min( args.pipeline_model_parallel_size, (args.world_size // args.tensor_model_parallel_size), ) # Checks. if args.no_pipeline_parallel: assert ( args.pipeline_model_parallel_size == 1 ), "pipeline_model_parallel_size must be 1 if pipeline parallel is disabled" model_parallel_size = ( args.pipeline_model_parallel_size * args.tensor_model_parallel_size ) assert args.world_size % model_parallel_size == 0, ( "world size is not" " divisible by tensor parallel size ({}) times pipeline parallel " "size ({})".format( args.world_size, args.tensor_model_parallel_size, args.pipeline_model_parallel_size, ) ) args.data_parallel_size = args.world_size // model_parallel_size if args.rank == 0: print( "using world size: {}, data-parallel-size: {}, " "tensor-model-parallel size: {}, " "pipeline-model-parallel size: {} ".format( args.world_size, args.data_parallel_size, args.tensor_model_parallel_size, args.pipeline_model_parallel_size, ), flush=True, ) # Deprecated arguments assert args.batch_size is None, ( "--batch-size argument is no longer " "valid, use --micro-batch-size instead" ) del args.batch_size assert args.warmup is None, ( "--warmup argument is no longer valid, use " "--lr-warmup-fraction instead" ) del args.warmup assert args.model_parallel_size is None, ( "--model-parallel-size is no " "longer valid, use --tensor-model-parallel-size instead" ) del args.model_parallel_size # Set input defaults. for key in defaults: # For default to be valid, it should not be provided in the # arguments that are passed to the program. We check this by # ensuring the arg is set to None. if getattr(args, key) is not None: if args.force_default: print( "WARNING: overriding arguments for {key}:{v2} \ with default {key}:{v}".format( key=key, v=defaults[key], v2=getattr(args, key) ), flush=True, ) setattr(args, key, defaults[key]) else: if args.rank == 0: print( "WARNING: overriding default arguments for {key}:{v} \ with {key}:{v2}".format( key=key, v=defaults[key], v2=getattr(args, key) ), flush=True, ) else: setattr(args, key, defaults[key]) # Batch size. assert args.micro_batch_size is not None assert args.micro_batch_size > 0 if args.global_batch_size is None: args.global_batch_size = args.micro_batch_size * args.data_parallel_size if args.rank == 0: print( "setting global batch size to {}".format(args.global_batch_size), flush=True, ) assert args.global_batch_size > 0 if args.num_layers_per_virtual_pipeline_stage is not None: assert args.pipeline_model_parallel_size > 2, ( "pipeline-model-parallel size should be greater than 2 with " "interleaved schedule" ) assert args.num_layers % args.num_layers_per_virtual_pipeline_stage == 0, ( "number of layers is not divisible by number of layers per virtual " "pipeline stage" ) args.virtual_pipeline_model_parallel_size = ( args.num_layers // args.pipeline_model_parallel_size ) // args.num_layers_per_virtual_pipeline_stage else: args.virtual_pipeline_model_parallel_size = None # Parameters dtype. args.params_dtype = torch.float if args.fp16: assert not args.bf16 args.params_dtype = torch.half if args.bf16: assert not args.fp16 args.params_dtype = torch.bfloat16 # bfloat16 requires gradient accumulation and all-reduce to # be done in fp32. if not args.accumulate_allreduce_grads_in_fp32: args.accumulate_allreduce_grads_in_fp32 = True if args.rank == 0: print( "accumulate and all-reduce gradients in fp32 for " "bfloat16 data type.", flush=True, ) if args.rank == 0: print("using {} for parameters ...".format(args.params_dtype), flush=True) # If we do accumulation and all-reduces in fp32, we need to have # local DDP and we should set the use-contiguous-buffers-in-ddp. if args.accumulate_allreduce_grads_in_fp32: assert args.DDP_impl == "local" args.use_contiguous_buffers_in_ddp = True if args.dataloader_type is None: args.dataloader_type = "single" # Consumed tokens. args.consumed_train_samples = 0 args.consumed_valid_samples = 0 args.consumed_train_tokens = 0 # Iteration-based training. if args.train_iters: # If we use iteration-based training, make sure the # sample-based options are off. assert args.train_samples is None, "expected iteration-based training" assert ( args.lr_decay_samples is None ), "expected iteration-based learning rate decay" assert ( args.lr_warmup_samples == 0 ), "expected iteration-based learning rate warmup" assert ( args.rampup_batch_size is None ), "expected no batch-size rampup for iteration-based training" if args.lr_warmup_fraction is not None: assert ( args.lr_warmup_iters == 0 ), "can only specify one of lr-warmup-fraction and lr-warmup-iters" # Sample-based training. if args.train_samples: # If we use sample-based training, make sure the # iteration-based options are off. assert args.train_iters is None, "expected sample-based training" assert args.lr_decay_iters is None, "expected sample-based learning rate decay" assert args.lr_warmup_iters == 0, "expected sample-based learnig rate warmup" if args.lr_warmup_fraction is not None: assert args.lr_warmup_samples == 0, ( "can only specify one of lr-warmup-fraction " "and lr-warmup-samples" ) # Check required arguments. required_args = [ "num_layers", "hidden_size", "num_attention_heads", "max_position_embeddings", ] for req_arg in required_args: _check_arg_is_not_none(args, req_arg) # args.learned_position_embeddings = args.learned_position_embeddings > 0 # Checks. if args.ffn_hidden_size is None: args.ffn_hidden_size = 4 * args.hidden_size if args.kv_channels is None: assert args.hidden_size % args.num_attention_heads == 0 args.kv_channels = args.hidden_size // args.num_attention_heads if args.seq_length is not None: assert args.encoder_seq_length is None args.encoder_seq_length = args.seq_length else: assert args.encoder_seq_length is not None args.seq_length = args.encoder_seq_length if args.seq_length is not None: assert args.max_position_embeddings >= args.seq_length if args.decoder_seq_length is not None: assert args.max_position_embeddings >= args.decoder_seq_length if args.lr is not None: assert args.min_lr <= args.lr if args.save is not None: assert args.save_interval is not None # Mixed precision checks. if args.fp16_lm_cross_entropy: assert args.fp16, "lm cross entropy in fp16 only support in fp16 mode." if args.fp32_residual_connection: assert ( args.fp16 or args.bf16 ), "residual connection in fp32 only supported when using fp16 or bf16." # Activation checkpointing. if args.distribute_checkpointed_activations: assert args.checkpoint_activations, ( "for distribute-checkpointed-activations to work you " "need to enable checkpoint-activations" ) _print_args(args) return args def _print_args(args): """Print arguments.""" if args.rank == 0: print("------------------------ arguments ------------------------", flush=True) str_list = [] for arg in vars(args): dots = "." * (48 - len(arg)) str_list.append(" {} {} {}".format(arg, dots, getattr(args, arg))) for arg in sorted(str_list, key=lambda x: x.lower()): print(arg, flush=True) print("-------------------- end of arguments ---------------------", flush=True) def _check_arg_is_not_none(args, arg): assert getattr(args, arg) is not None, "{} argument is None".format(arg) def _add_network_size_args(parser): group = parser.add_argument_group(title="network size") group.add_argument( "--num-layers", type=int, default=None, help="Number of transformer layers.", ) group.add_argument( "--hidden-size", type=int, default=None, help="Transformer hidden size.", ) group.add_argument( "--reward-growth", type=str, default="constant", choices=["constant", "linear", "quadratic"], help="Reward growth function.", ) group.add_argument( "--ffn-hidden-size", type=int, default=None, help="Transformer Feed-Forward Network hidden size. " "This is set to 4*hidden-size if not provided", ) group.add_argument( "--num-attention-heads", type=int, default=None, help="Number of transformer attention heads.", ) group.add_argument( "--kv-channels", type=int, default=None, help="Projection weights dimension in multi-head " "attention. This is set to " " args.hidden_size // args.num_attention_heads " "if not provided.", ) group.add_argument( "--scale-embeddings", action="store_true", help="Scale embeddings by sqrt(d_model).", ) group.add_argument( "--max-position-embeddings", type=int, default=None, help="Maximum number of position embeddings to use. " "This is the size of position embedding.", ) group.add_argument( "--no-learned-position-embeddings", action="store_true", help="Do not learn position embeddings. ", ) group.add_argument( "--make-vocab-size-divisible-by", type=int, default=128, help="Pad the vocab size to be divisible by this value." "This is added for computational efficieny reasons.", ) group.add_argument( "--layernorm-epsilon", type=float, default=1e-5, help="Layer norm epsilon." ) group.add_argument( "--apply-residual-connection-post-layernorm", action="store_true", help="If set, use original BERT residula connection " "ordering.", ) group.add_argument('--scaled-upper-triang-masked-softmax-fusion', action='store_true', help='Enable fusion of query_key_value_scaling ' 'time (upper diagonal) masking, softmax.') group.add_argument( "--openai-gelu", action="store_true", help="Use OpenAIs GeLU implementation. This option" "should not be used unless for backward compatibility" "reasons.", ) group.add_argument( "--onnx-safe", type=bool, required=False, help="Use workarounds for known problems with " "Torch ONNX exporter", ) group.add_argument( "--bert-no-binary-head", action="store_false", help="Disable BERT binary head.", dest="bert_binary_head", ) return parser def _add_logging_args(parser): group = parser.add_argument_group(title="logging") group.add_argument( "--log-params-norm", action="store_true", help="If set, calculate and log parameters norm.", ) group.add_argument( "--log-num-zeros-in-grad", action="store_true", help="If set, calculate and log the number of zeros in gradient.", ) group.add_argument( "--tensorboard-log-interval", type=int, default=1, help="Report to tensorboard interval.", ) group.add_argument( "--tensorboard-queue-size", type=int, default=1000, help="Size of the tensorboard queue for pending events " "and summaries before one of the ‘add’ calls forces a " "flush to disk.", ) group.add_argument( "--log-timers-to-tensorboard", action="store_true", help="If set, write timers to tensorboard.", ) group.add_argument( "--log-batch-size-to-tensorboard", action="store_true", help="If set, write batch-size to tensorboard.", ) group.add_argument( "--no-log-learnig-rate-to-tensorboard", action="store_false", help="Disable learning rate logging to tensorboard.", dest="log_learning_rate_to_tensorboard", ) group.add_argument( "--no-log-loss-scale-to-tensorboard", action="store_false", help="Disable loss-scale logging to tensorboard.", dest="log_loss_scale_to_tensorboard", ) group.add_argument( "--log-validation-ppl-to-tensorboard", action="store_true", help="If set, write validation perplexity to " "tensorboard.", ) group.add_argument( "--wandb-logging", action="store_true", help="If set, log training progress to wandb.", ) group.add_argument( "--wandb-log-interval", type=int, default=1, help="Log to wandb every N steps.", ) return parser def _add_regularization_args(parser): group = parser.add_argument_group(title="regularization") group.add_argument( "--attention-dropout", type=float, default=0.1, help="Post attention dropout probability.", ) group.add_argument( "--hidden-dropout", type=float, default=0.1, help="Dropout probability for hidden state transformer.", ) group.add_argument( "--weight-decay", type=float, default=0.01, help="Weight decay coefficient for L2 regularization.", ) group.add_argument( "--tempering", type=float, default=None, help="Tempering coefficient for the model.", ) group.add_argument( "--gold", action="store_true", help="If set, use gold regularization.", ) group.add_argument( "--gold-beta", type=float, default=0.05, help="Beta for GOLD tempering.", ) group.add_argument( "--play-tau", type=float, default=2.0 ) group.add_argument( "--clip-grad", type=float, default=1.0, help="Gradient clipping based on global L2 norm.", ) group.add_argument( "--adam-beta1", type=float, default=0.9, help="First coefficient for computing running averages " "of gradient and its square", ) group.add_argument( "--adam-beta2", type=float, default=0.999, help="Second coefficient for computing running averages " "of gradient and its square", ) group.add_argument( "--adam-eps", type=float, default=1e-08, help="Term added to the denominator to improve" "numerical stability", ) group.add_argument( "--sgd-momentum", type=float, default=0.9, help="Momentum factor for sgd" ) return parser def _add_training_args(parser): group = parser.add_argument_group(title="training") group.add_argument( "--micro-batch-size", type=int, default=None, help="Batch size per model instance (local batch size). " "Global batch size is local batch size times data " "parallel size times number of micro batches.", ) group.add_argument( "--batch-size", type=int, default=None, help="Old batch size parameter, do not use. " "Use --micro-batch-size instead", ) group.add_argument( "--global-batch-size", type=int, default=None, help="Training batch size. If set, it should be a " "multiple of micro-batch-size times data-parallel-size. " "If this value is None, then " "use micro-batch-size * data-parallel-size as the " "global batch size. This choice will result in 1 for " "number of micro-batches.", ) group.add_argument( "--rampup-batch-size", nargs="*", default=None, help="Batch size ramp up with the following values:" " --rampup-batch-size " " " " " "For example:" " --rampup-batch-size 16 8 300000 \ " " --global-batch-size 1024" "will start with global batch size 16 and over " " (1024 - 16) / 8 = 126 intervals will increase" "the batch size linearly to 1024. In each interval" "we will use approximately 300000 / 126 = 2380 samples.", ) group.add_argument( "--checkpoint-activations", action="store_true", help="Checkpoint activation to allow for training " "with larger models, sequences, and batch sizes.", ) group.add_argument( "--distribute-checkpointed-activations", action="store_true", help="If set, distribute checkpointed activations " "across model parallel group.", ) group.add_argument( "--checkpoint-num-layers", type=int, default=1, help="chunk size (number of layers) for checkpointing.", ) group.add_argument( "--train-iters", type=int, default=None, help="Total number of iterations to train over all " "training runs. Note that either train-iters or " "train-samples should be provided.", ) group.add_argument( "--train-samples", type=int, default=None, help="Total number of samples to train over all " "training runs. Note that either train-iters or " "train-samples should be provided.", ) group.add_argument( "--train-tokens", type=int, default=None, help="Total number of tokens to train over all " "training runs.", ) group.add_argument( "--log-interval", type=int, default=100, help="Report loss and timing interval." ) group.add_argument( "--exit-interval", type=int, default=None, help="Exit the program after the iteration is divisible " "by this value.", ) group.add_argument( "--exit-duration-in-mins", type=int, default=None, help="Exit the program after this many minutes.", ) group.add_argument( "--tensorboard-dir", type=str, default=None, help="Write TensorBoard logs to this directory.", ) group.add_argument( "--no-masked-softmax-fusion", action="store_false", help="Disable fusion of query_key_value scaling, " "masking, and softmax.", dest="masked_softmax_fusion", ) group.add_argument( "--no-bias-gelu-fusion", action="store_false", help="Disable bias and gelu fusion.", dest="bias_gelu_fusion", ) group.add_argument( "--no-bias-dropout-fusion", action="store_false", help="Disable bias and dropout fusion.", dest="bias_dropout_fusion", ) group.add_argument( "--optimizer", type=str, default="adam", choices=["adam", "sgd"], help="Optimizer function", ) group.add_argument( "--dataloader-type", type=str, default=None, choices=["single", "cyclic"], help="Single pass vs multiple pass data loader", ) group.add_argument( "--cpu-optimizer", action="store_true", help="Run optimizer on CPU" ) group.add_argument( "--cpu_torch_adam", action="store_true", help="Use Torch Adam as optimizer on CPU.", ) group.add_argument( "--no-pipeline-parallel", action="store_true", help="Disable pipeline parallelism", ) group.add_argument( "--ms-model", action="store_true", help="use model converted from Mindspore", ) return parser def _add_initialization_args(parser): group = parser.add_argument_group(title="initialization") group.add_argument( "--seed", type=int, default=1234, help="Random seed used for python, numpy, " "pytorch, and cuda.", ) group.add_argument( "--init-method-std", type=float, default=0.02, help="Standard deviation of the zero mean normal " "distribution used for weight initialization.", ) group.add_argument( "--init-method-xavier-uniform", action="store_true", help="Enable Xavier uniform parameter initialization", ) return parser def _add_inference_args(parser): group = parser.add_argument_group(title="initialization") group.add_argument( '--beam-warmup', action="store_true", ) group.add_argument( '--beam-warmup-length', type=int, default=0, ) group.add_argument( '--beam-search', action="store_true", ) group.add_argument( '--beam-search-nucleus', action="store_true", ) group.add_argument( '--num-beams', type=int, default=4, ) return parser def _add_learning_rate_args(parser): group = parser.add_argument_group(title="learning rate") group.add_argument( "--lr", type=float, default=None, help="Initial learning rate. Depending on decay style " "and initial warmup, the learing rate at each " "iteration would be different.", ) group.add_argument( "--lr-decay-style", type=str, default="linear", choices=["constant", "linear", "cosine"], help="Learning rate decay function.", ) group.add_argument( "--lr-decay-iters", type=int, default=None, help="number of iterations to decay learning rate over," " If None defaults to `--train-iters`", ) group.add_argument( "--lr-decay-samples", type=int, default=None, help="number of samples to decay learning rate over," " If None defaults to `--train-samples`", ) group.add_argument( "--lr-decay-tokens", type=int, default=None, help="number of tokens to decay learning rate over," " If not None will override iter/sample-based decay", ) group.add_argument( "--lr-warmup-fraction", type=float, default=None, help="fraction of lr-warmup-(iters/samples) to use " "for warmup (as a float)", ) group.add_argument( "--lr-warmup-iters", type=int, default=0, help="number of iterations to linearly warmup " "learning rate over.", ) group.add_argument( "--lr-warmup-samples", type=int, default=0, help="number of samples to linearly warmup " "learning rate over.", ) group.add_argument( "--warmup", type=int, default=None, help="Old lr warmup argument, do not use. Use one of the" "--lr-warmup-* arguments above", ) group.add_argument( "--min-lr", type=float, default=0.0, help="Minumum value for learning rate. The scheduler" "clip values below this threshold.", ) group.add_argument( "--override-lr-scheduler", action="store_true", help="Reset the values of the scheduler (learning rate," "warmup iterations, minimum learning rate, maximum " "number of iterations, and decay style from input " "arguments and ignore values from checkpoints. Note" "that all the above values will be reset.", ) group.add_argument( "--use-checkpoint-lr-scheduler", action="store_true", help="Use checkpoint to set the values of the scheduler " "(learning rate, warmup iterations, minimum learning " "rate, maximum number of iterations, and decay style " "from checkpoint and ignore input arguments.", ) return parser def _add_checkpointing_args(parser): group = parser.add_argument_group(title="checkpointing") group.add_argument( "--save", type=str, default=None, help="Output directory to save checkpoints to.", ) group.add_argument( "--save-interval", type=int, default=None, help="Number of iterations between checkpoint saves.", ) group.add_argument( "--no-save-optim", action="store_true", default=None, help="Do not save current optimizer.", ) group.add_argument( "--no-save-rng", action="store_true", default=None, help="Do not save current rng state.", ) group.add_argument( "--load", type=str, default=None, help="Directory containing a model checkpoint.", ) group.add_argument( "--low-memory-load", action="store_true", default=None, help="Load model checkpoint in low memory mode." "On each machine, workers load the checkpoint one at a time." ) group.add_argument( "--dist-timeout", type=int, default=30, help="Timeout for Pytorch Distributed backend (in minutes).", ) group.add_argument( "--load-state", type=str, default=None, help="Start training from a existing model state.", ) group.add_argument( "--no-load-optim", action="store_true", default=None, help="Do not load optimizer when loading checkpoint.", ) group.add_argument( "--no-load-rng", action="store_true", default=None, help="Do not load rng state when loading checkpoint.", ) group.add_argument( "--finetune", action="store_true", help="Load model for finetuning. Do not load optimizer " "or rng state from checkpoint and set iteration to 0. " "Assumed when loading a release checkpoint.", ) return parser def _add_mixed_precision_args(parser): group = parser.add_argument_group(title="mixed precision") group.add_argument("--fp16", action="store_true", help="Run model in fp16 mode.") group.add_argument("--ln-fp16", action="store_true", help="Run layernorm in fp16 mode.") group.add_argument( "--bf16", action="store_true", help="Run model in bfloat16 mode." ) group.add_argument( "--loss-scale", type=float, default=None, help="Static loss scaling, positive power of 2 " "values can improve fp16 convergence. If None, dynamic" "loss scaling is used.", ) group.add_argument( "--initial-loss-scale", type=float, default=2 ** 32, help="Initial loss-scale for dynamic loss scaling.", ) group.add_argument( "--min-loss-scale", type=float, default=1.0, help="Minimum loss scale for dynamic loss scale.", ) group.add_argument( "--loss-scale-window", type=float, default=1000, help="Window over which to raise/lower dynamic scale.", ) group.add_argument( "--hysteresis", type=int, default=2, help="hysteresis for dynamic loss scaling" ) group.add_argument( "--fp32-residual-connection", action="store_true", help="Move residual connections to fp32.", ) group.add_argument('--apply-query-key-layer-scaling', action='store_true', help='Scale Q * K^T by 1 / layer-number. If this flag ' 'is set, then it will automatically set ' 'attention-softmax-in-fp32 to true') group.add_argument( "--attention-softmax-in-fp32", action="store_true", help="Run attention masking and softmax in fp32. " "This flag is ignored unless " "--no-query-key-layer-scaling is specified.", ) group.add_argument( "--accumulate-allreduce-grads-in-fp32", action="store_true", help="Gradient accumulation and all-reduce in fp32.", ) group.add_argument( "--fp16-lm-cross-entropy", action="store_true", help="Move the cross entropy unreduced loss calculation" "for lm head to fp16.", ) return parser def _add_distributed_args(parser): group = parser.add_argument_group(title="distributed") group.add_argument( "--tensor-model-parallel-size", type=int, default=1, help="Degree of tensor model parallelism.", ) group.add_argument( "--pipeline-model-parallel-size", type=int, default=1, help="Degree of pipeline model parallelism.", ) group.add_argument( "--model-parallel-size", type=int, default=None, help="Old model parallel argument, do not use. Use " "--tensor-model-parallel-size instead.", ) group.add_argument( "--num-layers-per-virtual-pipeline-stage", type=int, default=None, help="Number of layers per virtual pipeline stage", ) group.add_argument( "--distributed-backend", default="nccl", choices=["nccl", "gloo"], help="Which backend to use for distributed training.", ) group.add_argument( "--DDP-impl", default="local", choices=["local", "torch"], help="which DistributedDataParallel implementation " "to use.", ) group.add_argument( "--use-contiguous-buffers-in-ddp", action="store_true", help="If set, use contiguous buffer in DDP. Note that " "this option only works woth local DDP.", ) group.add_argument( "--no-scatter-gather-tensors-in-pipeline", action="store_false", help="Use scatter/gather to optimize communication of tensors in pipeline", dest="scatter_gather_tensors_in_pipeline", ) group.add_argument( "--local_rank", type=int, default=None, help="local rank passed from distributed launcher.", ) group.add_argument( "--lazy-mpu-init", type=bool, required=False, help="If set to True, initialize_megatron() " "skips DDP initialization and returns function to " "complete it instead.Also turns on " "--use-cpu-initialization flag. This is for " "external DDP manager.", ) group.add_argument( "--use-cpu-initialization", action="store_true", default=None, help="If set, affine parallel weights " "initialization uses CPU", ) group.add_argument( "--force-device", type=int, default=None, help="Force the model to run on a particular gpu", ) group.add_argument( "--force-default", action="store_true", help="Force setting default arguments for distributed training", ) return parser def _add_validation_args(parser): group = parser.add_argument_group(title="validation") group.add_argument( "--eval-iters", type=int, default=100, help="Number of iterations to run for evaluation" "validation/test for.", ) group.add_argument( "--eval-interval", type=int, default=1000, help="Interval between running evaluation on " "validation set.", ) group.add_argument( "--co-evaluation", action="store_true", help="If set, run evaluation on each part of the validation set" ) return parser def _add_data_args(parser): group = parser.add_argument_group(title="data and dataloader") group.add_argument( "--data-path", nargs="*", default=None, help="Path to the training dataset. Accepted format:" "1) a single data path, 2) multiple datasets in the" "form: dataset1-weight dataset1-path dataset2-weight " "dataset2-path ...", ) group.add_argument( "--valid-data-path", nargs="*", default=None, help="Path to the validation dataset. Accepted format:" "1) a single data path, 2) multiple datasets in the" "form: dataset1-weight dataset1-path dataset2-weight " "dataset2-path ...;" "when co-evaluation is enabled, the form will be dataset1-tag dataset1-path ...", ) group.add_argument("--index-cache-dir", type=str, default=None, help="Path to the index cache") group.add_argument( "--test-data-path", nargs="*", default=None, help="Path to the test dataset. Accepted format:" "1) a single data path, 2) multiple datasets in the" "form: dataset1-tag dataset1-path dataset2-tag " "dataset2-path ...", ) group.add_argument( "--split", type=str, default="969, 30, 1", help="Comma-separated list of proportions for training," " validation, and test split. For example the split " "`90,5,5` will use 90%% of data for training, 5%% for " "validation and 5%% for test.", ) group.add_argument( "--vocab-file", type=str, default=None, help="Path to the vocab file.", ) group.add_argument( "--merge-file", type=str, default=None, help="Path to the BPE merge file.", ) group.add_argument( "--tokenizer-path", type=str, default=None, help="Path to the tokenizer dir.", ) group.add_argument( "--vocab-extra-ids", type=int, default=0, help="Number of additional vocabulary tokens. " "They are used for span masking in the T5 model", ) group.add_argument( "--seq-length", type=int, default=None, help="Maximum sequence length to process.", ) group.add_argument( "--encoder-seq-length", type=int, default=None, help="Maximum encoder sequence length to process." "This should be exclusive of --seq-length", ) group.add_argument( "--decoder-seq-length", type=int, default=None, help="Maximum decoder sequence length to process.", ) group.add_argument( "--retriever-seq-length", type=int, default=256, help="Maximum sequence length for the biencoder model " " for retriever", ) group.add_argument( "--sample-rate", type=float, default=1.0, help="sample rate for training data. Supposed to be 0 " " < sample_rate < 1", ) group.add_argument( "--mask-prob", type=float, default=0.15, help="Probability of replacing a token with mask.", ) group.add_argument( "--short-seq-prob", type=float, default=0.1, help="Probability of producing a short sequence.", ) group.add_argument("--mmap-warmup", action="store_true", help="Warm up mmap files.") group.add_argument( "--num-workers", type=int, default=2, help="Dataloader number of workers." ) group.add_argument( "--tokenizer-type", type=str, default=None, choices=["BertWordPieceLowerCase", "BertWordPieceCase", "GPT2BPETokenizer"], help="What type of tokenizer to use.", ) group.add_argument( "--data-impl", type=str, default="infer", choices=["lazy", "cached", "mmap", "infer"], help="Implementation of indexed datasets.", ) group.add_argument( "--reset-position-ids", action="store_true", help="Reset posistion ids after end-of-document token.", ) group.add_argument( "--reset-attention-mask", action="store_true", help="Reset self attention masks after " "end-of-document token.", ) group.add_argument( "--eod-mask-loss", action="store_true", help="Mask loss for the end of document tokens.", ) return parser def _add_autoresume_args(parser): group = parser.add_argument_group(title="autoresume") group.add_argument( "--adlr-autoresume", action="store_true", help="Enable autoresume on adlr cluster.", ) group.add_argument( "--adlr-autoresume-interval", type=int, default=1000, help="Intervals over which check for autoresume" "termination signal", ) return parser def _add_biencoder_args(parser): group = parser.add_argument_group(title="biencoder") # network size group.add_argument( "--ict-head-size", type=int, default=None, help="Size of block embeddings to be used in ICT and " "REALM (paper default: 128)", ) group.add_argument( "--biencoder-projection-dim", type=int, default=0, help="Size of projection head used in biencoder (paper" " default: 128)", ) group.add_argument( "--biencoder-shared-query-context-model", action="store_true", help="Whether to share the parameters of the query " "and context models or not", ) # checkpointing group.add_argument( "--ict-load", type=str, default=None, help="Directory containing an ICTBertModel checkpoint", ) group.add_argument( "--bert-load", type=str, default=None, help="Directory containing an BertModel checkpoint " "(needed to start ICT and REALM)", ) # data group.add_argument( "--titles-data-path", type=str, default=None, help="Path to titles dataset used for ICT", ) group.add_argument( "--query-in-block-prob", type=float, default=0.1, help="Probability of keeping query in block for " "ICT dataset", ) group.add_argument( "--use-one-sent-docs", action="store_true", help="Whether to use one sentence documents in ICT", ) group.add_argument( "--evidence-data-path", type=str, default=None, help="Path to Wikipedia Evidence frm DPR paper", ) # training group.add_argument( "--retriever-report-topk-accuracies", nargs="+", type=int, default=[], help="Which top-k accuracies to report " "(e.g. '1 5 20')", ) group.add_argument( "--retriever-score-scaling", action="store_true", help="Whether to scale retriever scores by inverse " "square root of hidden size", ) # faiss index group.add_argument( "--block-data-path", type=str, default=None, help="Where to save/load BlockData to/from", ) group.add_argument( "--embedding-path", type=str, default=None, help="Where to save/load Open-Retrieval Embedding" " data to/from", ) # indexer group.add_argument( "--indexer-batch-size", type=int, default=128, help="How large of batches to use when doing indexing " "jobs", ) group.add_argument( "--indexer-log-interval", type=int, default=1000, help="After how many batches should the indexer " "report progress", ) return parser def _add_vit_args(parser): group = parser.add_argument_group(title="vit") group.add_argument( "--num-classes", type=int, default=1000, help="num of classes in vision classificaiton task", ) group.add_argument( "--img-dim", type=int, default=224, help="Image size for vision classification task", ) group.add_argument( "--num-channels", type=int, default=3, help="Number of channels in input image data", ) group.add_argument( "--patch-dim", type=int, default=16, help="patch dimension used in vit" ) return parser def _add_zero_args(parser): """Text generate arguments.""" group = parser.add_argument_group("ZeRO configurations", "configurations") group.add_argument("--zero-stage", type=int, default=1.0) group.add_argument( "--zero-reduce-scatter", action="store_true", help="Use reduce scatter if specified", ) group.add_argument( "--zero-contigious-gradients", action="store_true", help="Use contigious memory optimizaiton if specified", ) group.add_argument("--zero-reduce-bucket-size", type=int, default=0.0) group.add_argument("--zero-allgather-bucket-size", type=int, default=0.0) group.add_argument( "--remote-device", type=str, default="none", choices=["none", "cpu", "nvme"], help="Remote device for ZeRO-3 initialized parameters.", ) group.add_argument( "--use-pin-memory", action="store_true", help="Use pinned CPU memory for ZeRO-3 initialized model parameters.", ) return parser def _add_memoryopt_args(parser): """Memory optimization arguments.""" group = parser.add_argument_group("Memory optimizations", "configurations") group.add_argument( "--scattered-embeddings", action="store_true", help="Save memory by scattering embedding activations. " "Introduces dropout differences across MP configurations.", ) group.add_argument( "--split-transformers", action="store_true", help="Save memory by splitting transformer layers into two parts, " "allowing for more frequent activation checkpoint savings.", ) group.add_argument( "--memory-centric-tiled-linear", action="store_true", help="Save memory by tiling with deepspeed.zero.TiledLinear.", ) group.add_argument( "--tile-factor", type=int, default=1, help="Make all linear layers the same size of [hidden/tile_factor, hidden/tile_factor]. " "Must be enabled with --memory-centric-tiled-linear. " "Example A: if tile_factor=1, the qkv layer [hidden, 3* hidden] would be converted into [1,3] tiles of size [hidden,hidden]. " "Example B: if tile_factor=2, the intermediate layer [4*hidden, hidden] will be converted into [8, 2] tiles of size [hidden/2, hidden/2]. " "Default is 1.", ) return parser def _add_activation_checkpoint_args(parser): group = parser.add_argument_group( "Activation Checkpointing", "Checkpointing Configurations" ) group.add_argument( "--deepspeed-activation-checkpointing", action="store_true", help="uses activation checkpointing from deepspeed", ) group.add_argument( "--partition-activations", action="store_true", help="partition Activations across GPUs before checkpointing.", ) group.add_argument( "--contigious-checkpointing", action="store_true", help="Contigious memory checkpointing for activatoins.", ) group.add_argument( "--checkpoint-in-cpu", action="store_true", help="Move the activation checkpoints to CPU.", ) group.add_argument( "--synchronize-each-layer", action="store_true", help="does a synchronize at the beginning and end of each checkpointed layer.", ) group.add_argument( "--profile-backward", action="store_true", help="Enables backward pass profiling for checkpointed layers.", ) return parser