You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
CodeGeeX/scripts/process_pretrain_dataset.sh

25 lines
578 B
Bash

# Process dataset for CodeGeeX pretraining
DATASET_PATH=$1
OUTPUT_PATH=$2
LANGUAGE=$3
SCRIPT_PATH=$(realpath "$0")
SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
MAIN_DIR=$(dirname "$SCRIPT_DIR")
TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/"
if [ -z "$LANGUAGE" ]; then
LANGUAGE=python
fi
CMD="python $MAIN_DIR/codegeex/data/process_pretrain_dataset.py \
--dataset_path $DATASET_PATH \
--tokenizer_path $TOKENIZER_PATH \
--output_prefix $OUTPUT_PATH \
--language $LANGUAGE \
--mode pretrain \
--seq_len 2048"
echo "$CMD"
eval "$CMD"