mirror of https://github.com/THUDM/CodeGeeX.git
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
25 lines
578 B
Bash
25 lines
578 B
Bash
2 years ago
|
# Process dataset for CodeGeeX pretraining
#
# Usage: <this script> DATASET_PATH OUTPUT_PATH [LANGUAGE]
#   DATASET_PATH  forwarded to process_pretrain_dataset.py as --dataset_path
#   OUTPUT_PATH   forwarded as --output_prefix
#   LANGUAGE      forwarded as --language; defaults to "python" when omitted
#                 or empty
#
# The assembled command line is printed to stdout and then executed. The
# script's exit status is that of the Python process (or 2 when a required
# argument is missing).

set -eu

DATASET_PATH=${1:-}
OUTPUT_PATH=${2:-}
LANGUAGE=${3:-python}

# Fail early with a usage message instead of handing Python a malformed
# option list such as "--dataset_path --tokenizer_path ...".
if [ -z "$DATASET_PATH" ] || [ -z "$OUTPUT_PATH" ]; then
  printf 'Usage: %s DATASET_PATH OUTPUT_PATH [LANGUAGE]\n' "$0" >&2
  exit 2
fi

# The repository root is taken to be the parent of the directory that
# contains this script.
SCRIPT_PATH=$(realpath "$0")
SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
MAIN_DIR=$(dirname "$SCRIPT_DIR")
TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/"

# Build the command as an argument vector (the positional parameters) rather
# than a string fed to eval: every value stays a single argument even when it
# contains whitespace, and shell metacharacters in the arguments are never
# re-interpreted. The original $1..$3 have already been captured above.
set -- python "$MAIN_DIR/codegeex/data/process_pretrain_dataset.py" \
  --dataset_path "$DATASET_PATH" \
  --tokenizer_path "$TOKENIZER_PATH" \
  --output_prefix "$OUTPUT_PATH" \
  --language "$LANGUAGE" \
  --mode pretrain \
  --seq_len 2048

# Log the command (space-joined, for human reading only), then run it.
printf '%s\n' "$*"
"$@"
|