gpt2-medium-persian / src /run_tokenizer.sh
saied's picture
pushing tokenizer
c36ebf7
raw
history blame
785 Bytes
#!/bin/bash
# Train the BPE tokenizer for gpt2-medium-persian on the Persian slice of
# the OSCAR corpus. Expects src/train_tokenizer.py relative to the CWD.
set -euo pipefail

# Force a UTF-8 locale so Persian (fa) text is decoded correctly.
export LC_ALL=C.UTF-8
export LANG=C.UTF-8

# Destination for the trained tokenizer files.
export OUTPUT_DIR=/home/saied/code/gpt2-medium-persian
# HF datasets identifier + config: deduplicated, unshuffled Persian OSCAR.
export DATASET_NAME=oscar
export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
export VOCAB_SIZE=50000
export MIN_FREQUENCY=2
# Comma-separated list of special tokens passed through to the trainer.
# [U1]..[U20] look like reserved/unused placeholder slots — TODO confirm
# against src/train_tokenizer.py.
export SPECIAL_TOKENS='<s>,<pad>,</s>,<unk>,<mask>,<|endoftext|>,<|startoftext|>,<sep>,<cls>,<nl>,<tab>,<zwnj>,[U1],[U2],[U3],[U4],[U5],[U6],[U7],[U8],[U9],[U10],[U11],[U12],[U13],[U14],[U15],[U16],[U17],[U18],[U19],[U20]'

python src/train_tokenizer.py \
  --output_dir="$OUTPUT_DIR" \
  --dataset_name="$DATASET_NAME" \
  --dataset_config_name="$DATASET_CONFIG_NAME" \
  --vocab_size="$VOCAB_SIZE" \
  --min_frequency="$MIN_FREQUENCY" \
  --special_tokens="$SPECIAL_TOKENS"