gpt2-medium-persian / src /run_tokenizer.sh
saied's picture
pushing tokenizer
c36ebf7
raw
history blame
785 Bytes
#!/bin/bash
# Train the BPE tokenizer for gpt2-medium-persian on the Persian slice of
# the OSCAR corpus. Expects src/train_tokenizer.py relative to the CWD.
set -euo pipefail

# Force a UTF-8 locale so Persian (fa) text is decoded correctly.
export LC_ALL=C.UTF-8
export LANG=C.UTF-8

# Destination for the trained tokenizer files.
export OUTPUT_DIR=/home/saied/code/gpt2-medium-persian
# HF datasets identifier + config: deduplicated, unshuffled Persian OSCAR.
export DATASET_NAME=oscar
export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
export VOCAB_SIZE=50000
export MIN_FREQUENCY=2
# Comma-separated list of special tokens passed through to the trainer.
# [U1]..[U20] look like reserved/unused placeholder slots — TODO confirm
# against src/train_tokenizer.py.
export SPECIAL_TOKENS='<s>,<pad>,</s>,<unk>,<mask>,<|endoftext|>,<|startoftext|>,<sep>,<cls>,<nl>,<tab>,<zwnj>,[U1],[U2],[U3],[U4],[U5],[U6],[U7],[U8],[U9],[U10],[U11],[U12],[U13],[U14],[U15],[U16],[U17],[U18],[U19],[U20]'

python src/train_tokenizer.py \
  --output_dir="$OUTPUT_DIR" \
  --dataset_name="$DATASET_NAME" \
  --dataset_config_name="$DATASET_CONFIG_NAME" \
  --vocab_size="$VOCAB_SIZE" \
  --min_frequency="$MIN_FREQUENCY" \
  --special_tokens="$SPECIAL_TOKENS"