'''
File Name: prepare_sft_dataset.py
Author: Nikhil Malhotra
Date: 21/7/2024
Purpose: The purpose of this file is to create a high-quality SFT dataset for Project Indus.
         The dataset source is obtained from Hugging Face, which provides high-quality SFT data.
         The dataset is then translated into the requisite dialects supported by Google Translate.
         The dataset is also split into train and test, and the corresponding files are created.
         The name of each file carries the source, the dialect it was translated into, and the split type.
'''
# Imports
import os
import pandas as pd
import configparser
from googletrans import Translator

# Configuration settings
source_dir_path = "source"
translated_dir_path = "translated"
config = configparser.ConfigParser(default_section="DATASETS")

'''
Dialect codes used for translation
1. English  : en
2. Hindi    : hi
3. Dogri    : doi
4. Bhojpuri : bho
5. Maithili : mai
'''
translated_dialect = "hi"


def store_sft_dataset(name_of_dataset, data_frame, split_type):
    """
    Fetch data from the Hugging Face source and store it in a csv file.
    This method also triggers translation into dialects.

    Input:
        name_of_dataset : name of the dataset fetched from Hugging Face
        data_frame      : data frame containing questions/answers or prompts/messages
        split_type      : "train" or "test", depending on how the data is split
    """
    file_name = os.path.join(source_dir_path, name_of_dataset.replace("/", "_")) + split_type + ".csv"
    if not os.path.isfile(file_name):
        print("Writing file.....", file_name)
        data_frame.to_csv(file_name, encoding="utf-8", index=False, header=True)
        print("Finished writing file....", file_name)
    # Change translated_dialect at the top of the file depending upon the dialect needed
    translate_in_dialects(name_of_dataset, data_frame, split_type, translated_dialect)


def translate_in_dialects(name_of_dataset, data_frame, split_type, dialect_name="hi"):
    """
    Translate a dataset held in a data frame into the given dialect
    and store the result as a csv file.

    Input:
        name_of_dataset : name of the dataset fetched from Hugging Face
        data_frame      : data frame containing questions/answers or prompts/messages
        split_type      : "train" or "test", depending on how the data is split
        dialect_name    : dialect into which the data needs to be translated
    """
    USE_COUNT = False  # Only set this to True if you need to test with a small amount of data
    count = 0
    print("Translating now....")
    translator = Translator()
    # Make a new dataframe for the translation
    translate_df = pd.DataFrame(columns=["question", "answer"])
    translated_append_list = []
    for index, val in data_frame.iterrows():
        translated_ques = translator.translate(val["question"], dest=dialect_name).text
        translated_ans = translator.translate(val["answer"], dest=dialect_name).text
        translated_append_list.append({'question': translated_ques, 'answer': translated_ans})
        count += 1
        # If USE_COUNT is set, the program assumes you are testing
        # and breaks out of the loop after 5 data points.
        if USE_COUNT:
            if count == 5:
                break
    df = pd.concat([translate_df, pd.DataFrame(translated_append_list)])
    translated_file_name = os.path.join(translated_dir_path, name_of_dataset.replace("/", "_")) + split_type + "_" + dialect_name + "_translated.csv"
    if not os.path.isfile(translated_file_name):
        print("Writing file.....", translated_file_name)
        df.to_csv(translated_file_name, encoding="utf-8", index=False, header=True)
        print("Finished writing file....", translated_file_name)
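
# ---------------------------------------------------------------------------
# Illustrative sketch of the dataset.ini file read by the main block below.
# The section name "openai" and the "name" key are the only parts the code
# actually looks up; the dataset path shown here is an assumed example, not a
# confirmed value from the original project. The trailing "/" is needed
# because the split path (e.g. main/train-00000-of-00001.parquet) is
# concatenated directly onto the dataset name.
#
#   [DATASETS]
#
#   [openai]
#   name = "openai/gsm8k/"
# ---------------------------------------------------------------------------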

if __name__ == "__main__":
    """
    Main method: read the config file and use it to store and translate the data,
    producing a high-quality SFT dataset.
    """
    # Read the config file and get the requisite section that names the dataset
    config.read("dataset.ini", encoding="utf-8")
    for key in config['openai']:
        if key.lower().strip() == "name":
            name_of_dataset = config['openai'][key].replace('"', '')
    splits = {
        'train': 'main/train-00000-of-00001.parquet',
        'test': 'main/test-00000-of-00001.parquet'
    }
    # Process both splits so each one is stored and translated under its own split type
    for split_type, split_path in splits.items():
        df_split = pd.read_parquet("hf://datasets/" + name_of_dataset + split_path)
        store_sft_dataset(name_of_dataset, df_split, split_type)
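
# ---------------------------------------------------------------------------
# Expected output layout (illustrative, assuming the example dataset.ini above
# and translated_dialect = "hi"):
#
#   source/openai_gsm8k_train.csv                    <- raw train split from Hugging Face
#   source/openai_gsm8k_test.csv                     <- raw test split from Hugging Face
#   translated/openai_gsm8k_train_hi_translated.csv  <- translated train split
#   translated/openai_gsm8k_test_hi_translated.csv   <- translated test split
#
# The source/ and translated/ directories are assumed to exist before the
# script is run; pandas' to_csv does not create missing directories.
# ---------------------------------------------------------------------------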