'''
File Name: prepare_sft_dataset.py
Author: Nikhil Malhotra
Date: 21/7/2024
Purpose: The purpose of this file is to create a high-quality SFT dataset for Project Indus.
         The dataset source is obtained from Hugging Face, which provides high-quality SFT data.
         The dataset is then translated into the requisite dialects supported by Google Translate.
         The dataset is also split into train and test, and the corresponding files are created.
         The name of each file carries the source, the dialect it was translated into, and the split type.
'''
# Imports
import os
import pandas as pd
import configparser
from googletrans import Translator

# Configuration settings
source_dir_path = "source"
translated_dir_path = "translated"
config = configparser.ConfigParser(default_section="DATASETS")

'''
Dialect codes used for translation
1. English  : en
2. Hindi    : hi
3. Dogri    : doi
4. Bhojpuri : bho
5. Maithili : mai
'''
translated_dialect = "hi"


def store_sft_dataset(name_of_dataset, data_frame, split_type):
    """
    Fetch data from the Hugging Face source and store it in a csv file.
    This method also triggers translation into dialects.

    Input:
        name_of_dataset : name of the dataset fetched from Hugging Face
        data_frame      : data frame containing questions/answers or prompts/messages
        split_type      : "train" or "test", depending on how the data is split
    """
    file_name = os.path.join(source_dir_path, name_of_dataset.replace("/", "_")) + split_type + ".csv"
    if not os.path.isfile(file_name):
        print("Writing file.....", file_name)
        data_frame.to_csv(file_name, encoding="utf-8", index=False, header=True)
        print("Finished writing file....", file_name)
    # Change translated_dialect at the top of the file depending upon the dialect needed
    translate_in_dialects(name_of_dataset, data_frame, split_type, translated_dialect)


def translate_in_dialects(name_of_dataset, data_frame, split_type, dialect_name="hi"):
    """
    Translate a dataset held in a data frame into the given dialect
    and store the result as a csv file.

    Input:
        name_of_dataset : name of the dataset fetched from Hugging Face
        data_frame      : data frame containing questions/answers or prompts/messages
        split_type      : "train" or "test", depending on how the data is split
        dialect_name    : dialect into which the data needs to be translated
    """
    USE_COUNT = False  # Only set this to True if you need to test with a small amount of data
    count = 0
    print("Translating now....")
    translator = Translator()
    # Make a new dataframe for the translation
    translate_df = pd.DataFrame(columns=["question", "answer"])
    translated_append_list = []
    for index, val in data_frame.iterrows():
        translated_ques = translator.translate(val["question"], dest=dialect_name).text
        translated_ans = translator.translate(val["answer"], dest=dialect_name).text
        translated_append_list.append({'question': translated_ques, 'answer': translated_ans})
        count += 1
        # If USE_COUNT is set, the program assumes you are testing
        # and breaks out of the loop after 5 data points.
        if USE_COUNT:
            if count == 5:
                break
    df = pd.concat([translate_df, pd.DataFrame(translated_append_list)])
    translated_file_name = os.path.join(translated_dir_path, name_of_dataset.replace("/", "_")) + split_type + "_" + dialect_name + "_translated.csv"
    if not os.path.isfile(translated_file_name):
        print("Writing file.....", translated_file_name)
        df.to_csv(translated_file_name, encoding="utf-8", index=False, header=True)
        print("Finished writing file....", translated_file_name)
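
# ---------------------------------------------------------------------------
# Illustrative sketch of the dataset.ini file read by the main block below.
# The section name "openai" and the "name" key are the only parts the code
# actually looks up; the dataset path shown here is an assumed example, not a
# confirmed value from the original project. The trailing "/" is needed
# because the split path (e.g. main/train-00000-of-00001.parquet) is
# concatenated directly onto the dataset name.
#
#   [DATASETS]
#
#   [openai]
#   name = "openai/gsm8k/"
# ---------------------------------------------------------------------------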

if __name__ == "__main__":
    """
    Main method: read the config file and use it to store and translate the data,
    producing a high-quality SFT dataset.
    """
    # Read the config file and get the requisite section that names the dataset
    config.read("dataset.ini", encoding="utf-8")
    for key in config['openai']:
        if key.lower().strip() == "name":
            name_of_dataset = config['openai'][key].replace('"', '')
    splits = {
        'train': 'main/train-00000-of-00001.parquet',
        'test': 'main/test-00000-of-00001.parquet'
    }
    # Process both splits so each one is stored and translated under its own split type
    for split_type, split_path in splits.items():
        df_split = pd.read_parquet("hf://datasets/" + name_of_dataset + split_path)
        store_sft_dataset(name_of_dataset, df_split, split_type)
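
# ---------------------------------------------------------------------------
# Expected output layout (illustrative, assuming the example dataset.ini above
# and translated_dialect = "hi"):
#
#   source/openai_gsm8k_train.csv                    <- raw train split from Hugging Face
#   source/openai_gsm8k_test.csv                     <- raw test split from Hugging Face
#   translated/openai_gsm8k_train_hi_translated.csv  <- translated train split
#   translated/openai_gsm8k_test_hi_translated.csv   <- translated test split
#
# The source/ and translated/ directories are assumed to exist before the
# script is run; pandas' to_csv does not create missing directories.
# ---------------------------------------------------------------------------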