from typing import Any, Dict, Iterable, List, Mapping, Tuple, Union

from transformers import BertConfig, BertTokenizerFast

# Checkpoint used by default for both the tokenizer and the model config, so
# the two helpers below cannot drift apart.
DEFAULT_BERT_CHECKPOINT = 'bert-base-uncased'


def num_unique_labels(
    dataset: Dict[str, Union[List[str], List[List[str]]]],
) -> Tuple[int, int]:
    """
    Count the distinct NER tags and the distinct intent labels in a dataset.

    Args:
        dataset (dict): Mapping with an 'entities' key (one sequence of NER
            tags per sample) and an 'intent' key (one intent label per
            sample). Other keys, such as 'text', are not read.

    Returns:
        Tuple[int, int]: Number of unique NER tags, followed by the number of
        unique intent labels.
    """
    # Flatten the per-sample tag sequences straight into a set.
    unique_ner_tags = {
        tag for sample_tags in dataset['entities'] for tag in sample_tags
    }
    return len(unique_ner_tags), len(set(dataset['intent']))


def ner_labels_to_ids() -> Dict[str, int]:
    """
    Map NER labels to corresponding numeric IDs.

    Returns:
        Dict[str, int]: A dictionary where keys are NER labels, and values
        are their corresponding IDs.
    """
    return {
        'O': 0,
        'B-DATE': 1,
        'I-DATE': 2,
        'B-TIME': 3,
        'I-TIME': 4,
        'B-TASK': 5,
        'I-TASK': 6,
        'B-DUR': 7,
        'I-DUR': 8,
    }


def ner_ids_to_labels(ner_labels_to_ids: Dict[str, int]) -> Dict[int, str]:
    """
    Map numeric IDs to corresponding NER labels.

    Args:
        ner_labels_to_ids (Dict[str, int]): Label-to-ID mapping to invert.

    Returns:
        Dict[int, str]: A dictionary where keys are numeric IDs, and values
        are their corresponding NER labels. If several labels share an ID,
        the label that appears last in the input wins.
    """
    return {label_id: label for label, label_id in ner_labels_to_ids.items()}


def intent_labels_to_ids() -> Dict[str, int]:
    """
    Map intent labels to corresponding numeric values.

    Returns:
        Dict[str, int]: A dictionary where keys are intent labels, and values
        are their corresponding numeric IDs.
    """
    # NOTE(review): every key contains literal single quotes as part of the
    # string. Presumably this mirrors how intents are stored in the dataset;
    # confirm before normalising them.
    return {
        "'Schedule Appointment'": 0,
        "'Schedule Meeting'": 1,
        "'Set Alarm'": 2,
        "'Set Reminder'": 3,
        "'Set Timer'": 4,
    }


def intent_ids_to_labels(intent_labels_to_ids: Dict[str, int]) -> Dict[int, str]:
    """
    Map numeric values to corresponding intent labels.

    Args:
        intent_labels_to_ids (Dict[str, int]): Label-to-ID mapping to invert.

    Returns:
        Dict[int, str]: A dictionary where keys are numeric IDs, and values
        are their corresponding intent labels. If several labels share an ID,
        the label that appears last in the input wins.
    """
    return {label_id: label for label, label_id in intent_labels_to_ids.items()}


def tokenizer(model_name: str = DEFAULT_BERT_CHECKPOINT) -> BertTokenizerFast:
    """
    Load a fast BERT tokenizer via ``BertTokenizerFast.from_pretrained``.

    Args:
        model_name (str): Checkpoint name or path handed to
            ``from_pretrained``. Defaults to 'bert-base-uncased'.

    Returns:
        BertTokenizerFast: The loaded tokenizer.
    """
    return BertTokenizerFast.from_pretrained(model_name)


def bert_config(model_name: str = DEFAULT_BERT_CHECKPOINT) -> BertConfig:
    """
    Load a BERT configuration via ``BertConfig.from_pretrained``.

    Args:
        model_name (str): Checkpoint name or path handed to
            ``from_pretrained``. Defaults to 'bert-base-uncased'.

    Returns:
        BertConfig: The loaded configuration.
    """
    return BertConfig.from_pretrained(model_name)


def structure_data(dataset: Iterable[Mapping[str, Any]]) -> Dict[str, List[Any]]:
    """
    Convert per-sample records into a column-oriented dictionary.

    Args:
        dataset: Iterable of samples, each indexable by 'text', 'entities'
            and 'intent'. The 'entities' value must support ``.split()``
            (presumably a whitespace-separated tag string; verify against
            the data source).

    Returns:
        Dict[str, List[Any]]: A dictionary with keys 'text', 'entities' and
        'intent'. Each value is a list with one entry per sample, in input
        order; every 'entities' entry is the list produced by ``.split()``.
    """
    structured_data: Dict[str, List[Any]] = {
        'text': [],
        'entities': [],
        'intent': [],
    }
    # Deliberately a single pass so one-shot iterables (generators, streamed
    # datasets) are consumed only once.
    for sample in dataset:
        structured_data['text'].append(sample['text'])
        structured_data['entities'].append(sample['entities'].split())
        structured_data['intent'].append(sample['intent'])
    return structured_data