--- library_name: transformers tags: [] --- # Multi-images Multi-audio Multi-turn Malaysian 7B Mistral WandB: https://wandb.ai/huseinzol05/multimodal-mistral?workspace=user-huseinzol05 ## how-to ```python from modeling_combine import MM_LLMs, MM_LLMs_Config from transformers import AutoTokenizer, AutoProcessor from PIL import Image import librosa import requests model = MM_LLMs.from_pretrained( 'mesolitica/malaysian-mistral-mmmmodal', flash_attention = True, dtype = torch.bfloat16, torch_dtype = torch.bfloat16 ) _ = model.cuda() image_processor = AutoProcessor.from_pretrained('google/siglip-base-patch16-384') audio_processor = AutoProcessor.from_pretrained('mesolitica/malaysian-whisper-small') tokenizer = AutoTokenizer.from_pretrained('mesolitica/malaysian-mistral-mmmmodal') def prepare_dataset(messages, images: List[str] = None, audio: List[str] = None, sr = 16000): if images is not None: images = [Image.open(f).convert('RGB') for f in images] image_output = image_processor(images=images, return_tensors='pt')['pixel_values'] else: image_output = None if audio is not None: audio = [librosa.load(f, sr=sr)[0] for f in audio] audio_features = audio_processor(audio, sampling_rate=sr, return_tensors='pt',)['input_features'] else: audio_features = None prompt = tokenizer.apply_chat_template(messages, tokenize = False) outputs = tokenizer( prompt, return_tensors='pt', return_overflowing_tokens=False, return_length=False ) outputs['images'] = image_output outputs['audios'] = audio_features image_token = tokenizer.convert_tokens_to_ids('') audio_token = tokenizer.convert_tokens_to_ids('