import gradio as gr
import cv2
import easyocr
import numpy as np
import requests
import os
import whisper
from transformers import pipeline

# Hosted Inference API endpoint for facial emotion detection
API_KEY = os.getenv("API_KEY")
API_URL = "https://api-inference.huggingface.co/models/dima806/facial_emotions_image_detection"
headers = {"Authorization": f"Bearer {API_KEY}"}

# OCR reader, speech-recognition model, and text-sentiment pipeline
reader = easyocr.Reader(['en'], gpu=False)
model = whisper.load_model("base")
sentiment_analysis = pipeline(
    "sentiment-analysis",
    framework="pt",
    model="SamLowe/roberta-base-go_emotions",
)


def query(image):
    """Send the image to the Inference API and return the facial-emotion JSON."""
    image_data = np.array(image, dtype=np.uint8)
    _, buffer = cv2.imencode('.jpg', image_data)
    binary_data = buffer.tobytes()
    response = requests.post(API_URL, headers=headers, data=binary_data)
    return response.json()


def text_extraction(image):
    """Run OCR on the image, box confident detections, and fetch facial data."""
    text_content = ''
    facial_data = query(image)
    detections = reader.readtext(image)
    threshold = 0.25
    for bbox, text, score in detections:
        text_content += ' ' + text
        if score > threshold:
            cv2.rectangle(image, tuple(map(int, bbox[0])),
                          tuple(map(int, bbox[2])), (0, 255, 0), 5)
    return image, text_content, facial_data


def analyze_sentiment(text):
    results = sentiment_analysis(text)
    return {result['label']: result['score'] for result in results}


def get_sentiment_emoji(sentiment):
    # Emojis corresponding to each go_emotions label
    emoji_mapping = {
        "disappointment": "😞",
        "sadness": "😢",
        "annoyance": "😠",
        "neutral": "😐",
        "disapproval": "👎",
        "realization": "😮",
        "nervousness": "😬",
        "approval": "👍",
        "joy": "😄",
        "anger": "😡",
        "embarrassment": "😳",
        "caring": "🤗",
        "remorse": "😔",
        "disgust": "🤢",
        "grief": "😥",
        "confusion": "😕",
        "relief": "😌",
        "desire": "😍",
        "admiration": "😌",
        "optimism": "😊",
        "fear": "😨",
        "love": "❤️",
        "excitement": "🎉",
        "curiosity": "🤔",
        "amusement": "😄",
        "surprise": "😲",
        "gratitude": "🙏",
        "pride": "🦁",
    }
    return emoji_mapping.get(sentiment, "")


def display_sentiment_results(sentiment_results, option):
    sentiment_text = ""
    for sentiment, score in sentiment_results.items():
        emoji = get_sentiment_emoji(sentiment)
        if option == "Sentiment Only":
            sentiment_text += f"{sentiment} {emoji}\n"
        elif option == "Sentiment + Score":
            sentiment_text += f"{sentiment} {emoji}: {score}\n"
    return sentiment_text


def inference(image, text, audio, sentiment_option):
    extracted_image, extracted_text, extracted_facial_data = text_extraction(image)

    # Transcribe the audio and detect its language with Whisper
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)
    lang = max(probs, key=probs.get)
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    audio_sentiment_results = analyze_sentiment(result.text)     # Ta - text from audio
    image_sentiment_results = analyze_sentiment(extracted_text)  # Ti - text from image
    text_sentiment_results = analyze_sentiment(text)             # T  - user-defined text

    audio_sentiment_output = display_sentiment_results(audio_sentiment_results, sentiment_option)
    image_sentiment_output = display_sentiment_results(image_sentiment_results, sentiment_option)
    text_sentiment_output = display_sentiment_results(text_sentiment_results, sentiment_option)

    return (extracted_image, extracted_facial_data, extracted_text,
            image_sentiment_output, text_sentiment_output,
            lang.upper(), result.text, audio_sentiment_output)
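
# A minimal standalone check of the text-sentiment path (a sketch; not wired
# into the Gradio UI). It calls analyze_sentiment() and get_sentiment_emoji()
# directly; the sample sentence is arbitrary.
def sentiment_demo(sample_text="I am thrilled with these results!"):
    for label, score in analyze_sentiment(sample_text).items():
        print(f"{label} {get_sentiment_emoji(label)}: {score:.3f}")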

title = """
<h1 align="center">Cross-Modal Machine Learning (Sentiment Analysis)</h1>
""" image_path = "thmbnail.jpg" description = """ 💻 This demo showcases a Cross Model Machine Learning for Sentiment Analysis.


⚙️ Components of the tool:

     - Sentiment analysis of images
     - Text extraction from images
     - Sentiment analysis of user-provided text
     - Real-time multilingual speech recognition
     - Language identification
     - Sentiment analysis of the transcriptions

🎯 The sentiment analysis results are provided as a dictionary with different emotions and their corresponding scores.

😃 The sentiment analysis results are displayed with emojis representing the corresponding sentiment.

✅ The higher the score for a specific emotion, the stronger the presence of that emotion in the transcribed text.

❓ Use the microphone for real-time speech recognition.

⚡️ The model will transcribe the audio and perform sentiment analysis on the transcribed text.
""" custom_css = """ #banner-image { display: block; margin-left: auto; margin-right: auto; } #chat-message { font-size: 14px; min-height: 300px; } """ block = gr.Blocks(css=custom_css) with block: gr.HTML(title) with gr.Row(): with gr.Column(): gr.Image(image_path, elem_id="banner-image", show_label=False) with gr.Column(): gr.HTML(description) with gr.Blocks(): with gr.Row(): with gr.Column(): image = gr.Image() image_output = gr.Image() text_output = gr.Textbox(label="Text Content") text_sentiment = gr.Textbox(label="Text Sentiment") facial_output = gr.JSON(label="Facial Data") with gr.Text(): gr.Textbox(label="Text Content") output_text_sentiment = gr.TextBox("Text Sentiment") with gr.Column(): audio = gr.Audio(label="Input Audio", show_label=False, type="filepath") sentiment_option = gr.Radio(choices=["Sentiment Only", "Sentiment + Score"], label="Select an option") lang_str = gr.Textbox(label="Language") text = gr.Textbox(label="Transcription") sentiment_output = gr.Textbox(label="Sentiment Analysis Results") btn = gr.Button("Transcribe") btn.click(inference, inputs=[image, text, audio, sentiment_option], outputs=[image_output, facial_output, text_output, text_sentiment, output_text_sentiment, lang_str, text, sentiment_output]) block.launch()