Combining the Power of a Multimodal LLM (LLaVA) with Whisper and a Gradio UI
This is a simple experiment in working with image and audio AI. Multimodal models understand and interpret data from several sources, or modes, such as text, images, audio, and video; this ability to process multiple types of data makes them more versatile and capable than unimodal models.
The application below, built with Gradio, lets a user upload an image and ask a question about it (typed or spoken), and returns the answer as both text and audio.
Now, let's introduce the main players in our exploration: LLaVA 1.5 7B, Whisper, and Gradio.
LLaVA: LLaVA (Large Language and Vision Assistant) is an open-source multimodal model that connects a vision encoder to a large language model, so it can describe images and answer questions about them. We use the LLaVA 1.5 7B checkpoint: https://huggingface.co/llava-hf/llava-1.5-7b-hf
Whisper: Developed by OpenAI, Whisper is a general-purpose speech recognition model that can transcribe speech in many languages and translate it into English. Because it was trained on a large multilingual, multitask dataset, a single model handles transcription, translation, and language identification; a tiny usage sketch follows.
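Here is a minimal sketch of Whisper's high-level API (the "base" model size and the audio file name are placeholders for illustration; the full app below uses the lower-level load_audio/decode calls instead):

import whisper

model = whisper.load_model("base")          # small model, quick to download
result = model.transcribe("question.mp3")   # returns a dict with the transcript
print(result["text"])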
Gradio: Gradio is an open-source Python library for quickly building and sharing web interfaces around machine learning models. Its simple Interface API lets us wire the LLaVA and Whisper pipelines to audio, image, and text components in a few lines of code; a minimal sketch of the pattern follows, and the full app comes right after.
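The gr.Interface pattern the app relies on is simply a Python function, the input components, and the matching output components. The echo function and labels here are placeholders for illustration:

import gradio as gr

def echo(text):
    # the real app maps audio, image, and text inputs to model outputs
    return text

demo = gr.Interface(fn=echo, inputs=gr.Textbox(label="Input"), outputs=gr.Textbox(label="Output"))
demo.launch()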
This is the main Gradio app, app.py:
# app.py
import gradio as gr
from multimodal_rag import img2txt_description, img2txt_query, transcribe, text_to_speech

def process_inputs(audio_path, image_path, user_query):
    # Transcribe the recorded audio with Whisper
    speech_to_text_output = transcribe(audio_path)

    # Handle the image input
    image_description = ""
    image_query_response = ""
    if image_path:
        image_description = img2txt_description(image_path)
        if user_query:
            image_query_response = img2txt_query(user_query, image_path)
        else:
            image_query_response = "No query provided."
    else:
        image_query_response = "No image provided."

    # Generate the spoken response
    response_text = image_description + " " + image_query_response
    response_audio_path = "response.mp3"
    text_to_speech(response_text, response_audio_path)

    return speech_to_text_output, image_description, image_query_response, response_audio_path

def launch_interface():
    iface = gr.Interface(
        fn=process_inputs,
        inputs=[
            # On Gradio 4.x, use sources=["microphone"] instead of source="microphone"
            gr.Audio(source="microphone", type="filepath"),
            gr.Image(type="filepath"),
            gr.Textbox(label="User Query", placeholder="Ask a question about the image..."),
        ],
        outputs=[
            gr.Textbox(label="Speech to Text"),
            gr.Textbox(label="Image Description"),
            gr.Textbox(label="Image Query Response"),
            gr.Audio(label="Response Audio"),
        ],
        title="Multimodal Image Processing with Llava and Whisper",
        description="Upload an image, interact via voice input, and receive audio responses.",
    )
    iface.launch(debug=True)

if __name__ == "__main__":
    launch_interface()
Next, create a second file named multimodal_rag.py; this is the module that app.py imports. Make sure you have a GPU available, since LLaVA 1.5 7B and the Whisper medium model are both large, and adjust the model sizes or precision if your hardware is limited. Also create a requirements.txt file that lists all the Python dependencies; a sample is shown below.
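A possible requirements.txt, based on the pip installs kept as comments at the top of multimodal_rag.py (the torch, Pillow, and numpy entries are added here as assumptions; pin whatever versions match your environment):

transformers==4.37.2
bitsandbytes==0.41.3
accelerate==0.25.0
git+https://github.com/openai/whisper.git
gradio
gTTS
torch
Pillow
numpy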
# -*- coding: utf-8 -*-
"""multimodal_rag.py"""

# !pip install -q -U transformers==4.37.2
# !pip install -q bitsandbytes==0.41.3 accelerate==0.25.0
# !pip install -q git+https://github.com/openai/whisper.git
# !pip install -q gradio
# !pip install -q gTTS

import numpy as np
import torch
import whisper
from gtts import gTTS
from transformers import pipeline

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using torch {torch.__version__} ({DEVICE})")

# LLaVA 1.5 7B served through the image-to-text pipeline.
# Full precision on CPU is very slow; on a GPU you can pass torch_dtype=torch.float16
# (or a bitsandbytes quantization config via model_kwargs) and device=0.
model_id = "llava-hf/llava-1.5-7b-hf"
pipe = pipeline("image-to-text", model=model_id)

# Whisper "medium" model for speech-to-text
model = whisper.load_model("medium", device=DEVICE)
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)

def img2txt_description(image):
    """Ask LLaVA for a general description of the image."""
    prompt_instructions = """
    Describe the image using as much detail as possible,
    is it a painting, a photograph, what colors are predominant,
    what is the image about?
    """
    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"
    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
    # The pipeline returns a list of dicts; keep only the assistant's answer.
    extracted_text = outputs[0]["generated_text"].split("ASSISTANT:")[-1].strip()
    return extracted_text

def img2txt_query(input_text, input_image):
    """Answer a user question about the image with LLaVA."""
    prompt_instructions = (
        "Act as an expert in imagery descriptive analysis, using as much detail "
        f"as possible from the image, respond to the following prompt: {input_text}"
    )
    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"
    outputs = pipe(input_image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
    extracted_text = outputs[0]["generated_text"].split("ASSISTANT:")[-1].strip()
    return extracted_text

def transcribe(audio):
    """Transcribe a recorded audio file with Whisper."""
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)  # language detection (not used further)
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    return result.text

def text_to_speech(text, file_path):
    """Convert the response text to an MP3 file with gTTS."""
    language = "en"
    audioobj = gTTS(text=text, lang=language, slow=False)
    audioobj.save(file_path)
    return file_path
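To sanity-check multimodal_rag.py without the UI, you can call the helper functions directly; the file names below are hypothetical placeholders:

# quick local test (hypothetical sample files)
if __name__ == "__main__":
    print(transcribe("sample_question.wav"))
    print(img2txt_description("sample_image.jpg"))
    print(img2txt_query("What objects are in the picture?", "sample_image.jpg"))
    text_to_speech("This is a test of the audio response.", "test_response.mp3")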