Combining the Power of a Multimodal LLM (LLaVA) with Whisper AI and Gradio UI

Dwarakanath Rao
Feb 28, 2024


Here is a simple attempt at working with image and audio AI. Multimodal models understand and interpret data from several sources or modes, such as text, images, audio, and video. This ability to process multiple types of data makes multimodal models more versatile and capable than unimodal ones.

Below is a simple application built with Gradio that lets users upload an image and ask questions about it; the app answers with both text and audio.

Now, let’s introduce the main players in our exploration: LLaVA 1.5 (7B), Whisper AI, and Gradio UI.

LLaVA: an open vision-language model that can describe images and answer questions about them. We use the 7B LLaVA 1.5 checkpoint: https://huggingface.co/llava-hf/llava-1.5-7b-hf
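As a minimal sketch of how it is called ("photo.jpg" is a placeholder path), LLaVA can be driven through the transformers image-to-text pipeline with its USER/ASSISTANT prompt format, the same pattern the module below uses:

from transformers import pipeline

pipe = pipeline("image-to-text", model="llava-hf/llava-1.5-7b-hf")
prompt = "USER: <image>\nWhat is shown in this image?\nASSISTANT:"
# "photo.jpg" is a placeholder path to any local image
out = pipe("photo.jpg", prompt=prompt, generate_kwargs={"max_new_tokens": 100})
print(out[0]["generated_text"])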

Whisper AI: Developed by OpenAI, Whisper is a speech recognition model that can transcribe speech in multiple languages and translate it into English. It is robust to accents, background noise, and technical vocabulary, which makes it a good fit for voice-driven applications.
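For a quick taste of the library (a minimal sketch; "audio.mp3" is a placeholder file), transcription is only a few lines:

import whisper

model = whisper.load_model("base")  # smaller than "medium"; fine for a quick test
result = model.transcribe("audio.mp3")  # "audio.mp3" is a placeholder path
print(result["text"])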

Gradio UI: Gradio is an open-source Python library for building and sharing interactive web interfaces for machine learning models. Its simple API lets us quickly wire LLaVA and Whisper into a working application.
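Before the full app, here is the smallest possible Gradio interface (a toy sketch, not part of the project): a Python function wrapped in gr.Interface.

import gradio as gr

def greet(name):
    return f"Hello, {name}!"

# Wrap the function in an Interface and launch a local web UI
gr.Interface(fn=greet, inputs="text", outputs="text").launch()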

This is the main Gradio app:

# app.py

import gradio as gr
from multimodal_rag import img2txt_description, img2txt_query, transcribe, text_to_speech

def process_inputs(audio_path, image_path, user_query):
    # Transcribe the recorded audio with Whisper (skip if nothing was recorded)
    speech_to_text_output = transcribe(audio_path) if audio_path else ""

    # Handle the image input
    image_description = ""
    image_query_response = ""
    if image_path:
        image_description = img2txt_description(image_path)
        if user_query:
            image_query_response = img2txt_query(user_query, image_path)
        else:
            image_query_response = "No query provided."
    else:
        image_query_response = "No image provided."

    # Generate text-to-speech output
    response_text = image_description + " " + image_query_response
    response_audio_path = "response.mp3"
    text_to_speech(response_text, response_audio_path)

    return speech_to_text_output, image_description, image_query_response, response_audio_path

def launch_interface():
    iface = gr.Interface(
        fn=process_inputs,
        inputs=[
            # Gradio 4.x uses sources=[...]; in Gradio 3.x this was source="microphone"
            gr.Audio(sources=["microphone"], type="filepath"),
            gr.Image(type="filepath"),
            gr.Textbox(label="User Query", placeholder="Ask a question about the image..."),
        ],
        outputs=[
            gr.Textbox(label="Speech to Text"),
            gr.Textbox(label="Image Description"),
            gr.Textbox(label="Image Query Response"),
            gr.Audio(label="Response Audio"),
        ],
        title="Multimodal Image Processing with Llava and Whisper",
        description="Upload an image, interact via voice input, and receive audio responses.",
    )
    iface.launch(debug=True)

if __name__ == "__main__":
    launch_interface()
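Once both files are in place, start the app from the project directory; Gradio prints a local URL (by default http://127.0.0.1:7860) to open in your browser:

python app.py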

Now create a second file and name it multimodal_rag.py (this is the module that app.py imports from).

Ensure you have a GPU available and adjust the model sizes if needed: LLaVA 7B and Whisper medium are heavy models and will be very slow on CPU.

Also create a requirements.txt file to pin all Python dependencies.
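Based on the pip installs commented out at the top of the module below, a requirements.txt along these lines should work (only the first three versions are pinned by the article; the rest are assumptions left unpinned):

torch
transformers==4.37.2
bitsandbytes==0.41.3
accelerate==0.25.0
git+https://github.com/openai/whisper.git
gradio
gTTS
Pillow
numpy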

# -*- coding: utf-8 -*-
"""multimodal_rag.py"""

# !pip install -q -U transformers==4.37.2
# !pip install -q bitsandbytes==0.41.3 accelerate==0.25.0
# !pip install -q git+https://github.com/openai/whisper.git
# !pip install -q gradio
# !pip install -q gTTS

import numpy as np
import torch
import whisper
from gtts import gTTS
from transformers import pipeline

# LLaVA 1.5 (7B) served through the transformers image-to-text pipeline
# model_id = "nlpconnect/vit-gpt2-image-captioning"  # lighter alternative for weak hardware
model_id = "llava-hf/llava-1.5-7b-hf"
pipe = pipeline("image-to-text", model=model_id)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using torch {torch.__version__} ({DEVICE})")

# Whisper "medium" is a good accuracy/speed trade-off on a GPU
model = whisper.load_model("medium", device=DEVICE)
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)

def img2txt_description(image):
    """Ask LLaVA for a free-form description of the image."""
    prompt_instructions = """
    Describe the image using as much detail as possible,
    is it a painting, a photograph, what colors are predominant,
    what is the image about?
    """
    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"
    outputs = pipe(image, prompt=prompt)
    # The pipeline returns a list of dicts; keep only the assistant's reply
    extracted_text = outputs[0]["generated_text"].split("ASSISTANT:")[-1].strip()
    return extracted_text

def img2txt_query(input_text, input_image):
    """Answer a user question about the image with LLaVA."""
    prompt_instructions = f"Act as an expert in imagery descriptive analysis, using as much detail as possible from the image, respond to the following prompt: {input_text}"
    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"
    outputs = pipe(input_image, prompt=prompt)
    extracted_text = outputs[0]["generated_text"].split("ASSISTANT:")[-1].strip()
    return extracted_text

def transcribe(audio):
    """Transcribe an audio file to text with Whisper."""
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)  # language detection (not used further here)
    options = whisper.DecodingOptions(fp16=(DEVICE == "cuda"))  # avoid fp16 warning on CPU
    result = whisper.decode(model, mel, options)
    return result.text

def text_to_speech(text, file_path):
    """Convert text to an MP3 file using Google Text-to-Speech."""
    language = "en"
    audioobj = gTTS(text=text, lang=language, slow=False)
    audioobj.save(file_path)
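To sanity-check the module on its own, you can append a small smoke test (a sketch; sample.jpg and sample.mp3 are placeholder files you must supply):

if __name__ == "__main__":
    # Placeholder files; replace with real paths on your machine
    print(img2txt_description("sample.jpg"))
    print(transcribe("sample.mp3"))
    text_to_speech("Hello from the multimodal demo.", "test.mp3")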
