In [1]:
import lancedb
from utils import load_json_file
from mm_rag.embeddings.bridgetower_embeddings import (
    BridgeTowerEmbeddings
)
from mm_rag.vectorstores.multimodal_lancedb import MultimodalLanceDB
from mm_rag.MLM.client import PredictionGuardClient
from mm_rag.MLM.lvlm import LVLM
from PIL import Image
from langchain_core.runnables import (
    RunnableParallel,
    RunnablePassthrough,
    RunnableLambda
)
Note: We'll use the YouTube video
https://www.youtube.com/watch?v=7Hcg-rLYwdM
preprocessed in Lesson 3 and ingested into LanceDB in Lesson 4.
If you haven't created the vector store `test_tbl` in Lesson 4,
please set TBL_NAME = "demo_tbl"
to use the pre-populated data.
In [2]:
# declare host file
LANCEDB_HOST_FILE = "./shared_data/.lancedb"
# declare table name
TBL_NAME = "test_tbl"
# if you haven't practiced Lessons 3 and 4,
# change 'test_tbl' to 'demo_tbl' to use the pre-populated data
# by uncommenting the following line
#TBL_NAME = "demo_tbl"
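If you're unsure which tables exist in your local LanceDB, you can list them before committing to a table name; this quick check uses the standard lancedb connection API:
In [ ]:
# optional sanity check: list the tables stored at LANCEDB_HOST_FILE
db = lancedb.connect(LANCEDB_HOST_FILE)
print(db.table_names())  # expect 'test_tbl' and/or 'demo_tbl'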
In [3]:
# initialize a BridgeTower embedder
embedder = BridgeTowerEmbeddings()
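As a quick sanity check, and assuming BridgeTowerEmbeddings implements LangChain's standard embed_query method (the vector store below relies on the same embedding interface), you can embed a piece of text directly:
In [ ]:
# hypothetical sanity check; assumes the standard
# LangChain Embeddings interface (embed_query)
vec = embedder.embed_query("an astronaut on a spacewalk")
print(len(vec))  # dimensionality of the BridgeTower embedding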
Create Retrieval Module¶
In [4]:
# create a LanceDB vector store
vectorstore = MultimodalLanceDB(
    uri=LANCEDB_HOST_FILE,
    embedding=embedder,
    table_name=TBL_NAME
)

# create a retriever for the vector store
# with search_type="similarity" and search_kwargs={"k": 1}
retriever_module = vectorstore.as_retriever(
    search_type='similarity',
    search_kwargs={"k": 1}
)
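The retriever above returns only the single best match (k=1). If you want more context for the LVLM, the same API accepts a larger k; a sketch:
In [ ]:
# a retriever over the same vector store that returns
# the top-3 most similar video segments instead of 1
retriever_top3 = vectorstore.as_retriever(
    search_type='similarity',
    search_kwargs={"k": 3}
)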
Invoke Retrieval with User Query¶
In [5]:
# Invoke the retrieval for a query
query = "What do the astronauts feel about their work?"
retrieved_video_segments = retriever_module.invoke(query)
# get the first retrieved video segment
retrieved_video_segment = retrieved_video_segments[0]
In [6]:
# get all metadata of the retrieved video segment
retrieved_metadata = retrieved_video_segment.metadata['metadata']
# get the extracted frame
frame_path = retrieved_metadata['extracted_frame_path']
# get the corresponding transcript
transcript = retrieved_metadata['transcript']
# get the path to the video from which the frame was extracted
video_path = retrieved_metadata['video_path']
# get the timestamp (in ms) when the frame was extracted
timestamp = retrieved_metadata['mid_time_ms']
# display
print(f"Transcript:\n{transcript}\n")
print(f"Path to extracted frame: {frame_path}")
print(f"Path to video: {video_path}")
print(f"Timestamp in ms when the frame was extracted: {timestamp}")
display(Image.open(frame_path))
Transcript:
As I look back on the the mission that we've had here on the International Space Station, I'm proud to have been a part of much of the science activities that happened over the last two months.

Path to extracted frame: ./shared_data/videos/video1/extracted_frame/frame_1.jpg
Path to video: ./shared_data/videos/video1/Welcome back to Planet Earth.mp4
Timestamp in ms when the frame was extracted: 8719.0
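The timestamp is returned in milliseconds. A small helper (plain Python; the name is ours, not part of the course code) makes it easier to read:
In [ ]:
def ms_to_min_sec(ms):
    # convert a millisecond timestamp to an 'MM:SS' string
    total_seconds = int(ms // 1000)
    minutes, seconds = divmod(total_seconds, 60)
    return f"{minutes:02d}:{seconds:02d}"

print(ms_to_min_sec(timestamp))  # 8719.0 ms -> '00:08'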
In [7]:
# initialize a client as PredictionGuardClient
client = PredictionGuardClient()
# initialize LVLM with the given client
lvlm_inference_module = LVLM(client=client)
Invoke LVLM Inference with User Query¶
In [8]:
# This new query is the augmentation of the previous query
# with the transcript retrieved above.
augmented_query_template = (
    "The transcript associated with the image is '{transcript}'. "
    "{previous_query}"
)
augmented_query = augmented_query_template.format(
    transcript=transcript,
    previous_query=query,
)
print(f"Augmented query is:\n{augmented_query}")
Augmented query is:
The transcript associated with the image is 'As I look back on the the mission that we've had here on the International Space Station, I'm proud to have been a part of much of the science activities that happened over the last two months.'. What do the astronauts feel about their work?
In [9]:
# we use the augmented query and the retrieved path-to-image
# as the input to the LVLM inference module
lvlm_input = {'prompt': augmented_query, 'image': frame_path}
response = lvlm_inference_module.invoke(lvlm_input)
# display the response
print('LVLM Response:')
print(response)
LVLM Response:
The astronauts in the image appear to be proud of their work on the International Space Station. They are all wearing shirts and ties, which suggests a professional and formal atmosphere. One of the astronauts is holding a microphone, indicating that they might be discussing their experiences or sharing information with the public. The fact that they are all smiling and posing for a picture together suggests that they are comfortable with each other and are likely enjoying their time on the space station.
Prompt Processing Module¶
In [10]:
def prompt_processing(inputs):
    # get the retrieved results and the user's query
    retrieved_results = inputs['retrieved_results']
    user_query = inputs['user_query']
    # get the first retrieved result by default
    retrieved_result = retrieved_results[0]
    prompt_template = (
        "The transcript associated with the image is '{transcript}'. "
        "{user_query}"
    )
    # get all metadata of the retrieved video segment
    retrieved_metadata = retrieved_result.metadata['metadata']
    # get the corresponding transcript
    transcript = retrieved_metadata['transcript']
    # get the extracted frame
    frame_path = retrieved_metadata['extracted_frame_path']
    return {
        'prompt': prompt_template.format(
            transcript=transcript,
            user_query=user_query
        ),
        'image': frame_path
    }
# initialize the prompt processing module
# as a LangChain RunnableLambda wrapping prompt_processing
prompt_processing_module = RunnableLambda(prompt_processing)
Invoke Prompt Processing Module with Retrieved Results and User Query¶
In [11]:
# We use the user query and the retrieved results above
input_to_lvlm = prompt_processing_module.invoke({
    'retrieved_results': retrieved_video_segments,
    'user_query': query
})
# display output of prompt processing module
# which is the input to LVLM Inference module
print(input_to_lvlm)
{'prompt': "The transcript associated with the image is 'As I look back on the the mission that we've had here on the International Space Station, I'm proud to have been a part of much of the science activities that happened over the last two months.'. What do the astronauts feel about their work?", 'image': './shared_data/videos/video1/extracted_frame/frame_1.jpg'}
We are going to make use of the following from LangChain:

- The RunnableParallel primitive is essentially a dict whose values are runnables (or things that can be coerced to runnables, like functions). It runs all of its values in parallel, and each value is called with the overall input of the RunnableParallel. The final return value is a dict with the results of each value under its appropriate key.
- The RunnablePassthrough on its own allows you to pass inputs unchanged. It is typically used in conjunction with RunnableParallel to pass data through to a new key in the map.
- The RunnableLambda converts a Python function into a Runnable. Wrapping a function in a RunnableLambda makes the function usable within either a sync or async context.
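Here is a minimal, self-contained toy example of the three primitives working together (the functions are invented for illustration only):
In [ ]:
from langchain_core.runnables import (
    RunnableParallel, RunnablePassthrough, RunnableLambda
)

# double the input while keeping the original side by side
double = RunnableLambda(lambda x: x * 2)
toy_chain = RunnableParallel({
    "doubled": double,                  # called with the overall input
    "original": RunnablePassthrough()   # forwards the input unchanged
}) | RunnableLambda(lambda d: f"{d['original']} doubled is {d['doubled']}")

print(toy_chain.invoke(3))  # -> '3 doubled is 6'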
In [12]:
# combine all the modules into a chain
# to create a Multimodal RAG system
mm_rag_chain = (
    RunnableParallel({
        "retrieved_results": retriever_module,
        "user_query": RunnablePassthrough()
    })
    | prompt_processing_module
    | lvlm_inference_module
)
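Because the composed chain is itself a Runnable, it inherits the standard LangChain interface. For example, you could answer several queries in one call with .batch (a sketch; each query still triggers its own retrieval and LVLM inference):
In [ ]:
# answer several queries at once via the standard Runnable interface
responses = mm_rag_chain.batch([
    "What do the astronauts feel about their work?",
    "What is the name of one of the astronauts?",
])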
Invoke the Multimodal RAG System with a Query¶
In [13]:
# invoke the Multimodal RAG system with a query
query1 = "What do the astronauts feel about their work?"
final_text_response1 = mm_rag_chain.invoke(query1)
# display
print(f"USER Query: {query1}")
print(f"MM-RAG Response: {final_text_response1}")
USER Query: What do the astronauts feel about their work?
MM-RAG Response: The astronauts in the image appear to be proud of their work on the International Space Station. They are all wearing shirts and ties, which suggests a professional and formal atmosphere. One of the astronauts is holding a microphone, indicating that they might be discussing their experiences or sharing information with the public. The fact that they are all smiling and posing for a picture together suggests that they are comfortable with each other and are likely enjoying their time on the space station.
In [14]:
# let's try another query
query2 = "What is the name of one of the astronauts?"
final_text_response2 = mm_rag_chain.invoke(query2)
# display
print(f"USER Query: {query2}")
print(f"MM-RAG Response: {final_text_response2}")
USER Query: What is the name of one of the astronauts?
MM-RAG Response: One of the astronauts is named Robert Behnken.
Multimodal RAG System Showing Retrieved Image/Frame¶
In [15]:
# the output of this new chain will be a dictionary
mm_rag_chain_with_retrieved_image = (
    RunnableParallel({
        "retrieved_results": retriever_module,
        "user_query": RunnablePassthrough()
    })
    | prompt_processing_module
    | RunnableParallel({
        'final_text_output': lvlm_inference_module,
        'input_to_lvlm': RunnablePassthrough()
    })
)
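Since this chain returns a dict, a small convenience helper (hypothetical; it just automates the display steps used in the cells below) can print the answer and show the retrieved frame in one call:
In [ ]:
# hypothetical helper: invoke the dict-returning chain and
# display both the text answer and the retrieved frame
def ask_and_show(chain, user_query):
    out = chain.invoke(user_query)
    print(f"USER Query: {user_query}")
    print(f"MM-RAG Response: {out['final_text_output']}")
    display(Image.open(out['input_to_lvlm']['image']))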
In [16]:
# let's try again with query2
response3 = mm_rag_chain_with_retrieved_image.invoke(query2)
# display
print("Type of output of mm_rag_chain_with_retrieved_image is:")
print(type(response3))
print(f"Keys of the dict are {response3.keys()}")
Type of output of mm_rag_chain_with_retrieved_image is:
<class 'dict'>
Keys of the dict are dict_keys(['final_text_output', 'input_to_lvlm'])
In [17]:
# We now extract final text response and path to extracted frame
final_text_response3 = response3['final_text_output']
path_to_extracted_frame = response3['input_to_lvlm']['image']
# display
print(f"USER Query: {query2}")
print(f"MM-RAG Response: {final_text_response3}")
print("Retrieved frame:")
display(Image.open(path_to_extracted_frame))
USER Query: What is the name of one of the astronauts?
MM-RAG Response: One of the astronauts is named Robert Behnken.
Retrieved frame:
In [18]:
# let's try again with another query
query4 = "an astronaut's spacewalk"
response4 = mm_rag_chain_with_retrieved_image.invoke(query4)
# extract results
final_text_response4 = response4['final_text_output']
path_to_extracted_frame4 = response4['input_to_lvlm']['image']
# display
print(f"USER Query: {query4}")
print()
print(f"MM-RAG Response: {final_text_response4}")
print()
print("Retrieved frame:")
display(Image.open(path_to_extracted_frame4))
USER Query: an astronaut's spacewalk

MM-RAG Response: The image shows an astronaut wearing a white space suit, standing on a spacecraft and performing a spacewalk. The astronaut is holding onto a metal bar, which is likely a part of the spacecraft's structure. The scene captures the excitement and challenges of space exploration, as the astronaut carries out a critical task in the mission. The image also highlights the astronaut's skill and expertise in handling the dangers and complexities of working in the harsh environment of space.

Retrieved frame:
In [19]:
# we would like an astronaut's spacewalk with the earth visible behind
query5 = (
    "Describe the image of an astronaut's spacewalk "
    "with an amazing view of the earth from space behind"
)
response5 = mm_rag_chain_with_retrieved_image.invoke(query5)
# extract results
final_text_response5 = response5['final_text_output']
path_to_extracted_frame5 = response5['input_to_lvlm']['image']
# display
print(f"USER Query: {query5}")
print()
print(f"MM-RAG Response: {final_text_response5}")
print()
print("Retrieved Frame:")
display(Image.open(path_to_extracted_frame5))
USER Query: Describe the image of an astronaut's spacewalk with an amazing view of the earth from space behind

MM-RAG Response: The image captures an astronaut in a white space suit, performing a spacewalk outside the International Space Station (ISS). The astronaut is standing on a metal structure, possibly a part of the ISS, and appears to be working on a task or conducting maintenance. The view from space is breathtaking, showcasing the Earth below, with its vibrant colors and intricate patterns. The scene evokes a sense of awe and accomplishment, as the astronaut takes part in this extraordinary mission.

Retrieved Frame:
In [20]:
# slightly change query5
query6 = (
    "An astronaut's spacewalk with "
    "an amazing view of the earth from space behind"
)
response6 = mm_rag_chain_with_retrieved_image.invoke(query6)
# extract results
final_text_response6 = response6['final_text_output']
path_to_extracted_frame6 = response6['input_to_lvlm']['image']
# display
print(f"USER Query: {query6}")
print()
print(f"MM-RAG Response: {final_text_response6}")
print()
print("Retrieved Frame:")
display(Image.open(path_to_extracted_frame6))
USER Query: An astronaut's spacewalk with an amazing view of the earth from space behind

MM-RAG Response: The image captures a breathtaking view of the Earth from space, with an astronaut performing a spacewalk. The spacewalker is standing on a space station, which is equipped with various tools and equipment. There are several people visible in the scene, including the spacewalker and others who might be assisting or observing the activity. The image showcases the incredible achievements of human space exploration and the stunning beauty of our planet from above.

Retrieved Frame:
Note: Slightly changing the query may lead to different retrieved results and thus a different final response.