import requests
import os
# You can use your own uploaded images and captions.
# You are responsible for the legal use of any images you use.
url1='http://farm3.staticflickr.com/2519/4126738647_cc436c111b_z.jpg'
cap1='A motorcycle sits parked across from a herd of livestock'
url2='http://farm3.staticflickr.com/2046/2003879022_1b4b466d1d_z.jpg'
cap2='Motorcycle on platform to be worked on in garage'
url3='http://farm1.staticflickr.com/133/356148800_9bf03b6116_z.jpg'
cap3='a cat laying down stretched out near a laptop'
img1 = {
    'flickr_url': url1,
    'caption': cap1,
    'image_path': './shared_data/motorcycle_1.jpg'
}
img2 = {
    'flickr_url': url2,
    'caption': cap2,
    'image_path': './shared_data/motorcycle_2.jpg'
}
img3 = {
    'flickr_url': url3,
    'caption': cap3,
    'image_path': './shared_data/cat_1.jpg'
}
# download images
imgs = [img1, img2, img3]
# Ensure the directory exists
os.makedirs('./shared_data', exist_ok=True)
for img in imgs:
    data = requests.get(img['flickr_url']).content
    with open(img['image_path'], 'wb') as f:
        f.write(data)
from PIL import Image
from IPython.display import display
img3['image_path'] = './shared_data/cat_1.png'  # the downloaded image is corrupted; use the provided PNG instead
for img in [img1, img2, img3]:
    image = Image.open(img['image_path'])
    caption = img['caption']
    display(image)
    display(caption)
    print()
'A motorcycle sits parked across from a herd of livestock'
'Motorcycle on platform to be worked on in garage'
'a cat laying down stretched out near a laptop'
import json
import os
import numpy as np
from numpy.linalg import norm
import cv2
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from tqdm import tqdm
from utils import encode_image
from utils import bt_embedding_from_prediction_guard as bt_embeddings
Compute Embedding
from utils import download_image
embeddings = []
for img in [img1, img2, img3]:
    img_path = img['image_path']
    caption = img['caption']
    # base64_img = encode_image(img_path)
    # embedding = bt_embeddings(caption, base64_img)
    img = download_image(img_path)
    embedding = bt_embeddings(caption, img)
    # embeddings.append(embedding)
    embeddings += embedding
# Each image-text pair is now converted into a single multimodal
# embedding vector (2048 dimensions here, as printed below).
print(len(embeddings[0]))
2048
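As a quick sanity check (a sketch reusing the embeddings list built above), you can confirm that all three vectors share the same dimensionality before comparing them:
# Sketch: confirm every embedding has the same length
for i, emb in enumerate(embeddings):
    print(f"embedding {i}: {len(emb)} dimensions")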
Cosine Similarity Between Embedding Vectors
def cosine_similarity(vec1, vec2):
    similarity = np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))
    return similarity
ex1_embed = np.array(embeddings[0])
ex2_embed = np.array(embeddings[1])
ex3_embed = np.array(embeddings[2])
sim_ex1_ex2 = cosine_similarity(ex1_embed, ex2_embed)
sim_ex1_ex3 = cosine_similarity(ex1_embed, ex3_embed)
print("Cosine similarity between ex1_embeded and ex2_embeded is:")
display(sim_ex1_ex2)
print("Cosine similarity between ex1_embeded and ex3_embeded is:")
display(sim_ex1_ex3)
Cosine similarity between ex1_embed and ex2_embed is:
np.float64(0.9268679148305662)
Cosine similarity between ex1_embed and ex3_embed is:
np.float64(0.8940822324198224)
Note: As expected, since the 1st and 2nd images both depict motorcycles
while the 3rd image depicts a cat, the cosine similarity between the
1st and 2nd examples' embeddings is greater than that between the
1st and 3rd examples' embeddings.
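As a cross-check, scikit-learn's pairwise cosine_similarity should reproduce the same values; a minimal sketch assuming the ex1_embed, ex2_embed, and ex3_embed arrays defined above:
# Sketch: verify the hand-rolled cosine_similarity against scikit-learn
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine
print(sk_cosine([ex1_embed], [ex2_embed])[0][0])
print(sk_cosine([ex1_embed], [ex3_embed])[0][0])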
Euclidean Distance Between Embedding Vectors
dist_ex1_ex2 = cv2.norm(ex1_embed, ex2_embed, cv2.NORM_L2)
dist_ex1_ex3 = cv2.norm(ex1_embed, ex3_embed, cv2.NORM_L2)
print("Euclidean distance between ex1_embeded and ex2_embeded is:")
display(dist_ex1_ex2)
print("Euclidean distance between ex1_embeded and ex3_embeded is:")
display(dist_ex1_ex3)
Euclidean distance between ex1_embed and ex2_embed is:
10.3079760231243
Euclidean distance between ex1_embed and ex3_embed is:
12.509936672285201
Note: As expected, since the 1st and 2nd images both depict motorcycles
while the 3rd image depicts a cat, the Euclidean distance between the
1st and 2nd examples' embeddings is smaller than that between the
1st and 3rd examples' embeddings.
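The same L2 distances can be computed without OpenCV; a minimal sketch using the numpy norm already imported above:
# Sketch: Euclidean distance via numpy (equivalent to cv2.NORM_L2 above)
print(norm(ex1_embed - ex2_embed))
print(norm(ex1_embed - ex3_embed))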
Visualizing High-dimensional Data with UMAP
from utils import prepare_dataset_for_umap_visualization as data_prep
# prepare image-text pairs
# for the first 50 items of the Hugging Face dataset
# "yashikota/cat-image-dataset"
cat_img_txt_pairs = data_prep("yashikota/cat-image-dataset",
                              "cat", test_size=50)
# for the first 50 items of the Hugging Face dataset
# "tanganke/stanford_cars"
car_img_txt_pairs = data_prep("tanganke/stanford_cars",
                              "car", test_size=50)
Note: your images may differ from those seen in the video.
# display an example of a cat image-text pair
display(cat_img_txt_pairs[0]['caption'])
display(cat_img_txt_pairs[0]['pil_img'])
# display an example of a car image-text pair
display(car_img_txt_pairs[0]['caption'])
display(car_img_txt_pairs[0]['pil_img'])
'an image of cat'
'a picture of car'
# compute BridgeTower embeddings for cat image-text pairs
cat_embeddings = []
for img_txt_pair in tqdm(
        cat_img_txt_pairs,
        total=len(cat_img_txt_pairs)
):
    pil_img = img_txt_pair['pil_img']
    caption = img_txt_pair['caption']
    # base64_img = encode_image(pil_img)
    # embedding = bt_embeddings(caption, base64_img)
    img = download_image(pil_img)
    embedding = bt_embeddings(caption, img)
    # cat_embeddings.append(embedding)
    cat_embeddings += embedding
# compute BridgeTower embeddings for car image-text pairs
car_embeddings = []
for img_txt_pair in tqdm(
        car_img_txt_pairs,
        total=len(car_img_txt_pairs)
):
    pil_img = img_txt_pair['pil_img']
    caption = img_txt_pair['caption']
    # base64_img = encode_image(pil_img)
    # embedding = bt_embeddings(caption, base64_img)
    # car_embeddings.append(embedding)
    img = download_image(pil_img)
    embedding = bt_embeddings(caption, img)
    car_embeddings += embedding
# function that transforms high-dimensional vectors into 2D vectors using UMAP
def dimensionality_reduction(embed_arr, label):
    X_scaled = MinMaxScaler().fit_transform(embed_arr)
    print(X_scaled)
    mapper = UMAP(n_components=2, metric="cosine").fit(X_scaled)
    df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
    df_emb["label"] = label
    print(df_emb)
    return df_emb
# stacking embeddings of cat and car examples into one numpy array
all_embeddings = np.concatenate([cat_embeddings, car_embeddings])
# prepare labels for the cat and car examples
labels = ['cat'] * len(cat_embeddings) + ['car'] * len(car_embeddings)
# reduce the dimensionality of the stacked embeddings to 2D
reduced_dim_emb = dimensionality_reduction(all_embeddings, labels)
[[0.94044932 0.51520534 0.40854868 ... 0.82443252 0.18936151 0.85551098]
 [0.44651759 0.72984073 0.90337535 ... 0.74894671 0.77774062 0.73214139]
 [1.         0.53984785 0.68427674 ... 0.25664673 0.30507779 0.73041823]
 ...
 [0.74945686 0.84984432 0.33774444 ... 0.97078401 0.41347174 0.8945623 ]
 [0.42135375 0.72863285 0.18744774 ... 0.49615729 0.18102552 0.814073  ]
 [0.73606836 0.55605253 0.52697104 ... 0.71685533 0.16279527 0.79900731]]
            X          Y label
0   15.105264   1.334000   cat
1    9.278584   9.754931   cat
2   15.108495   1.325792   cat
3   15.523254   1.790911   cat
4    4.120176  14.938648   cat
..        ...        ...   ...
95  -4.299042  -1.406977   car
96  -3.817038  -1.527148   car
97  -4.179157  -0.701077   car
98  11.569551  -2.112752   car
99  -4.280169  -1.823108   car

[100 rows x 3 columns]
import matplotlib.pyplot as plt
import seaborn as sns
# scatter plot of the 2D UMAP embeddings, colored by label
fig, ax = plt.subplots(figsize=(8, 6))  # set figsize
sns.set_style("whitegrid", {'axes.grid': False})
sns.scatterplot(data=reduced_dim_emb,
                x='X',
                y='Y',
                hue='label',
                palette='bright')
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.title('Scatter plot of images of cats and cars using UMAP')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()
Note:
- The embeddings of image-text pairs of `cats` (i.e., the blue dots) are
  close to each other.
- The embeddings of image-text pairs of `cars` (i.e., the orange dots) are
  close to each other.
- The embeddings of image-text pairs of `cats` (blue dots) are far away
  from the embeddings of image-text pairs of `cars` (orange dots).
Note that UMAP involves some randomness, so your clusters may not look exactly like those in the videos; however, cats and cars should still be clustered separately.
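If you want the same coordinates on every run, umap-learn's UMAP accepts a random_state argument; a minimal sketch reusing all_embeddings and labels from above (the seed value 42 is arbitrary):
# Sketch: fix UMAP's seed for reproducible 2D coordinates
X_scaled = MinMaxScaler().fit_transform(all_embeddings)
mapper = UMAP(n_components=2, metric="cosine", random_state=42).fit(X_scaled)
df_emb_fixed = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
df_emb_fixed["label"] = labels  # can be plotted with the same seaborn call as above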
Take-Home Notes
- Above, we used two metrics (cosine similarity and Euclidean distance) and
  one visualization technique (UMAP) to illustrate what the embeddings capture.
  You can also use other metrics (e.g., cosine distance and Minkowski distance)
  and other visualization techniques (e.g., t-SNE) to verify the embeddings;
  a short sketch follows this list.
- Other multimodal embedding models can compute embeddings for images and
  text in the way BridgeTower does, for example CLIP for image (and text)
  embeddings and Sentence Transformers for text embeddings; a CLIP sketch
  also follows this list.
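A minimal sketch of those alternative metrics and of a t-SNE projection, reusing the embeddings and labels from the UMAP section (scipy's distance functions and scikit-learn's TSNE are used here purely for illustration):
# Sketch: alternative distance metrics and a t-SNE projection
from scipy.spatial.distance import cosine, minkowski
from sklearn.manifold import TSNE

print(cosine(ex1_embed, ex2_embed))          # cosine distance = 1 - cosine similarity
print(minkowski(ex1_embed, ex2_embed, p=3))  # Minkowski distance with p=3

tsne_emb = TSNE(n_components=2, metric="cosine").fit_transform(all_embeddings)
df_tsne = pd.DataFrame(tsne_emb, columns=["X", "Y"])
df_tsne["label"] = labels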
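As one concrete example of an alternative model, the sentence-transformers CLIP checkpoint can embed both images and text into a shared space. This is a sketch under the assumption that the sentence-transformers package is installed and the 'clip-ViT-B-32' checkpoint can be downloaded; it is not a drop-in replacement for the BridgeTower embeddings above, since vectors from different models are not comparable.
# Sketch: CLIP image and text embeddings via sentence-transformers
# (assumes the package is installed and the checkpoint can be downloaded)
from sentence_transformers import SentenceTransformer
from PIL import Image

clip_model = SentenceTransformer('clip-ViT-B-32')
clip_img_emb = clip_model.encode(Image.open('./shared_data/motorcycle_1.jpg'))
clip_txt_emb = clip_model.encode('A motorcycle sits parked across from a herd of livestock')
print(cosine_similarity(clip_img_emb, clip_txt_emb))  # reuse the function defined above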