import requests
import os
# You can use your own uploaded images and captions.
# You are responsible for the legal use of any images you use.
url1='http://farm3.staticflickr.com/2519/4126738647_cc436c111b_z.jpg'
cap1='A motorcycle sits parked across from a herd of livestock'
url2='http://farm3.staticflickr.com/2046/2003879022_1b4b466d1d_z.jpg'
cap2='Motorcycle on platform to be worked on in garage'
url3='http://farm1.staticflickr.com/133/356148800_9bf03b6116_z.jpg'
cap3='a cat laying down stretched out near a laptop'
img1 = {
    'flickr_url': url1,
    'caption': cap1,
    'image_path': './shared_data/motorcycle_1.jpg'
}
img2 = {
    'flickr_url': url2,
    'caption': cap2,
    'image_path': './shared_data/motorcycle_2.jpg'
}
img3 = {
    'flickr_url': url3,
    'caption': cap3,
    'image_path': './shared_data/cat_1.jpg'
}
# download images
imgs = [img1, img2, img3]
# Ensure the directory exists
os.makedirs('./shared_data', exist_ok=True)
for img in imgs:
    data = requests.get(img['flickr_url']).content
    with open(img['image_path'], 'wb') as f:
        f.write(data)
from PIL import Image
from IPython.display import display
img3['image_path'] = './shared_data/cat_1.png'  # the downloaded image is corrupted; use the provided PNG instead
for img in [img1, img2, img3]:
    image = Image.open(img['image_path'])
    caption = img['caption']
    display(image)
    display(caption)
    print()
'A motorcycle sits parked across from a herd of livestock'
'Motorcycle on platform to be worked on in garage'
'a cat laying down stretched out near a laptop'
import json
import os
import numpy as np
from numpy.linalg import norm
import cv2
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from tqdm import tqdm
from utils import encode_image
from utils import bt_embedding_from_prediction_guard as bt_embeddings
Compute Embedding
from utils import download_image
embeddings = []
for img in [img1, img2, img3]:
    img_path = img['image_path']
    caption = img['caption']
    # base64_img = encode_image(img_path)
    # embedding = bt_embeddings(caption, base64_img)
    img = download_image(img_path)
    embedding = bt_embeddings(caption, img)
    # embeddings.append(embedding)
    embeddings += embedding
# Each image-text pair is now converted into a single multimodal
# embedding vector (2048 dimensions here, as printed below).
print(len(embeddings[0]))
2048
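As a quick sanity check (a sketch reusing the embeddings list built above), you can confirm that all three vectors share the same dimensionality before comparing them:
# Sketch: confirm every embedding has the same length
for i, emb in enumerate(embeddings):
    print(f"embedding {i}: {len(emb)} dimensions")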
Cosine Similarity Between Embedding Vectors
def cosine_similarity(vec1, vec2):
    similarity = np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))
    return similarity
ex1_embed = np.array(embeddings[0])
ex2_embed = np.array(embeddings[1])
ex3_embed = np.array(embeddings[2])
sim_ex1_ex2 = cosine_similarity(ex1_embed, ex2_embed)
sim_ex1_ex3 = cosine_similarity(ex1_embed, ex3_embed)
print("Cosine similarity between ex1_embeded and ex2_embeded is:")
display(sim_ex1_ex2)
print("Cosine similarity between ex1_embeded and ex3_embeded is:")
display(sim_ex1_ex3)
Cosine similarity between ex1_embed and ex2_embed is:
np.float64(0.9268679148305662)
Cosine similarity between ex1_embed and ex3_embed is:
np.float64(0.8940822324198224)
Note: As expected, since the 1st and 2nd images both depict motorcycles
while the 3rd image depicts a cat, the cosine similarity between the
1st and 2nd examples' embeddings is greater than that between the
1st and 3rd examples' embeddings.
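As a cross-check, scikit-learn's pairwise cosine_similarity should reproduce the same values; a minimal sketch assuming the ex1_embed, ex2_embed, and ex3_embed arrays defined above:
# Sketch: verify the hand-rolled cosine_similarity against scikit-learn
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine
print(sk_cosine([ex1_embed], [ex2_embed])[0][0])
print(sk_cosine([ex1_embed], [ex3_embed])[0][0])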
Euclidean Distance Between Embedding Vectors
dist_ex1_ex2 = cv2.norm(ex1_embed, ex2_embed, cv2.NORM_L2)
dist_ex1_ex3 = cv2.norm(ex1_embed, ex3_embed, cv2.NORM_L2)
print("Euclidean distance between ex1_embeded and ex2_embeded is:")
display(dist_ex1_ex2)
print("Euclidean distance between ex1_embeded and ex3_embeded is:")
display(dist_ex1_ex3)
Euclidean distance between ex1_embed and ex2_embed is:
10.3079760231243
Euclidean distance between ex1_embed and ex3_embed is:
12.509936672285201
Note: As expected, since the 1st and 2nd images both depict motorcycles
while the 3rd image depicts a cat, the Euclidean distance between the
1st and 2nd examples' embeddings is smaller than that between the
1st and 3rd examples' embeddings.
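The same L2 distances can be computed without OpenCV; a minimal sketch using the numpy norm already imported above:
# Sketch: Euclidean distance via numpy (equivalent to cv2.NORM_L2 above)
print(norm(ex1_embed - ex2_embed))
print(norm(ex1_embed - ex3_embed))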
Visualizing High-dimensional Data with UMAP
from utils import prepare_dataset_for_umap_visualization as data_prep
# prepare image-text pairs
# for the first 50 items of the Hugging Face dataset
# "yashikota/cat-image-dataset"
cat_img_txt_pairs = data_prep("yashikota/cat-image-dataset",
                              "cat", test_size=50)
# for the first 50 items of the Hugging Face dataset
# "tanganke/stanford_cars"
car_img_txt_pairs = data_prep("tanganke/stanford_cars",
                              "car", test_size=50)
Note: your images may differ from those seen in the video.
# display an example of a cat image-text pair
display(cat_img_txt_pairs[0]['caption'])
display(cat_img_txt_pairs[0]['pil_img'])
# display an example of a car image-text pair
display(car_img_txt_pairs[0]['caption'])
display(car_img_txt_pairs[0]['pil_img'])
'an image of cat'
'a picture of car'
# compute BridgeTower embeddings for cat image-text pairs
cat_embeddings = []
for img_txt_pair in tqdm(
        cat_img_txt_pairs,
        total=len(cat_img_txt_pairs)
):
    pil_img = img_txt_pair['pil_img']
    caption = img_txt_pair['caption']
    # base64_img = encode_image(pil_img)
    # embedding = bt_embeddings(caption, base64_img)
    img = download_image(pil_img)
    embedding = bt_embeddings(caption, img)
    # cat_embeddings.append(embedding)
    cat_embeddings += embedding
# compute BridgeTower embeddings for car image-text pairs
car_embeddings = []
for img_txt_pair in tqdm(
        car_img_txt_pairs,
        total=len(car_img_txt_pairs)
):
    pil_img = img_txt_pair['pil_img']
    caption = img_txt_pair['caption']
    # base64_img = encode_image(pil_img)
    # embedding = bt_embeddings(caption, base64_img)
    # car_embeddings.append(embedding)
    img = download_image(pil_img)
    embedding = bt_embeddings(caption, img)
    car_embeddings += embedding
# function that transforms high-dimensional vectors into 2D vectors using UMAP
def dimensionality_reduction(embed_arr, label):
    X_scaled = MinMaxScaler().fit_transform(embed_arr)
    print(X_scaled)
    mapper = UMAP(n_components=2, metric="cosine").fit(X_scaled)
    df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
    df_emb["label"] = label
    print(df_emb)
    return df_emb
# stacking embeddings of cat and car examples into one numpy array
all_embeddings = np.concatenate([cat_embeddings, car_embeddings])
# prepare labels for the cat and car examples
labels = ['cat'] * len(cat_embeddings) + ['car'] * len(car_embeddings)
# reduce the dimensionality of the stacked embeddings to 2D
reduced_dim_emb = dimensionality_reduction(all_embeddings, labels)
[[0.94044932 0.51520534 0.40854868 ... 0.82443252 0.18936151 0.85551098]
 [0.44651759 0.72984073 0.90337535 ... 0.74894671 0.77774062 0.73214139]
 [1.         0.53984785 0.68427674 ... 0.25664673 0.30507779 0.73041823]
 ...
 [0.74945686 0.84984432 0.33774444 ... 0.97078401 0.41347174 0.8945623 ]
 [0.42135375 0.72863285 0.18744774 ... 0.49615729 0.18102552 0.814073  ]
 [0.73606836 0.55605253 0.52697104 ... 0.71685533 0.16279527 0.79900731]]
            X          Y label
0   15.105264   1.334000   cat
1    9.278584   9.754931   cat
2   15.108495   1.325792   cat
3   15.523254   1.790911   cat
4    4.120176  14.938648   cat
..        ...        ...   ...
95  -4.299042  -1.406977   car
96  -3.817038  -1.527148   car
97  -4.179157  -0.701077   car
98  11.569551  -2.112752   car
99  -4.280169  -1.823108   car

[100 rows x 3 columns]
import matplotlib.pyplot as plt
import seaborn as sns
# scatter plot of the 2D UMAP embeddings, colored by label
fig, ax = plt.subplots(figsize=(8, 6))  # set figsize
sns.set_style("whitegrid", {'axes.grid': False})
sns.scatterplot(data=reduced_dim_emb,
                x='X',
                y='Y',
                hue='label',
                palette='bright')
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.title('Scatter plot of images of cats and cars using UMAP')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()
Note:
- The embeddings of image-text pairs of `cats` (i.e., the blue dots) are
  close to each other.
- The embeddings of image-text pairs of `cars` (i.e., the orange dots) are
  close to each other.
- The embeddings of image-text pairs of `cats` (blue dots) are far away
  from the embeddings of image-text pairs of `cars` (orange dots).
Note that UMAP involves some randomness, so your clusters may not look exactly like those in the videos; however, cats and cars should still be clustered separately.
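If you want the same coordinates on every run, umap-learn's UMAP accepts a random_state argument; a minimal sketch reusing all_embeddings and labels from above (the seed value 42 is arbitrary):
# Sketch: fix UMAP's seed for reproducible 2D coordinates
X_scaled = MinMaxScaler().fit_transform(all_embeddings)
mapper = UMAP(n_components=2, metric="cosine", random_state=42).fit(X_scaled)
df_emb_fixed = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
df_emb_fixed["label"] = labels  # can be plotted with the same seaborn call as above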
Take-Home Notes
- Above, we used two metrics (cosine similarity and Euclidean distance) and
  one visualization technique (UMAP) to illustrate what the embeddings capture.
  You can also use other metrics (e.g., cosine distance and Minkowski distance)
  and other visualization techniques (e.g., t-SNE) to verify the embeddings;
  a short sketch follows this list.
- Other multimodal embedding models can compute embeddings for images and
  text in the way BridgeTower does, for example CLIP for image (and text)
  embeddings and Sentence Transformers for text embeddings; a CLIP sketch
  also follows this list.
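A minimal sketch of those alternative metrics and of a t-SNE projection, reusing the embeddings and labels from the UMAP section (scipy's distance functions and scikit-learn's TSNE are used here purely for illustration):
# Sketch: alternative distance metrics and a t-SNE projection
from scipy.spatial.distance import cosine, minkowski
from sklearn.manifold import TSNE

print(cosine(ex1_embed, ex2_embed))          # cosine distance = 1 - cosine similarity
print(minkowski(ex1_embed, ex2_embed, p=3))  # Minkowski distance with p=3

tsne_emb = TSNE(n_components=2, metric="cosine").fit_transform(all_embeddings)
df_tsne = pd.DataFrame(tsne_emb, columns=["X", "Y"])
df_tsne["label"] = labels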
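As one concrete example of an alternative model, the sentence-transformers CLIP checkpoint can embed both images and text into a shared space. This is a sketch under the assumption that the sentence-transformers package is installed and the 'clip-ViT-B-32' checkpoint can be downloaded; it is not a drop-in replacement for the BridgeTower embeddings above, since vectors from different models are not comparable.
# Sketch: CLIP image and text embeddings via sentence-transformers
# (assumes the package is installed and the checkpoint can be downloaded)
from sentence_transformers import SentenceTransformer
from PIL import Image

clip_model = SentenceTransformer('clip-ViT-B-32')
clip_img_emb = clip_model.encode(Image.open('./shared_data/motorcycle_1.jpg'))
clip_txt_emb = clip_model.encode('A motorcycle sits parked across from a herd of livestock')
print(cosine_similarity(clip_img_emb, clip_txt_emb))  # reuse the function defined above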