# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SETUP¶
This is a follow-up to the How to Set Up your Google Cloud Account instructions from the course, Large Multimodal Model Prompting with Gemini, on the Learning Platform of DeepLearning.AI.
Install Vertex AI SDK and other Required Packages¶
%pip install --upgrade --user --quiet google-cloud-aiplatform
Note: you may need to restart the kernel to use updated packages.
Restart Runtime¶
To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.
The restart might take a minute or longer. After it's restarted, continue to the next step.
import IPython
# Restart the current kernel so the newly installed packages become usable.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)
{'status': 'ok', 'restart': True}
The Kernel crashed while executing code in the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details.
Authenticate your Notebook Environment (Colab Only)¶
If you're running this notebook on Google Colab, run the cell below to authenticate your environment.
NOTE: The Gmail email address you use to authenticate this lesson colab must be the same as the one you used to set up your Google Cloud account and your Project.
import sys

# Authenticate only when the google.colab module is loaded, i.e. when the
# notebook is running on Google Colab; in any other environment this cell
# does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()
Set Google Cloud Project Information and Initialize Vertex AI SDK¶
Add your Project ID below, which you created while following the How to Set Up your Google Cloud Account instructions. If your Project ID was `dlai-shortcourse-on-gemini`, then you can run the cell below as it is. Otherwise, be sure to change it.
You can also look up your Project ID in your Project Dashboard.
# Replace the placeholder with the Project ID of your own Google Cloud project.
PROJECT_ID = "[PROJECT_ID]" # @param {type:"string"}
LOCATION = "us-central1" # @param {type:"string"}
import vertexai
# Initialize the Vertex AI SDK with the project and location chosen above.
vertexai.init(project=PROJECT_ID, location=LOCATION)
Utils Functions and Images¶
Important Note:
If using Google Colab, all the `from utils import ...` functions and all required images are readily accessible in the notebook. Execute the following cells to proceed.
import typing
import IPython.display
from PIL import Image as PIL_Image
from PIL import ImageOps as PIL_ImageOps
from vertexai.generative_models import (
GenerationConfig,
Image,
)
def gemini(prompt, model):
    """Send a prompt to a Gemini model and return the complete reply text.

    Args:
        prompt: Content passed as the first argument to
            ``model.generate_content``.
        model: Object exposing ``generate_content(prompt, stream=True)``
            whose result is an iterable of chunks with a ``text`` attribute.

    Returns:
        The ``text`` of every streamed chunk concatenated in arrival order;
        an empty string when the stream yields no chunks.
    """
    responses = model.generate_content(prompt, stream=True)
    # The streamed reply arrives in pieces; stitch them into one string.
    return "".join(response.text for response in responses)
def display_images(
    images: typing.Iterable[Image],
    max_width: int = 600,
    max_height: int = 350,
) -> None:
    """Show each image inline in the notebook, shrunk to a bounded size.

    Args:
        images: Images to display; each one must expose its underlying PIL
            image through the ``_pil_image`` attribute.
        max_width: Largest width, in pixels, an image is displayed at.
        max_height: Largest height, in pixels, an image is displayed at.

    Returns:
        None. Each image is rendered with ``IPython.display.display``.
    """
    for image in images:
        pil_image = typing.cast(PIL_Image.Image, image._pil_image)
        if pil_image.mode != "RGB":
            # RGB is supported by all Jupyter environments (e.g. RGBA is not yet)
            pil_image = pil_image.convert("RGB")
        image_width, image_height = pil_image.size
        if max_width < image_width or max_height < image_height:
            # Resize to display a smaller notebook image
            pil_image = PIL_ImageOps.contain(pil_image, (max_width, max_height))
        IPython.display.display(pil_image)
def print_multimodal_prompt(contents: list):
    """
    Given contents that would be sent to Gemini,
    output the full multimodal prompt for ease of readability.

    Args:
        contents: Items of the prompt. ``Image`` items are rendered with
            ``display_images``, ``Part`` items are resolved from their file
            URI and displayed, and anything else is printed.
    """
    # Imported locally: the import cell preceding this definition brings in
    # only GenerationConfig and Image, so Part would otherwise be undefined
    # unless a later cell had already imported it.
    from vertexai.generative_models import Part

    for content in contents:
        if isinstance(content, Image):
            display_images([content])
        elif isinstance(content, Part):
            # NOTE(review): get_url_from_gcs and load_image_from_url are not
            # defined in the visible part of this notebook; presumably they
            # come from the course's utils module. Confirm they are in scope
            # before passing a Part here.
            url = get_url_from_gcs(content.file_data.file_uri)
            IPython.display.display(load_image_from_url(url))
        else:
            print(content)
def gemini_vision(contents_image, model):
    """Send multimodal contents to a Gemini model and return the reply text.

    Args:
        contents_image: Prompt contents passed as the first argument to
            ``model.generate_content`` (the notebook passes a list holding
            an image and a text prompt).
        model: Object exposing ``generate_content(contents, stream=True)``
            whose result is an iterable of chunks with a ``text`` attribute.

    Returns:
        The ``text`` of every streamed chunk concatenated in arrival order;
        an empty string when the stream yields no chunks.
    """
    responses = model.generate_content(contents_image, stream=True)
    # The streamed reply arrives in pieces; stitch them into one string.
    return "".join(response.text for response in responses)
def gemini_vision_parameters(contents_image, model, config):
    """Query a Gemini model with an explicit generation configuration.

    Args:
        contents_image: Prompt contents, forwarded as the ``contents``
            keyword of ``model.generate_content``.
        model: Object exposing ``generate_content(contents=...,
            generation_config=..., stream=True)`` whose result is an
            iterable of chunks with a ``text`` attribute.
        config: Generation settings, forwarded as ``generation_config``.

    Returns:
        The ``text`` of every streamed chunk concatenated in arrival order;
        an empty string when the stream yields no chunks.
    """
    responses = model.generate_content(
        contents=contents_image,
        generation_config=config,
        stream=True,
    )
    # The streamed reply arrives in pieces; stitch them into one string.
    return "".join(response.text for response in responses)
# download images from bucket
! gsutil cp "gs://dlai-sc-gemini-bucket/Andrew_power_tools.png" ./andrew_power_tools.png
! gsutil cp "gs://dlai-sc-gemini-bucket/panda.png" ./panda.png
/bin/bash: gsutil: command not found /bin/bash: gsutil: command not found
IN COURSE VIDEO¶
Lesson video starts from below.
Lesson 2: Multimodal Prompting and Parameter Control¶
In this notebook, you'll have a quick overview of simple text, image and video examples with Gemini, and learn about different Parameters which can be set up.
Text Examples¶
- Load the gemini-1.0-pro-002 model.
from vertexai.generative_models import GenerativeModel
# Model used for the text examples in this section.
model = GenerativeModel("gemini-1.0-pro-002")
# Use the course's utils helper when that module is available. Otherwise
# fall back to the gemini() function defined earlier in this notebook, so the
# cell does not fail where utils is absent.
try:
    from utils import gemini
except ImportError:
    pass
Note: LLMs do not always produce the same results, especially because they are frequently updated. So the output you see in the video might be different from what you get.
If you run the next cell and receive a 403 error:
- Recheck your project name, and be sure it matches the name assigned when setting up the Google Cloud Project. Note that project names are lowercase.
- Ensure you completed Step 5 of the instructions on setting up Google Cloud and Vertex AI.
gemini("What is a multimodal model?", model=model)
'A multimodal model is a type of artificial intelligence (AI) model that can process and understand multiple types of data, such as text, images, and videos. This makes them more versatile and powerful than traditional AI models that can only process one type of data. Multimodal models are used for a variety of tasks, including machine translation, image captioning, and video summarization.\n\nHere are some of the key benefits of using multimodal models:\n\n* They can capture more information from the data they are processing. This is because they can take into account the relationships between different types of data, such as the words in a sentence and the images that are associated with it.\n* They can be more accurate than traditional AI models. This is because they can use multiple sources of information to make predictions.\n* They can be more creative than traditional AI models. This is because they can use their understanding of multiple types of data to generate new ideas.\n\nMultimodal models are still a relatively new technology, but they have the potential to revolutionize the way we interact with AI. As they continue to develop, we can expect to see them used in a wider range of applications.\n\nHere are some specific examples of multimodal models:\n\n* **Machine translation models that use text and images to translate languages more accurately.**\n* **Image captioning models that use images and text to describe images in detail.**\n* **Video summarization models that use video and text to summarize videos and generate transcripts.**\n* **Dialogue systems that use text and speech to have conversations with humans.**\n\nThese are just a few examples of the many different ways that multimodal models can be used. As the technology continues to develop, we can expect to see even more innovative and creative applications for these models.'
Under the hood of the helper functions¶
# Text prompt used to demonstrate streamed responses. The stray closing
# double quote that used to follow the question mark has been removed so it
# is no longer sent to the model as part of the prompt.
prompt_1 = """
In short, what is deeplearning.ai,
and what can it offer me as a Machine Learning Engineer?
"""
- `stream=True` processes the response as it is being generated.
- With `stream=False`, you have to wait until the entire response has been generated before it can be processed and printed.
response_1 = model.generate_content(prompt_1, stream=True)
response_1
<generator object _GenerativeModel._generate_content_streaming at 0x7f355ce99f20>
Note: LLMs do not always produce the same results, especially because they are frequently updated. So the output you see in the video might be different from what you get.
# Print every streamed chunk in full, not just its text.
for response in response_1:
    print(response)
candidates { content { role: "model" parts { text: "##" } } } model_version: "gemini-1.0-pro-002" candidates { content { role: "model" parts { text: " Deeplearning.ai in a Nutshell\n\nDeeplearning.ai is an online platform that" } } safety_ratings { category: HARM_CATEGORY_HATE_SPEECH probability: NEGLIGIBLE probability_score: 0.0942382812 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0756835938 } safety_ratings { category: HARM_CATEGORY_DANGEROUS_CONTENT probability: NEGLIGIBLE probability_score: 0.123535156 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.138671875 } safety_ratings { category: HARM_CATEGORY_HARASSMENT probability: NEGLIGIBLE probability_score: 0.188476562 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.123535156 } safety_ratings { category: HARM_CATEGORY_SEXUALLY_EXPLICIT probability: NEGLIGIBLE probability_score: 0.151367188 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0610351562 } } model_version: "gemini-1.0-pro-002" candidates { content { role: "model" parts { text: " offers courses, specializations, and professional certificate programs in Artificial Intelligence (AI)," } } safety_ratings { category: HARM_CATEGORY_HATE_SPEECH probability: NEGLIGIBLE probability_score: 0.0289306641 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0395507812 } safety_ratings { category: HARM_CATEGORY_DANGEROUS_CONTENT probability: NEGLIGIBLE probability_score: 0.0629882812 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0864257812 } safety_ratings { category: HARM_CATEGORY_HARASSMENT probability: NEGLIGIBLE probability_score: 0.0942382812 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0502929688 } safety_ratings { category: HARM_CATEGORY_SEXUALLY_EXPLICIT probability: NEGLIGIBLE probability_score: 0.0559082031 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0311279297 } } model_version: "gemini-1.0-pro-002" candidates { content { role: "model" parts { text: " Deep Learning (DL), and Machine Learning (ML). 
\n\nIt was co-founded by Andrew Ng, a prominent figure in the AI and ML field. The platform provides" } } safety_ratings { category: HARM_CATEGORY_HATE_SPEECH probability: NEGLIGIBLE probability_score: 0.0284423828 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0317382812 } safety_ratings { category: HARM_CATEGORY_DANGEROUS_CONTENT probability: NEGLIGIBLE probability_score: 0.0454101562 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0583496094 } safety_ratings { category: HARM_CATEGORY_HARASSMENT probability: NEGLIGIBLE probability_score: 0.0849609375 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0461425781 } safety_ratings { category: HARM_CATEGORY_SEXUALLY_EXPLICIT probability: NEGLIGIBLE probability_score: 0.048828125 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0311279297 } } model_version: "gemini-1.0-pro-002" candidates { content { role: "model" parts { text: " learners with the opportunity to:\n\n* **Develop skills in AI and ML:** Learn about various deep learning techniques, algorithms and applications through interactive exercises, projects," } } safety_ratings { category: HARM_CATEGORY_HATE_SPEECH probability: NEGLIGIBLE probability_score: 0.0289306641 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0390625 } safety_ratings { category: HARM_CATEGORY_DANGEROUS_CONTENT probability: NEGLIGIBLE probability_score: 0.0356445312 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0583496094 } safety_ratings { category: HARM_CATEGORY_HARASSMENT probability: NEGLIGIBLE probability_score: 0.0654296875 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.03515625 } safety_ratings { category: HARM_CATEGORY_SEXUALLY_EXPLICIT probability: NEGLIGIBLE probability_score: 0.0385742188 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0289306641 } } model_version: "gemini-1.0-pro-002" candidates { content { role: "model" parts { text: " and video lectures.\n* **Earn recognized credentials:** Gain valuable certificates upon completion of 
specializations and programs, demonstrating your expertise to potential employers.\n* **Connect with the AI community:** Join a global network of learners and experts, participate in discussions," } } safety_ratings { category: HARM_CATEGORY_HATE_SPEECH probability: NEGLIGIBLE probability_score: 0.0272216797 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0480957031 } safety_ratings { category: HARM_CATEGORY_DANGEROUS_CONTENT probability: NEGLIGIBLE probability_score: 0.0302734375 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.068359375 } safety_ratings { category: HARM_CATEGORY_HARASSMENT probability: NEGLIGIBLE probability_score: 0.0559082031 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0336914062 } safety_ratings { category: HARM_CATEGORY_SEXUALLY_EXPLICIT probability: NEGLIGIBLE probability_score: 0.0361328125 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0233154297 } } model_version: "gemini-1.0-pro-002" candidates { content { role: "model" parts { text: " and collaborate on projects.\n\n## Deeplearning.ai for Machine Learning Engineers\n\nDeeplearning.ai can be particularly beneficial for Machine Learning Engineers by:\n\n* **Upskilling and reskilling:** Stay current with the latest advancements in the field" } } safety_ratings { category: HARM_CATEGORY_HATE_SPEECH probability: NEGLIGIBLE probability_score: 0.0373535156 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0600585938 } safety_ratings { category: HARM_CATEGORY_DANGEROUS_CONTENT probability: NEGLIGIBLE probability_score: 0.0302734375 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.099609375 } safety_ratings { category: HARM_CATEGORY_HARASSMENT probability: NEGLIGIBLE probability_score: 0.0593261719 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0317382812 } safety_ratings { category: HARM_CATEGORY_SEXUALLY_EXPLICIT probability: NEGLIGIBLE probability_score: 0.0297851562 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0169677734 } } model_version: 
"gemini-1.0-pro-002" candidates { content { role: "model" parts { text: ", including cutting-edge deep learning techniques and applications.\n* **Expanding knowledge:** Gain a deeper understanding of Deep Learning and its role within Machine Learning, enhancing your ability to build and deploy complex AI models.\n* **Career advancement:** Differentiate yourself from other ML Engineers by earning recognized credentials and demonstrating specialized expertise in" } } safety_ratings { category: HARM_CATEGORY_HATE_SPEECH probability: NEGLIGIBLE probability_score: 0.0390625 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0600585938 } safety_ratings { category: HARM_CATEGORY_DANGEROUS_CONTENT probability: NEGLIGIBLE probability_score: 0.0267333984 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0864257812 } safety_ratings { category: HARM_CATEGORY_HARASSMENT probability: NEGLIGIBLE probability_score: 0.0600585938 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0373535156 } safety_ratings { category: HARM_CATEGORY_SEXUALLY_EXPLICIT probability: NEGLIGIBLE probability_score: 0.0317382812 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0191650391 } } model_version: "gemini-1.0-pro-002" candidates { content { role: "model" parts { text: " Deep Learning.\n* **Networking opportunities:** Connect with other professionals in the field, building valuable connections and expanding your professional network.\n\n## Is Deeplearning.ai right for you?\n\nWhether or not Deeplearning.ai is the right platform for you depends on your individual needs and goals. 
If you are a Machine Learning Engineer looking" } } safety_ratings { category: HARM_CATEGORY_HATE_SPEECH probability: NEGLIGIBLE probability_score: 0.0534667969 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0756835938 } safety_ratings { category: HARM_CATEGORY_DANGEROUS_CONTENT probability: NEGLIGIBLE probability_score: 0.029296875 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0927734375 } safety_ratings { category: HARM_CATEGORY_HARASSMENT probability: NEGLIGIBLE probability_score: 0.078125 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0517578125 } safety_ratings { category: HARM_CATEGORY_SEXUALLY_EXPLICIT probability: NEGLIGIBLE probability_score: 0.0434570312 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0263671875 } } model_version: "gemini-1.0-pro-002" candidates { content { role: "model" parts { text: " to expand your knowledge in AI and Deep Learning, gain recognized credentials, and connect with the AI community, then Deeplearning.ai could be a valuable resource for you.\n\n**Here are some additional factors to consider:**\n\n* **Your existing knowledge:** If you already have a strong foundation in Machine Learning, you might" } } safety_ratings { category: HARM_CATEGORY_HATE_SPEECH probability: NEGLIGIBLE probability_score: 0.0373535156 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0617675781 } safety_ratings { category: HARM_CATEGORY_DANGEROUS_CONTENT probability: NEGLIGIBLE probability_score: 0.0267333984 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0864257812 } safety_ratings { category: HARM_CATEGORY_HARASSMENT probability: NEGLIGIBLE probability_score: 0.0583496094 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0385742188 } safety_ratings { category: HARM_CATEGORY_SEXUALLY_EXPLICIT probability: NEGLIGIBLE probability_score: 0.0336914062 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0279541016 } } model_version: "gemini-1.0-pro-002" candidates { content { role: "model" parts { text: " benefit more 
from advanced Deep Learning courses rather than introductory ones. \n* **Your learning style:** Do you prefer learning through video lectures, interactive exercises, or a combination of both? \n* **Your budget:** Deeplearning.ai offers various pricing options, including free courses, paid specializations and professional certificate programs." } } safety_ratings { category: HARM_CATEGORY_HATE_SPEECH probability: NEGLIGIBLE probability_score: 0.0306396484 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.068359375 } safety_ratings { category: HARM_CATEGORY_DANGEROUS_CONTENT probability: NEGLIGIBLE probability_score: 0.0275878906 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0903320312 } safety_ratings { category: HARM_CATEGORY_HARASSMENT probability: NEGLIGIBLE probability_score: 0.0559082031 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0415039062 } safety_ratings { category: HARM_CATEGORY_SEXUALLY_EXPLICIT probability: NEGLIGIBLE probability_score: 0.0378417969 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0319824219 } } model_version: "gemini-1.0-pro-002" candidates { content { role: "model" parts { text: " \n* **Your time commitment:** Consider the amount of time you can dedicate to learning new skills.\n\nI recommend exploring the Deeplearning.ai website to learn more about the specific courses, specializations, and programs they offer to see if they align with your individual goals and learning style. 
You can find reviews from" } } safety_ratings { category: HARM_CATEGORY_HATE_SPEECH probability: NEGLIGIBLE probability_score: 0.029296875 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.068359375 } safety_ratings { category: HARM_CATEGORY_DANGEROUS_CONTENT probability: NEGLIGIBLE probability_score: 0.0279541016 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0864257812 } safety_ratings { category: HARM_CATEGORY_HARASSMENT probability: NEGLIGIBLE probability_score: 0.048828125 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0390625 } safety_ratings { category: HARM_CATEGORY_SEXUALLY_EXPLICIT probability: NEGLIGIBLE probability_score: 0.0341796875 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0302734375 } } model_version: "gemini-1.0-pro-002" candidates { content { role: "model" parts { text: " other learners online to gain further insights about their experiences with Deeplearning.ai. \n" } } safety_ratings { category: HARM_CATEGORY_HATE_SPEECH probability: NEGLIGIBLE probability_score: 0.0284423828 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.06640625 } safety_ratings { category: HARM_CATEGORY_DANGEROUS_CONTENT probability: NEGLIGIBLE probability_score: 0.0272216797 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0849609375 } safety_ratings { category: HARM_CATEGORY_HARASSMENT probability: NEGLIGIBLE probability_score: 0.0466308594 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0385742188 } safety_ratings { category: HARM_CATEGORY_SEXUALLY_EXPLICIT probability: NEGLIGIBLE probability_score: 0.0317382812 severity: HARM_SEVERITY_NEGLIGIBLE severity_score: 0.0289306641 } } model_version: "gemini-1.0-pro-002" candidates { content { role: "model" parts { text: "" } } finish_reason: STOP } usage_metadata { prompt_token_count: 25 candidates_token_count: 543 total_token_count: 568 } model_version: "gemini-1.0-pro-002"
# Request a new stream for the same prompt, then print only the text of
# each chunk as it arrives.
response_1 = model.generate_content(prompt_1, stream=True)
for response in response_1:
    print(response.text)
## Deeplearning.ai in a Nutshell Deeplearning.ai, founded by Andrew Ng, focuses on online education about deep learning and artificial intelligence. They provide various resources, including: * **Courses:** Deeplearning.ai offers a range of courses, from introductory to advanced, covering topics like deep learning fundamentals, computer vision, and natural language processing. These courses are taught by industry experts and offer hands-on learning through coding assignments and projects. * **Specializations:** Deeplearning .ai also provides specializations, which are groups of related courses designed to help learners gain mastery in a specific area. For example, they offer a deep learning specialization and an AI for business specialization. * **Content:** Beyond courses, deeplearning.ai offers a wealth of free content, including blog posts, articles, and videos on various deep learning and AI topics. ## What Deeplearning.ai Offers Machine Learning Engineers: * **Upskilling:** For Machine Learning Engineers who want to stay current with the latest advancements in deep learning, deeplearning.ai offers a variety of resources to help them learn new skills and techniques. * **Specialization:** The specializations offered by deeplearning.ai can help Machine Learning Engineers gain deeper expertise in specific areas of deep learning, making them more valuable to potential employers. * **Career Advancement:** By completing courses and specializations offered by deeplearning.ai, Machine Learning Engineers can demonstrate their commitment to professional development and increase their chances of landing better jobs or promotions. However, it's important to note that deeplearning.ai is just one resource among many available for Machine Learning Engineers. Other platforms like Coursera, Udacity, and fast.ai also have a wide variety of courses and programs on deep learning and AI. 
Here are some additional points to consider: * **Cost:** While some of the content on deeplearning.ai is free, most of the courses and specializations require a paid subscription. * **Time commitment:** Completing a course or specialization on deeplearning.ai requires a significant time investment. * **Prerequisites:** Some of the more advanced courses on deeplearning.ai require prior knowledge of machine learning and Python programming. Ultimately, whether or not deeplearning.ai is right for you as a Machine Learning Engineer depends on your individual needs and learning goals. I hope this helps! Let me know if you have any other questions.
Multimodality: Image + Text¶
from vertexai.generative_models import (
GenerativeModel,
Image,
Part,
)
- Load the gemini-1.0-pro-vision-001 model.
multimodal_model = GenerativeModel("gemini-1.0-pro-vision-001")
- Load a prompt and an image
# Load the image from a local file.
image = Image.load_from_file("wuxia.jpg")
prompt_3 = "Please describe what is in this image?"
# prompt_3 = "What are likely professions of this person?"
# The image and the text prompt are combined into one list of contents.
contents_image = [image, prompt_3]
# from utils import print_multimodal_prompt
print("-------Prompt--------")
print_multimodal_prompt(contents_image)
-------Prompt--------
Please describe what is in this image?
Note: LLMs do not always produce the same results, especially because they are frequently updated. So the output you see in the video might be different from what you get.
# from utils import gemini_vision
gemini_vision(contents_image, model=multimodal_model)
' The image contains a fight scene between a group of people. They are all wearing traditional Japanese clothing and are armed with swords. The background is a blur of black and white, which suggests that the fight is taking place in a dark and shadowy environment.'
Multimodality: Video + Text¶
- Load a video and a prompt.
- The video loads from a GCP bucket.
# Two references to the video: the gs:// URI is what gets wrapped in a Part
# for the model, while the https URL is used for the inline preview below.
# NOTE(review): both paths end in cicisnow/gesture.mp4, so presumably they
# point at the same object; confirm.
video_uri = "gs://cicisnow/gesture.mp4"
video_url = "https://storage.googleapis.com/cicisnow/gesture.mp4"
from IPython.display import Video
Video(video_url, width=450)
# Questions the model is asked to answer from the video alone.
prompt = """
Answer the following questions using the video only:
- What is the main character in this video?
- What is the main character doing?
- Where is the main character located?
"""
# Reference the video by its bucket URI and declare its MIME type.
video = Part.from_uri(video_uri, mime_type="video/mp4")
# The contents list holds the text prompt first, then the video.
contents_video = [prompt, video]
responses_4 = multimodal_model.generate_content(contents_video, stream=True)
Note: This cell is converted to markdown to prevent accidentally executing it. The cost to run this cell is approximately 0.12 USD (as of August 2024).
responses_4 = multimodal_model.generate_content(contents_video, stream=True)
Note: LLMs do not always produce the same results, especially because they are frequently updated. So the output you see in the video might be different from what you get.
# Print the streamed chunks back to back (no newline between them) so the
# answer reads as continuous text.
for response in responses_4:
    print(response.text, end="")
- The main character is a young woman named Jamie. - She is sitting in an office, working on her computer. - She is wearing a casual outfit, and she has her hair in a ponytail.
Gemini Model Parameters¶
- Load an image.
- You'll be using the gemini-1.0-pro-vision-001 model.
- First run the model using its default parameters.
# Load the image from a local file.
image_1 = Image.load_from_file("soup_noodle.jpg")
# NOTE: this rebinds prompt_1, which held the text-only prompt earlier in the
# notebook.
prompt_1 = """Write what is happening in the following image
from a unique perspective and do not mention names"""
# The image and the text prompt are combined into one list of contents.
contents = [image_1, prompt_1]
# from utils import print_multimodal_prompt
print("-------Prompt--------")
print_multimodal_prompt(contents)
-------Prompt--------
Write what is happening in the following image from a unique perspective and do not mention names
response_1 = multimodal_model.generate_content(
contents,
stream=True
)
Note: LLMs do not always produce the same results, especially because they are frequently updated. So the output you see in the video might be different from what you get.
# Print the streamed chunks back to back (no newline between them) so the
# answer reads as continuous text.
for response in response_1:
    print(response.text, end="")
A young woman is eating a bowl of spicy noodles. She is slurping the noodles and has a spoonful of them in her mouth. She is wearing a straw hat and a white shirt. She is sitting at a table in a restaurant.
Temperature, Top k¶
- GenerationConfig lets you set up the parameters for the model.
Note: In the latest version, `from vertexai.preview.generative_models` has been changed to `from vertexai.generative_models`. `from vertexai.preview.generative_models` can still be used.
from vertexai.generative_models import GenerationConfig
# from utils import gemini_vision_parameters
- Setting `temperature=0.0` and `top_k=1`
# Configuration with temperature 0.0 and top_k 1.
generation_config_1 = GenerationConfig(
temperature=0.0,
top_k=1,
)
# Run the image prompt with this configuration and keep the full reply text.
response_zero_temp = gemini_vision_parameters(
contents,
multimodal_model,
generation_config_1)
Note: LLMs do not always produce the same results, especially because they are frequently updated. So the output you see in the video might be different from what you get.
print(response_zero_temp)
A young woman is eating a bowl of soup. She is using a spoon to scoop the soup from the bowl and bring it to her mouth. She is wearing a straw hat and a white shirt. She is sitting at a table in a restaurant.
# Repeat the request with the same temperature=0.0 / top_k=1 configuration.
responses_zero_temp = gemini_vision_parameters(
    contents,
    multimodal_model,
    generation_config_1,
)
# Print the result of THIS call. The cell previously printed
# response_zero_temp, the stale value left over from the earlier cell, so the
# repeated request was never actually shown.
print(responses_zero_temp)
A young woman is eating a bowl of soup. She is using a spoon to scoop the soup from the bowl and bring it to her mouth. She is wearing a straw hat and a white shirt. She is sitting at a table in a restaurant.
- Setting `temperature=1` and `top_k=40`
# Configuration with temperature 1 and top_k 40.
generation_config_2 = GenerationConfig(
temperature=1,
top_k=40,
)
# Run the same image prompt with this configuration and keep the full reply text.
responses_high_temp_topk = gemini_vision_parameters(
contents,
multimodal_model,
generation_config_2)
Note: LLMs do not always produce the same results, especially because they are frequently updated. So the output you see in the video might be different from what you get.
print(responses_high_temp_topk)
As I approach the table, I see her sitting there, waiting for me. She is wearing a beautiful dress and her hair is done up in a way that makes her look even more stunning. I can't help but feel lucky to be here with her. We ordered some food and as we wait I can't help but admire the way she eats. She is so graceful and elegant, even when she's eating something as simple as soup. The food was delicious, and it was made even better by the experience of sharing it with her.
Top p¶
- Setting `top_p=0.01`, and maintaining `temperature=1`, `top_k=40`
# Same temperature and top_k as generation_config_2, with top_p=0.01 added.
generation_config_4 = GenerationConfig(
temperature=1,
top_k=40,
top_p=0.01,
)
Note: LLMs do not always produce the same results, especially because they are frequently updated. So the output you see in the video might be different from what you get.
responses_high_temp_topp = gemini_vision_parameters(
contents,
multimodal_model,
generation_config_4)
print(responses_high_temp_topp)
A young woman is eating a bowl of soup. She is using a spoon to scoop the soup from the bowl and bring it to her mouth. She is wearing a straw hat and a white shirt. She is sitting at a table in a restaurant.
Max Output Tokens¶
- The range for `max_output_tokens` is 1 (inclusive) to 2049 (exclusive).
- When using `max_output_tokens`, the number of words returned by the model is 1 less than what you specify.
- So setting `max_output_tokens` to `1` will throw an error, since there would be no generated text.
# Configuration that limits the response via max_output_tokens=10; all other
# parameters are left unset.
generation_config_5 = GenerationConfig(
max_output_tokens=10,
)
Note: LLMs do not always produce the same results, especially because they are frequently updated. So the output you see in the video might be different from what you get.
responses_max_output = gemini_vision_parameters(
contents,
multimodal_model,
generation_config_5)
print(responses_max_output)
A young woman is eating a bowl of soup
Stop Sequences¶
- The range for `stop_sequences` is 1 (inclusive) to 17 (exclusive).
- Multiple (no more than 16) words, numbers, spaces or special characters can be passed as a `list`.
- The model response stops just before the first time it encounters any stop sequence.
# Configuration with a single stop sequence, "panda"; per the notes in this
# section, the response stops just before the first stop sequence it
# encounters.
generation_config_6 = GenerationConfig(
stop_sequences=["panda"]
)
Note: LLMs do not always produce the same results, especially because they are frequently updated. So the output you see in the video might be different from what you get.
responses_stop = gemini_vision_parameters(
contents,
multimodal_model,
generation_config_6)
print(responses_stop)
A young woman is eating a bowl of soup with chopsticks. She is wearing a straw hat and a white shirt. She is sitting at a table in a restaurant.