Spaces:
Running
Running
import streamlit as st | |
# Set the page layout to 'wide' | |
st.set_page_config(layout="wide") | |
import requests | |
from PIL import Image | |
from io import BytesIO | |
# from IPython.display import display | |
import base64 | |
import time | |
import random | |
# helper decoder | |
def decode_base64_image(image_string):
    """Turn a base64-encoded image payload into an opened PIL image.

    Args:
        image_string: Base64 data containing an encoded image file.

    Returns:
        The object returned by ``Image.open`` for the decoded bytes.
    """
    raw_bytes = base64.b64decode(image_string)
    return Image.open(BytesIO(raw_bytes))
# resize an image so the caller can display it at a fixed size
def display_image(image=None, width=500, height=500):
    """Return ``image`` resized to ``width`` x ``height``.

    Despite its name, this function renders nothing; it only calls
    ``image.resize`` and hands the result back to the caller.

    Args:
        image: Object exposing ``resize((width, height))``, e.g. a PIL image.
        width: Target width passed to ``resize``.
        height: Target height passed to ``resize``.

    Returns:
        Whatever ``image.resize((width, height))`` returns.

    Raises:
        ValueError: If ``image`` is ``None`` (the signature's default).
    """
    if image is None:
        # The None default is kept for signature compatibility only; fail
        # with a clear message instead of an AttributeError on None.resize.
        raise ValueError("display_image() requires an image, got None")
    return image.resize((width, height))
def pretty_print(messages):
    """Format chat messages as ``role: content`` lines.

    Args:
        messages: Iterable of mappings that each provide ``'role'`` and
            ``'content'`` keys.

    Returns:
        A string with one ``role: content`` line per message, joined by
        newlines. An empty iterable yields an empty string.
    """
    # Build every line before returning; returning from inside the loop would
    # drop all messages after the first.
    return "\n".join(
        f"{message['role']}: {message['content']}" for message in messages
    )
# API Gateway endpoint URL; the "Generate Image" handler POSTs its JSON
# payload (prompt, parameters, endpoint name) to this address.
api_url = 'https://a02q342s5b.execute-api.us-east-2.amazonaws.com/reinvent-demo-inf2-sm-20231114'
# # Define the CSS to change the text input background color | |
# input_field_style = """ | |
# <style> | |
# /* Customize the text input field background and text color */ | |
# .stTextInput input { | |
# background-color: #fbd8bf; /* 'Rind' color */ | |
# color: #232F3E; /* Dark text color */ | |
# } | |
# /* You might also want to change the color for textarea if you're using it */ | |
# .stTextArea textarea { | |
# background-color: #fbd8bf; /* 'Rind' color */ | |
# color: #232F3E; /* Dark text color */ | |
# } | |
# </style> | |
# """ | |
# # Inject custom styles into the Streamlit app | |
# st.markdown(input_field_style, unsafe_allow_html=True) | |
# Creating Tabs
tab1, tab2, tab3, tab4 = st.tabs(["Image Generation", "Architecture", "Stable Diffusion Architecture", "Code"])

# Upper bound (seconds) for one call to the inference API. Without a timeout
# requests.post can block forever; the value is generous because the tutorial
# text in the "Code" tab notes the first request can take 45-60 s.
REQUEST_TIMEOUT_SECONDS = 120

with tab1:
    # Create two columns for layout: inputs on the left, results on the right
    left_column, right_column = st.columns(2)
    with right_column:
        # Container created up front so the button handler, which runs inside
        # the left column, can still write its output into the right column.
        cont = st.container()
    # ===========
    with left_column:
        # Define Streamlit UI elements
        st.title('Stable Diffusion XL Image Generation with AWS Inferentia2')
        sample_prompts = [
            "A futuristic cityscape at sunset, cyberpunk",
            "A serene landscape with mountains and a river, photorealistic style",
            "An astronaut riding a horse, artistic and surreal",
            "A robot playing chess in a medieval setting, high detail",
            "An underwater scene with colorful coral reefs and fish, vibrant colors",
            "Raccoon astronaut in space, sci-fi, future, cold color palette, muted colors, detailed, 8k",
            "A lost city rediscovered in the Amazon jungle, overgrown with plants, in the style of a vintage travel poster",
            "A steampunk train emitting clouds of steam as it races through a mountain pass, digital art",
            "An enchanted forest with bioluminescent trees and fairies dancing, in a Studio Ghibli style",
            "A portrait of an elegant alien empress with a detailed headdress, reminiscent of Art Nouveau",
            "A post-apocalyptic Tokyo with nature reclaiming skyscrapers, in the style of a concept art",
            "A mythical phoenix rising from ashes, vibrant colors, with a nebula in the background",
            "A cybernetic wolf in a neon-lit city, cyberpunk theme, rain-drenched streets",
            "A high fantasy battle scene with dragons in the sky and knights on the ground, epic scale",
            "An ice castle on a lonely mountain peak, under the northern lights, fantasy illustration",
            "A surreal landscape where giant flowers bloom in the desert, with a distant thunderstorm, hyperrealism",
        ]

        def set_random_prompt():
            """Button callback: put a random sample prompt into the prompt box."""
            # Assigning to the widget's session-state key updates the text area
            st.session_state.prompt_one = random.choice(sample_prompts)

        prompt_one = st.text_area("Enter your prompt:",
                                  key="prompt_one")
        st.button('Random Prompt', on_click=set_random_prompt)
        # Number of inference steps
        num_inference_steps_one = st.slider("Number of Inference Steps",
                                            min_value=1,
                                            max_value=100,
                                            value=15,
                                            help="More steps might improve quality, with diminishing marginal returns. 30-50 seems best, but your mileage may vary.")
        # Create an expandable section for optional parameters
        with st.expander("Optional Parameters"):
            # Random seed input
            seed_one = st.number_input("Random seed",
                                       value=555,
                                       help="Set to the same value to generate the same image if other inputs are the same, change to generate a different image for same inputs.")
            # Negative prompt input
            negative_prompt_one = st.text_area("Enter your negative prompt:",
                                               "cartoon, graphic, text, painting, crayon, graphite, abstract glitch, blurry")
        if st.button('Generate Image'):
            with st.spinner(f'Generating Image with {num_inference_steps_one} iterations'):
                start_time = time.time()
                # ===============
                # Request payload sent to the API
                prompt_input_one = {
                    "prompt": prompt_one,
                    "parameters": {
                        "num_inference_steps": num_inference_steps_one,
                        "seed": seed_one,
                        "negative_prompt": negative_prompt_one
                    },
                    "endpoint": "huggingface-pytorch-inference-neuronx-2023-11-14-21-22-10-388"
                }
                # Make API request; network failures and timeouts are reported
                # in the UI instead of surfacing as an unhandled exception.
                try:
                    response_one = requests.post(api_url,
                                                 json=prompt_input_one,
                                                 timeout=REQUEST_TIMEOUT_SECONDS)
                except requests.exceptions.RequestException as exc:
                    response_one = None
                    st.error(f"Error: request to the inference API failed: {exc}")

                # Process and display the response
                if response_one is None:
                    pass  # failure already reported above
                elif response_one.status_code == 200:
                    try:
                        result_one = response_one.json()
                        image_one = display_image(decode_base64_image(result_one["generated_images"][0]))
                    except (ValueError, KeyError, IndexError, TypeError, OSError) as exc:
                        # A 200 response whose body is not JSON, lacks
                        # "generated_images", or does not decode to an image.
                        st.error(f"Error: unexpected response from the inference API: {exc!r}")
                    else:
                        cont.image(image_one,
                                   caption=prompt_one)
                        end_time = time.time()
                        total_time = round(end_time - start_time, 2)
                        cont.text(f"Prompt: {prompt_one}")
                        cont.text(f"Number of Iterations: {num_inference_steps_one}")
                        cont.text(f"Random Seed: {seed_one}")
                        cont.text(f'Total time taken: {total_time} seconds')
                        # Time per iteration in seconds (the slider minimum of
                        # 1 rules out division by zero)
                        time_per_iteration_s = total_time / num_inference_steps_one
                        cont.text(f'Time per iteration: {time_per_iteration_s:.2f} seconds')
                else:
                    st.error(f"Error: {response_one.text}")
# with pass: | |
# st.title('Llama 2 7B Text Generation with AWS Inferentia 2') | |
# params = { | |
# "do_sample" : True, | |
# "top_p": 0.6, | |
# "temperature": 0.9, | |
# "top_k": 50, | |
# "max_new_tokens": 512, | |
# "repetition_penalty": 1.03, | |
# } | |
# if "messages" not in st.session_state: | |
# st.session_state.messages = [ | |
# {"role": "system", "content": "You are a helpful Travel Planning Assistant. You respond with only 1-2 sentences."}, | |
# {'role': 'user', 'content': 'Where can I travel in the fall for cloudy, rainy, and beautiful views?'}, | |
# ] | |
# for message in st.session_state.messages: | |
# with st.chat_message(message["role"]): | |
# st.markdown(message["content"]) | |
# with st.chat_message("assistant"): | |
# message_placeholder = st.empty() | |
# full_response = "" | |
# prompt_input_one = { | |
# "prompt": st.session_state.messages, | |
# "parameters": params, | |
# "endpoint": "huggingface-pytorch-inference-neuronx-2023-11-28-16-09-51-708" | |
# } | |
# response_one = requests.post(api_url, json=prompt_input_one) | |
# if response_one.status_code == 200: | |
# result_one = response_one.json() | |
# # st.success(f"Prediction result: {result}") | |
# full_response += result_one["generation"] | |
# else: | |
# st.error(f"Error: {response_one.text}") | |
# message_placeholder.markdown(full_response) | |
# st.session_state.messages.append({"role": "assistant", "content": full_response}) | |
# if prompt := st.chat_input("What is up?"): | |
# st.session_state.messages.append({"role": "user", "content": prompt}) | |
# print(st.session_state.messages) | |
# with st.chat_message("user"): | |
# st.markdown(prompt) | |
# with st.chat_message("assistant"): | |
# message_placeholder = st.empty() | |
# new_response = "" | |
# prompt_input_one = { | |
# "prompt": st.session_state.messages, | |
# "parameters": params, | |
# "endpoint": "huggingface-pytorch-inference-neuronx-2023-11-28-16-09-51-708" | |
# } | |
# response_one = requests.post(api_url, json=prompt_input_one) | |
# if response_one.status_code == 200: | |
# result_one = response_one.json() | |
# # st.success(f"Prediction result: {result}") | |
# new_response += result_one["generation"] | |
# else: | |
# st.error(f"Error: {response_one.text}") | |
# message_placeholder.markdown(new_response) | |
# st.session_state.messages.append({"role": "assistant", "content": new_response}) | |
pass | |
with tab2:
    # "Architecture" tab: narrative on the left, diagram on the right, with a
    # thin unused column between them acting as a gutter.
    # ===========
    left_column, _, right_column = st.columns([2,.2,3])
    with right_column:
        # Five <br> elements add vertical space above the diagram
        for _spacer in range(5):
            st.markdown("""<br>""", unsafe_allow_html=True)
        st.image('./architecture.png', caption="Application Architecture")
    with left_column:
        st.write("## Architecture Overview")
        st.write("This diagram illustrates the architecture of our Generative AI service, which is composed of several interconnected AWS services, notable Amazon Elastic Compute Cloud (Amazon EC2). Here's a detailed look at each component:")
        # (expander title, markdown body) pairs, rendered in order below
        architecture_sections = [
            (
                "(1) Inference Models",
                "\n"
                "- The architecture starts with our trained machine learning models hosted on Amazon SageMaker, running on AWS Inferentia 2 instance (`inf2.xlarge`).\n"
                "- There are two models shown here, Stable Diffusion XL for image generation, and Llama 2 7B for text generation.\n",
            ),
            (
                "(2) Amazon SageMaker Endpoints",
                "\n"
                "- The models are exposed via SageMaker Endpoints, which provide scalable and secure real-time inference services.\n"
                "- These endpoints are the interfaces through which the models receive input data and return predictions.\n",
            ),
            (
                "(3) AWS Lambda",
                "\n"
                "- AWS Lambda functions serve as the middle layer, handling the logic of communicating with the SageMaker Endpoints.\n"
                "- Lambda can process the incoming requests, perform any necessary transformations, call the endpoints, and then process the results before sending them back.\n",
            ),
            (
                "(4) Amazon API Gateway",
                "\n"
                "- The processed results from Lambda are then routed through Amazon API Gateway.\n"
                "- API Gateway acts as a front door to manage all incoming API requests, including authorization, throttling, and CORS handling.\n",
            ),
            (
                "(5) Streamlit Frontend",
                "\n"
                "- Finally, our Streamlit application provides a user-friendly interface for end-users to interact with the service.\n"
                "- It sends requests to the API Gateway and displays the returned predictions from the machine learning models.\n",
            ),
        ]
        for section_title, section_body in architecture_sections:
            with st.expander(section_title):
                st.markdown(section_body)
        closing_summary = (
            "\n"
            "In summary, this architecture enables a scalable, serverless, and responsive Generative AI service that can serve real-time predictions to users directly from a web interface.\n"
        )
        st.write(closing_summary)
with tab3:
    # "Stable Diffusion Architecture" tab: explanation on the left, diagram on
    # the right, with a thin unused column between them acting as a gutter.
    left_column, _, right_column = st.columns([2,.2,3])
    with right_column:
        # One <br> adds a little space above the diagram
        st.markdown("""<br>""", unsafe_allow_html=True)
        st.image('./sdxl_arch.png', caption="SDXL Architecture")
    with left_column:
        st.write("## SDXL Architecture Overview")
        sdxl_overview = (
            "\n"
            "The stable diffusion model takes both a latent seed and a text prompt as an input. The latent seed is then used to generate random latent image representations of size 64×64 where as the text prompt is transformed to text embeddings of size 77×768 via CLIP's text encoder.\n"
            "Next the U-Net iteratively denoises the random latent image representations while being conditioned on the text embeddings. The output of the U-Net, being the noise residual, is used to compute a denoised latent image representation via a scheduler algorithm. Many different scheduler algorithms can be used for this computation, each having its pro- and cons.\n"
            "Theory on how the scheduler algorithm function is out-of-scope for this demo, but in short one should remember that they compute the predicted denoised image representation from the previous noise representation and the predicted noise residual.\n"
            "The denoising process is repeated ca. 50 times to step-by-step retrieve better latent image representations. Once complete, the latent image representation is decoded by the decoder part of the variational auto encoder.\n"
        )
        st.write(sdxl_overview)
with tab4: | |
with st.expander("(1) Deploy GenAI Model to AWS Inferentia 2 Instance and Amazon SageMaker Endpoint"): | |
st.markdown( | |
""" | |
[Source] This code is modified from this fantastic blog by Phil Schmid at HuggingFace: https://www.philschmid.de/inferentia2-stable-diffusion-xl | |
# Deploy Stable Diffusion on AWS inferentia2 with Amazon SageMaker | |
In this end-to-end tutorial, you will learn how to deploy and speed up Stable Diffusion XL inference using AWS Inferentia2 and [optimum-neuron](https://huggingface.co/docs/optimum-neuron/index) on Amazon SageMaker. [Optimum Neuron](https://huggingface.co/docs/optimum-neuron/index) is the interface between the Hugging Face Transformers & Diffusers library and AWS Accelerators including AWS Trainium and AWS Inferentia2. | |
You will learn how to: | |
1. Convert Stable Diffusion XL to AWS Neuron (Inferentia2) with `optimum-neuron` | |
2. Create a custom `inference.py` script for Stable Diffusion | |
3. Upload the neuron model and inference script to Amazon S3 | |
4. Deploy a Real-time Inference Endpoint on Amazon SageMaker | |
5. Generate images using the deployed model | |
## Quick intro: AWS Inferentia 2 | |
[AWS inferentia (Inf2)](https://aws.amazon.com/de/ec2/instance-types/inf2/) instances are purpose-built EC2 instances for deep learning (DL) inference workloads. Inferentia 2 is the successor of [AWS Inferentia](https://aws.amazon.com/ec2/instance-types/inf1/?nc1=h_ls) and promises to deliver up to 4x higher throughput and up to 10x lower latency.
| instance size | accelerators | Neuron Cores | accelerator memory | vCPU | CPU Memory | on-demand price ($/h) | | |
| ------------- | ------------ | ------------ | ------------------ | ---- | ---------- | --------------------- | | |
| inf2.xlarge | 1 | 2 | 32 | 4 | 16 | 0.76 | | |
| inf2.8xlarge | 1 | 2 | 32 | 32 | 128 | 1.97 | | |
| inf2.24xlarge | 6 | 12 | 192 | 96 | 384 | 6.49 | | |
| inf2.48xlarge | 12 | 24 | 384 | 192 | 768 | 12.98 | | |
Additionally, inferentia 2 will support the writing of custom operators in c++ and new datatypes, including `FP8` (cFP8). | |
Let's get started! 🚀 | |
*If you are going to use SageMaker in a local environment (not SageMaker Studio or Notebook Instances), you need access to an IAM Role with the required permissions for SageMaker. You can find out more about it [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html).*
## 1. Convert Stable Diffusion to AWS Neuron (Inferentia2) with `optimum-neuron` | |
We are going to use the [optimum-neuron](https://huggingface.co/docs/optimum-neuron/index) to compile/convert our model to neuronx. Optimum Neuron provides a set of tools enabling easy model loading, training and inference on single- and multi-Accelerator settings for different downstream tasks. | |
As a first step, we need to install the `optimum-neuron` and other required packages. | |
*Tip: If you are using Amazon SageMaker Notebook Instances or Studio you can go with the `conda_python3` conda kernel.* | |
```python | |
# Install the required packages | |
%pip install "optimum-neuron==0.0.13" "diffusers==0.21.4" --upgrade | |
%pip install "sagemaker>=2.197.0" --upgrade | |
``` | |
After we have installed `optimum-neuron`, we can load and convert our model.
We are going to use the [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) model. Stable Diffusion XL (SDXL) from [Stability AI](https://stability.ai/) is the newest text-to-image generation model, which can create photorealistic images with detailed imagery and composition compared to previous SD models, including SD 2.1.
At the time of writing, [AWS Inferentia2 does not support dynamic shapes for inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-features/dynamic-shapes.html?highlight=dynamic%20shapes#), which means that we need to specify our image size in advance for compiling and inference.
In simpler terms, this means we need to define the input shapes for our prompt (sequence length), batch size, height and width of the image. | |
We precompiled the model with the following parameters and pushed it to the Hugging Face Hub: | |
* `height`: 1024 | |
* `width`: 1024 | |
* `sequence_length`: 128 | |
* `num_images_per_prompt`: 1 | |
* `batch_size`: 1 | |
* `neuron`: 2.15.0 | |
_Note: If you want to compile your own model or a different Stable Diffusion XL checkpoint you need to use ~120GB of memory and the compilation can take ~45 minutes. We used an `inf2.8xlarge` ec2 instance with the [Hugging Face Neuron Deep Learning AMI](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) to compile the model._ | |
```python | |
from huggingface_hub import snapshot_download | |
# compiled model id | |
compiled_model_id = "aws-neuron/stable-diffusion-xl-base-1-0-1024x1024" | |
# save compiled model to local directory | |
save_directory = "sdxl_neuron" | |
# Downloads our compiled model from the HuggingFace Hub | |
# using the revision as neuron version reference | |
# and makes sure we exclude the symlink files and "hidden" files, like .DS_Store, .gitignore, etc.
snapshot_download(compiled_model_id, revision="2.15.0", local_dir=save_directory, local_dir_use_symlinks=False, allow_patterns=["[!.]*.*"]) | |
############################################### | |
# COMMENT IN BELOW TO COMPILE DIFFERENT MODEL # | |
############################################### | |
# | |
# from optimum.neuron import NeuronStableDiffusionXLPipeline | |
# | |
# # model id you want to compile | |
# vanilla_model_id = "stabilityai/stable-diffusion-xl-base-1.0" | |
# | |
# # configs for compiling model | |
# compiler_args = {"auto_cast": "all", "auto_cast_type": "bf16"} | |
# input_shapes = { | |
# "height": 1024, # height of the image
# "width": 1024, # width of the image
# "num_images_per_prompt": 1, # number of images to generate per prompt | |
# "batch_size": 1 # batch size for the model | |
# } | |
# | |
# sd = NeuronStableDiffusionXLPipeline.from_pretrained(vanilla_model_id, export=True, **input_shapes, **compiler_args) | |
# | |
# # Save locally or upload to the HuggingFace Hub | |
# save_directory = "sdxl_neuron" | |
# sd.save_pretrained(save_directory) | |
``` | |
## 2. Create a custom `inference.py` script for Stable Diffusion | |
The [Hugging Face Inference Toolkit](https://github.com/aws/sagemaker-huggingface-inference-toolkit) supports zero-code deployments on top of the [pipeline feature](https://huggingface.co/transformers/main_classes/pipelines.html) from 🤗 Transformers. This allows users to deploy Hugging Face transformers without an inference script [[Example](https://github.com/huggingface/notebooks/blob/master/sagemaker/11_deploy_model_from_hf_hub/deploy_transformer_model_from_hf_hub.ipynb)]. | |
Currently, this feature is not supported with AWS Inferentia2, which means we need to provide an `inference.py` for running inference. But `optimum-neuron` has integrated support for the 🤗 Diffusers pipeline feature. That way we can use `optimum-neuron` to create a pipeline for our model.
If you want to know more about the `inference.py` script check out this [example](https://github.com/huggingface/notebooks/blob/master/sagemaker/17_custom_inference_script/sagemaker-notebook.ipynb). It explains amongst other things what the `model_fn` and `predict_fn` are. | |
```python | |
# create code directory in our model directory | |
!mkdir {save_directory}/code | |
``` | |
We are using `NEURON_RT_NUM_CORES=2` to make sure that each HTTP worker uses 2 Neuron cores to maximize throughput.
```python | |
%%writefile {save_directory}/code/inference.py | |
import os | |
# To use two neuron core per worker | |
os.environ["NEURON_RT_NUM_CORES"] = "2" | |
import torch | |
import torch_neuronx | |
import base64 | |
from io import BytesIO | |
from optimum.neuron import NeuronStableDiffusionXLPipeline | |
def model_fn(model_dir): | |
# load local converted model into pipeline | |
pipeline = NeuronStableDiffusionXLPipeline.from_pretrained(model_dir, device_ids=[0, 1]) | |
return pipeline | |
def predict_fn(data, pipeline): | |
# extract prompt from data | |
prompt = data.pop("inputs", data) | |
parameters = data.pop("parameters", None) | |
if parameters is not None: | |
generated_images = pipeline(prompt, **parameters)["images"] | |
else: | |
generated_images = pipeline(prompt)["images"] | |
# postprocess convert image into base64 string | |
encoded_images = [] | |
for image in generated_images: | |
buffered = BytesIO() | |
image.save(buffered, format="JPEG") | |
encoded_images.append(base64.b64encode(buffered.getvalue()).decode()) | |
# return all generated images as base64-encoded strings
return {"generated_images": encoded_images} | |
``` | |
## 3. Upload the neuron model and inference script to Amazon S3 | |
Before we can deploy our neuron model to Amazon SageMaker, we need to upload all our model artifacts to Amazon S3.
_Note: Currently `inf2` instances are only available in the `us-east-2` & `us-east-1` region [[REF](https://aws.amazon.com/de/about-aws/whats-new/2023/05/sagemaker-ml-inf2-ml-trn1-instances-model-deployment/)]. Therefore we need to force the region to us-east-2._ | |
Lets create our SageMaker session and upload our model to Amazon S3. | |
```python | |
import sagemaker | |
import boto3 | |
sess = sagemaker.Session() | |
# sagemaker session bucket -> used for uploading data, models and logs | |
# sagemaker will automatically create this bucket if it not exists | |
sagemaker_session_bucket=None | |
if sagemaker_session_bucket is None and sess is not None: | |
# set to default bucket if a bucket name is not given | |
sagemaker_session_bucket = sess.default_bucket() | |
try: | |
role = sagemaker.get_execution_role() | |
except ValueError: | |
iam = boto3.client('iam') | |
role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn'] | |
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket) | |
print(f"sagemaker role arn: {role}") | |
print(f"sagemaker bucket: {sess.default_bucket()}") | |
print(f"sagemaker session region: {sess.boto_region_name}") | |
assert sess.boto_region_name in ["us-east-2", "us-east-1"] , "region must be us-east-2 or us-east-1, due to instance availability"
``` | |
We create our `model.tar.gz` with our `inference.py` script.
```python | |
# create a model.tar.gz archive with all the model artifacts and the inference.py script. | |
%cd {save_directory} | |
!tar zcvf model.tar.gz * | |
%cd .. | |
``` | |
Next, we upload our `model.tar.gz` to Amazon S3 using our session bucket and `sagemaker` sdk. | |
```python | |
from sagemaker.s3 import S3Uploader | |
# create s3 uri | |
s3_model_path = f"s3://{sess.default_bucket()}/neuronx/sdxl" | |
# upload model.tar.gz | |
s3_model_uri = S3Uploader.upload(local_path=f"{save_directory}/model.tar.gz", desired_s3_uri=s3_model_path) | |
print(f"model artifcats uploaded to {s3_model_uri}") | |
``` | |
## 4. Deploy a Real-time Inference Endpoint on Amazon SageMaker | |
After we have uploaded our model artifacts to Amazon S3, we can create a custom `HuggingFaceModel`. This class will be used to create and deploy our real-time inference endpoint on Amazon SageMaker.
The `inf2.xlarge` instance type is the smallest instance type with AWS Inferentia2 support. It comes with 1 Inferentia2 chip with 2 Neuron Cores. This means we can use 2 Neuron Cores to minimize latency for our image generation. | |
```python | |
from sagemaker.huggingface.model import HuggingFaceModel | |
# create Hugging Face Model Class | |
huggingface_model = HuggingFaceModel( | |
model_data=s3_model_uri, # path to your model.tar.gz on s3 | |
role=role, # iam role with permissions to create an Endpoint | |
transformers_version="4.34.1", # transformers version used | |
pytorch_version="1.13.1", # pytorch version used | |
py_version='py310', # python version used | |
model_server_workers=1, # number of workers for the model server | |
) | |
# deploy the endpoint endpoint | |
predictor = huggingface_model.deploy( | |
initial_instance_count=1, # number of instances | |
instance_type="ml.inf2.xlarge", # AWS Inferentia Instance | |
volume_size = 100 | |
) | |
# ignore the "Your model is not compiled. Please compile your model before using Inferentia." warning, we already compiled our model. | |
``` | |
## 5. Generate images using the deployed model
The `.deploy()` returns an `HuggingFacePredictor` object which can be used to request inference. Our endpoint expects a `json` with at least `inputs` key. The `inputs` key is the input prompt for the model, which will be used to generate the image. Additionally, we can provide inference parameters, e.g. `num_inference_steps`. | |
The `predictor.predict()` function returns a `json` with the `generated_images` key. The `generated_images` key contains the `1` generated image as a `base64` encoded string. To decode our response we added a small helper function `decode_base64_image`, which takes the `base64` encoded string and returns a `PIL.Image` object, and `display_image`, which displays it.
```python | |
from PIL import Image | |
from io import BytesIO | |
from IPython.display import display | |
import base64 | |
# helper decoder | |
def decode_base64_image(image_string): | |
base64_image = base64.b64decode(image_string) | |
buffer = BytesIO(base64_image) | |
return Image.open(buffer) | |
# display PIL images as grid | |
def display_image(image=None,width=500,height=500): | |
img = image.resize((width, height)) | |
display(img) | |
``` | |
Now, let's generate some images. As an example, we use the prompt `A dog trying catch a flying pizza in style of comic book, at a street corner.`. Generating an image with 25 steps takes ~6 seconds, except for the first request, which can take 45-60s.
_note: If the request times out, just rerun again. Only the first request takes a long time._ | |
```python | |
prompt = "A dog trying catch a flying pizza at a street corner, comic book, well lit, night time" | |
# run prediction | |
response = predictor.predict(data={ | |
"inputs": prompt, | |
"parameters": { | |
"num_inference_steps" : 25, | |
"negative_prompt" : "disfigured, ugly, deformed" | |
} | |
} | |
) | |
# decode and display image | |
display_image(decode_base64_image(response["generated_images"][0])) | |
``` | |
### Delete model and endpoint | |
To clean up, we can delete the model and endpoint. | |
```python | |
predictor.delete_model() | |
predictor.delete_endpoint() | |
``` | |
```python | |
``` | |
""" | |
) | |
with st.expander("(2) AWS Lambda Function to handle inference requests"): | |
st.markdown( | |
""" | |
```python | |
import boto3 | |
import json | |
def lambda_handler(event, context): | |
# SageMaker endpoint details | |
endpoint_name = 'INSERT_YOUR_SAGEMAKER_ENDPOINT_NAME_HERE' | |
runtime = boto3.client('sagemaker-runtime') | |
# Sample input data (modify as per your model's input requirements) | |
# Get the prompt from the Lambda function input | |
print("======== event payload: ==========") | |
print(event['body']) | |
print("======== prompt payload: ==========") | |
event_parsed = json.loads(event['body']) | |
prompt = event_parsed.get('prompt', '') | |
print(prompt) | |
print("======== params payload: ==========") | |
params = event_parsed.get('parameters','') | |
print(params) | |
# Prepare input data | |
model_input = { | |
'inputs': prompt, | |
'parameters': params | |
} | |
input_data = json.dumps(model_input) | |
# Make a prediction request to the SageMaker endpoint | |
response = runtime.invoke_endpoint(EndpointName=endpoint_name, | |
ContentType='application/json', | |
Body=input_data) | |
# Parse the response | |
result = response['Body'].read() | |
return { | |
'statusCode': 200, | |
'body': result | |
} | |
``` | |
""" | |
) | |
with st.expander("(3) Streamlit app.py, running on Amazon EC2 t2.micro instance"): | |
st.markdown( | |
""" | |
```python | |
import streamlit as st | |
# Set the page layout to 'wide' | |
st.set_page_config(layout="wide") | |
import requests | |
from PIL import Image | |
from io import BytesIO | |
import base64 | |
import time | |
# helper decoder | |
def decode_base64_image(image_string): | |
base64_image = base64.b64decode(image_string) | |
buffer = BytesIO(base64_image) | |
return Image.open(buffer) | |
# display PIL images as grid | |
def display_image(image=None,width=500,height=500): | |
img = image.resize((width, height)) | |
return img | |
# API Gateway endpoint URL | |
api_url = 'INSERT_YOUR_API_GATEWAY_ENDPOINT_URL_HERE' | |
# Create two columns for layout | |
left_column, right_column = st.columns(2) | |
# =========== | |
with left_column: | |
# Define Streamlit UI elements | |
st.title('Stable Diffusion XL Image Generation with AWS Inferentia') | |
prompt_one = st.text_area("Enter your prompt:", | |
f"Raccoon astronaut in space, sci-fi, future, cold color palette, muted colors, detailed, 8k") | |
# Number of inference steps | |
num_inference_steps_one = st.slider("Number of Inference Steps", | |
min_value=1, | |
max_value=100, | |
value=30, | |
help="More steps might improve quality, with diminishing marginal returns. 30-50 seems best, but your mileage may vary.") | |
# Create an expandable section for optional parameters | |
with st.expander("Optional Parameters"): | |
# Random seed input | |
seed_one = st.number_input("Random seed", | |
value=555, | |
help="Set to the same value to generate the same image if other inputs are the same, change to generate a different image for same inputs.") | |
# Negative prompt input | |
negative_prompt_one = st.text_area("Enter your negative prompt:", | |
"cartoon, graphic, text, painting, crayon, graphite, abstract glitch, blurry") | |
if st.button('Generate Image'): | |
with st.spinner(f'Generating Image with {num_inference_steps_one} iterations'): | |
with right_column: | |
start_time = time.time() | |
# =============== | |
# Example input data | |
prompt_input_one = { | |
"prompt": prompt_one, | |
"parameters": { | |
"num_inference_steps": num_inference_steps_one, | |
"seed": seed_one, | |
"negative_prompt": negative_prompt_one | |
} | |
} | |
# Make API request | |
response_one = requests.post(api_url, json=prompt_input_one) | |
# Process and display the response | |
if response_one.status_code == 200: | |
result_one = response_one.json() | |
# st.success(f"Prediction result: {result}") | |
image_one = display_image(decode_base64_image(result_one["generated_images"][0])) | |
st.image(image_one, | |
caption=f"{prompt_one}") | |
end_time = time.time() | |
total_time = round(end_time - start_time, 2) | |
st.text(f"Prompt: {prompt_one}") | |
st.text(f"Number of Iterations: {num_inference_steps_one}") | |
st.text(f"Random Seed: {seed_one}") | |
st.text(f'Total time taken: {total_time} seconds') | |
# Calculate and display the time per iteration in milliseconds | |
time_per_iteration_ms = (total_time / num_inference_steps_one) | |
st.text(f'Time per iteration: {time_per_iteration_ms:.2f} seconds') | |
else: | |
st.error(f"Error: {response_one.text}") | |
``` | |
""" | |
) |