Commit a95ba86 by chats-bug (parent: 1d4f82c)
Added fine-tuning options
app.py
CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
 import torch
 from PIL import Image
 
-from model import
+from model import BlipBaseModel, GitBaseCocoModel
 
 MODELS = {
     "Git-Base-COCO": GitBaseCocoModel,
@@ -12,33 +12,38 @@ MODELS = {
 def generate_captions(
     image,
     num_captions,
+    model_name,
     max_length,
     temperature,
     top_k,
     top_p,
     repetition_penalty,
     diversity_penalty,
-    model_name,
 ):
     """
     Generates captions for the given image.
 
     -----
     Parameters:
     image: PIL.Image
         The image to generate captions for.
-    max_len: int
-        The maximum length of the caption.
     num_captions: int
         The number of captions to generate.
-
+    ** Rest of the parameters are the same as in the model.generate method. **
     -----
     Returns:
     list[str]
     """
+    # Convert the numerical values to their corresponding types.
+    # Gradio Slider returns values as floats, except when the value is a whole number, in which case it returns an int.
+    # Only float values suffer from this issue.
+    temperature = float(temperature)
+    top_p = float(top_p)
+    repetition_penalty = float(repetition_penalty)
+    diversity_penalty = float(diversity_penalty)
 
     device = "cuda" if torch.cuda.is_available() else "cpu"
 
     model = MODELS[model_name](device)
 
     captions = model.generate(
@@ -56,32 +61,34 @@ def generate_captions(
     captions = "\n".join(captions)
     return captions
 
-title = "
-description = "
+title = "AI tool for generating captions for images"
+description = "This tool uses pretrained models to generate captions for images."
 
 interface = gr.Interface(
     fn=generate_captions,
     inputs=[
-        gr.
-        gr.
-        gr.
-        gr.
-        gr.
-        gr.
-        gr.
-        gr.
-        gr.
+        gr.components.Image(type="pil", label="Image"),
+        gr.components.Slider(minimum=1, maximum=10, step=1, value=1, label="Number of Captions to Generate"),
+        gr.components.Dropdown(MODELS.keys(), label="Model", value=list(MODELS.keys())[1]),  # Default to Blip Base
+        gr.components.Slider(minimum=20, maximum=100, step=5, value=50, label="Maximum Caption Length"),
+        gr.components.Slider(minimum=0.1, maximum=10.0, step=0.1, value=1.0, label="Temperature"),
+        gr.components.Slider(minimum=1, maximum=100, step=1, value=50, label="Top K"),
+        gr.components.Slider(minimum=0.1, maximum=5.0, step=0.1, value=1.0, label="Top P"),
+        gr.components.Slider(minimum=1.0, maximum=10.0, step=0.1, value=2.0, label="Repetition Penalty"),
+        gr.components.Slider(minimum=0.0, maximum=10.0, step=0.1, value=2.0, label="Diversity Penalty"),
     ],
     outputs=[
-        gr.
+        gr.components.Textbox(label="Caption"),
     ],
     title=title,
     description=description,
-)
+    allow_flagging="never",
+)
 
 
 if __name__ == "__main__":
+    # Launch the interface.
     interface.launch(
         enable_queue=True,
-        debug=True
+        debug=True,
     )
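
For a quick sanity check outside the Gradio UI, the reworked generate_captions can also be called directly from Python. The sketch below is illustrative only and not part of the commit: it assumes app.py is importable from the working directory, that a local test image named example.jpg exists, and it uses the "Git-Base-COCO" entry that is visible in MODELS, passing the remaining generation parameters at their UI defaults.

from PIL import Image

from app import generate_captions  # importing app builds the gr.Interface but does not launch it

image = Image.open("example.jpg")  # any local test image

captions = generate_captions(
    image,
    num_captions=2,
    model_name="Git-Base-COCO",
    max_length=50,
    temperature=1.0,
    top_k=50,
    top_p=1.0,
    repetition_penalty=1.0,
    diversity_penalty=2.0,
)
print(captions)

Because of the float() coercions added at the top of the function, the sampling and penalty arguments behave the same whether they are passed as ints or floats; num_captions, max_length, and top_k must still be ints to satisfy the asserts in model.generate.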
model.py
CHANGED
@@ -7,26 +7,41 @@ class ImageCaptionModel:
         processor,
         model,
     ) -> None:
+        """
+        Initializes the model for generating captions for images.
+
+        -----
+        Parameters:
+        device: str
+            The device to use for the model. Must be either "cpu" or "cuda".
+        processor: transformers.AutoProcessor
+            The preprocessor to use for the model.
+        model: transformers.AutoModelForCausalLM or transformers.BlipForConditionalGeneration
+            The model to use for generating captions.
+
+        -----
+        Returns:
+        None
+        """
         self.device = device
         self.processor = processor
         self.model = model
         self.model.to(self.device)
 
     def generate(
         self,
         image,
-        num_captions=1,
-        max_length=50,
-
-
-
-
-
-        diversity_penalty=0.0,
+        num_captions: int = 1,
+        max_length: int = 50,
+        temperature: float = 1.0,
+        top_k: int = 50,
+        top_p: float = 1.0,
+        repetition_penalty: float = 1.0,
+        diversity_penalty: float = 0.0,
    ):
         """
         Generates captions for the given image.
 
         -----
         Parameters:
         preprocessor: transformers.PreTrainedTokenizerFast
@@ -37,8 +52,6 @@ class ImageCaptionModel:
             The image to generate captions for.
         num_captions: int
             The number of captions to generate.
-        num_beam_groups: int
-            The number of beam groups to use for beam search in order to maintain diversity. Must be between 1 and num_beams. 1 means no group beam search.
         temperature: float
             The temperature to use for sampling. The value used to modulate the next-token probabilities in the model's generate method. Must be strictly positive. Defaults to 1.0.
         top_k: int
@@ -49,25 +62,45 @@ class ImageCaptionModel:
             The parameter for repetition penalty. 1.0 means no penalty. Defaults to 1.0.
         diversity_penalty: float
             The parameter for diversity penalty. 0.0 means no penalty. Defaults to 0.0.
 
         """
-
-
-
-
-
-
-
-
-            temperature=temperature,
-            top_k=top_k,
-            top_p=top_p,
-            repetition_penalty=repetition_penalty,
-            diversity_penalty=diversity_penalty,
-        )
+        # Type checking and making sure the values are valid.
+        assert type(num_captions) == int and num_captions > 0, "num_captions must be a positive integer."
+        assert type(max_length) == int and max_length > 0, "max_length must be a positive integer."
+        assert type(temperature) == float and temperature > 0.0, "temperature must be a positive float."
+        assert type(top_k) == int and top_k > 0, "top_k must be a positive integer."
+        assert type(top_p) == float and top_p > 0.0, "top_p must be a positive float."
+        assert type(repetition_penalty) == float and repetition_penalty >= 1.0, "repetition_penalty must be a float greater than or equal to 1.0."
+        assert type(diversity_penalty) == float and diversity_penalty >= 0.0, "diversity_penalty must be a non-negative float."
 
+        # Convert the image to pixel values.
+        pixel_values = self.processor(images=image, return_tensors="pt").pixel_values.to(self.device)
+
+        # Generate the caption ids.
+        if num_captions == 1:
+            generated_ids = self.model.generate(
+                pixel_values=pixel_values,
+                max_length=max_length,
+                num_return_sequences=1,
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+            )
+        else:
+            generated_ids = self.model.generate(
+                pixel_values=pixel_values,
+                max_length=max_length,
+                num_beams=num_captions,  # num_beams must be greater than or equal to num_captions and must be divisible by num_beam_groups.
+                num_beam_groups=num_captions,  # num_beam_groups is set equal to num_captions so that the captions are diverse;
+                num_return_sequences=num_captions,  # plain beam search would otherwise return captions that are very similar to each other.
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+                repetition_penalty=repetition_penalty,
+                diversity_penalty=diversity_penalty,
+            )
+
+        # Decode the generated ids to get the captions.
         generated_caption = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
-        generated_caption = [generated_caption[i] for i in range(0, num_captions*2, 2)]
 
         return generated_caption
 
@@ -79,8 +112,8 @@ class GitBaseCocoModel(ImageCaptionModel):
 
     -----
     Parameters:
-    device:
-        The device to run the model on.
+    device: str
+        The device to run the model on, either "cpu" or "cuda".
    checkpoint: str
        The checkpoint to load the model from.
 
@@ -93,42 +126,24 @@ class GitBaseCocoModel(ImageCaptionModel):
         model = AutoModelForCausalLM.from_pretrained(checkpoint)
         super().__init__(device, processor, model)
 
-    def generate(self, image, max_length=50, num_captions=1, **kwargs):
-        """
-        Generates captions for the given image.
-
-        -----
-        Parameters:
-        image: PIL.Image
-            The image to generate captions for.
-        max_len: int
-            The maximum length of the caption.
-        num_captions: int
-            The number of captions to generate.
-        """
-        captions = super().generate(image, max_length, num_captions, **kwargs)
-        return captions
-
 
 class BlipBaseModel(ImageCaptionModel):
     def __init__(self, device):
-        self.checkpoint = "Salesforce/blip-image-captioning-base"
-        processor = AutoProcessor.from_pretrained(self.checkpoint)
-        model = BlipForConditionalGeneration.from_pretrained(self.checkpoint)
-        super().__init__(device, processor, model)
-
-    def generate(self, image, max_length=50, num_captions=1, **kwargs):
         """
-
+        A wrapper class for the Blip-Base model. It is a pretrained model for image captioning.
 
         -----
         Parameters:
-
-            The
-
-            The
-
-
+        device: str
+            The device to run the model on, either "cpu" or "cuda".
+        checkpoint: str
+            The checkpoint to load the model from.
+
+        -----
+        Returns:
+        None
         """
-
-
+        self.checkpoint = "Salesforce/blip-image-captioning-base"
+        processor = AutoProcessor.from_pretrained(self.checkpoint)
+        model = BlipForConditionalGeneration.from_pretrained(self.checkpoint)
+        super().__init__(device, processor, model)
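
The wrapper classes in model.py can also be used on their own, without app.py. The sketch below is illustrative only and not part of the commit: it assumes model.py is on the import path and that a local test image named example.jpg exists. Requesting more than one caption exercises the group beam search branch added in this commit, so a non-zero diversity_penalty is passed to make the beam groups differ.

import torch
from PIL import Image

from model import BlipBaseModel

device = "cuda" if torch.cuda.is_available() else "cpu"
model = BlipBaseModel(device)

image = Image.open("example.jpg")  # any local test image

captions = model.generate(
    image,
    num_captions=3,        # > 1, so the diverse (group) beam search branch is taken
    max_length=50,
    temperature=1.0,
    top_k=50,
    top_p=1.0,
    repetition_penalty=1.0,
    diversity_penalty=1.0,
)
for caption in captions:
    print(caption)

Note that the type asserts in generate are strict: passing temperature=1 (an int) raises an AssertionError, which is exactly the Gradio Slider quirk the float() coercions in app.py guard against.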