alcaitiff
/

LLM-CAPTION

Safetensors

English

Model card | Files and versions | Community

Alan Rabello committed on Dec 6, 2024

Commit

d400378

1 Parent(s): 0d413ec

Add img extensions

Browse files

Files changed (1) hide show

caption.py +22 -20

caption.py CHANGED Viewed

@@ -11,7 +11,7 @@ CHECKPOINT_PATH = Path("./checkpoint")
 LLMA_CHECKPOINT = "John6666/Llama-3.1-8B-Lexi-Uncensored-V2-nf4"
 WORDS=200
 PROMPT = "In one paragraph, write a very descriptive caption for this image, describe all objects, characters and their actions, describe in detail what is happening and their emotions. Include information about lighting, the style of this image and information about camera angle within {word_count} words. Don't create any title for the image."
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 class ImageAdapter(nn.Module):
@@ -130,9 +130,6 @@ def proc_img(input_image):
 	], dim=1).to(device)
 	attention_mask = torch.ones_like(input_ids)
-	# Debugging
-	#print(f"Input to model: {repr(tokenizer.decode(input_ids[0]))}")
 	#generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, top_k=10, temperature=0.5, suppress_tokens=None)
 	generate_ids = text_model.generate(input_ids, inputs_embeds=input_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, suppress_tokens=None)	 # Uses the default which is temp=0.6, top_p=0.9
@@ -150,10 +147,6 @@ def describe_image(image_path):
 				print(f"File not found: {image_path}")
 				return
-		if not image_path.lower().endswith(".png"):
-				print("File must be PNG.")
-				return
 		image = Image.open(image_path).convert("RGB")
 		description = proc_img(image)
@@ -173,27 +166,36 @@ if __name__ == "__main__":
 		parser = argparse.ArgumentParser(description="Caption all PNG image files in a folder")
 		parser.add_argument("folder_path", type=str, help="Folder containing images.")
 		parser.add_argument("--prompt", type=str, help="Prompt to ask a caption.", default=None, required=False)
 		args = parser.parse_args()
-		# Process all PNG images in the folder
 		folder_path = Path(args.folder_path)
 		if not folder_path.is_dir():
 				print(f"Error: {folder_path} is not a valid directory.")
 				exit(1)
-		png_files = list(folder_path.glob("*.png"))
-		if not png_files:
-				print(f"No PNG files found in the directory: {folder_path}")
-				exit(1)
 		# Prompt
-		if args.prompt is None:
-			prompt_str = PROMPT.format(word_count=WORDS)
 		else:
-			prompt_str = args.prompt
-		total = len(png_files)
-		print(f"Found {total} PNG files. Processing...")
 		device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -228,7 +230,7 @@ if __name__ == "__main__":
 		image_adapter.to(device)
 		curr = 1
-		for image_path in png_files:
 				print(f"Processing image {curr} of {total}: {image_path}")
 				curr += 1
 				describe_image(str(image_path))

 LLMA_CHECKPOINT = "John6666/Llama-3.1-8B-Lexi-Uncensored-V2-nf4"
 WORDS=200
 PROMPT = "In one paragraph, write a very descriptive caption for this image, describe all objects, characters and their actions, describe in detail what is happening and their emotions. Include information about lighting, the style of this image and information about camera angle within {word_count} words. Don't create any title for the image."
+IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.webp')
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 class ImageAdapter(nn.Module):
 	], dim=1).to(device)
 	attention_mask = torch.ones_like(input_ids)
 	#generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, top_k=10, temperature=0.5, suppress_tokens=None)
 	generate_ids = text_model.generate(input_ids, inputs_embeds=input_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, suppress_tokens=None)	 # Uses the default which is temp=0.6, top_p=0.9
 				print(f"File not found: {image_path}")
 				return
 		image = Image.open(image_path).convert("RGB")
 		description = proc_img(image)
 		parser = argparse.ArgumentParser(description="Caption all PNG image files in a folder")
 		parser.add_argument("folder_path", type=str, help="Folder containing images.")
 		parser.add_argument("--prompt", type=str, help="Prompt to ask a caption.", default=None, required=False)
+		parser.add_argument("--output_dir", type=str, help="Output dir.", default=None, required=False)
 		args = parser.parse_args()
+		# Prompt
+		if args.prompt is None:
+			prompt_str = PROMPT.format(word_count=WORDS)
+		else:
+			prompt_str = args.prompt
+		# Process all images in the folder
 		folder_path = Path(args.folder_path)
 		if not folder_path.is_dir():
 				print(f"Error: {folder_path} is not a valid directory.")
 				exit(1)
 		# Prompt
+		if args.output_dir is None:
+			output_dir = folder_path
 		else:
+			output_dir = args.output_dir
+		img_files = [f for f in folder_path.iterdir() if f.suffix.lower() in IMAGE_EXTENSIONS]
+		img_files = [f for f in img_files if not Path(output_dir,f"{f.stem}.txt").exists()]
+		if not img_files:
+			print(f"No image files without caption found in the directory: {folder_path}")
+			exit(1)
+		total = len(img_files)
+		print(f"Found {total} IMAGE files without caption. Processing...")
 		device = "cuda" if torch.cuda.is_available() else "cpu"
 		image_adapter.to(device)
 		curr = 1
+		for image_path in img_files:
 				print(f"Processing image {curr} of {total}: {image_path}")
 				curr += 1
 				describe_image(str(image_path))