Spaces:

Jyothirmai
/

image-captioning-chest-xrays

Sleeping

App Files Files Community

Jyothirmai committed on Feb 17, 2024

Commit

afda258

verified ·

1 Parent(s): 8a13188

Upload 8 files

Browse files

Files changed (8) hide show

CXR191_IM-0591-1001.png +0 -0
CXR192_IM-0598-1001.png +0 -0
CXR193_IM-0601-1001.png +0 -0
CXR194_IM-0609-1001.png +0 -0
CXR195_IM-0618-1001.png +0 -0
app.py +35 -23
clipGPT.py +164 -0
model_train_best_run_clipGPT.pt +3 -0

CXR191_IM-0591-1001.png ADDED Viewed

CXR192_IM-0598-1001.png ADDED Viewed

CXR193_IM-0601-1001.png ADDED Viewed

CXR194_IM-0609-1001.png ADDED Viewed

CXR195_IM-0618-1001.png ADDED Viewed

app.py CHANGED Viewed

@@ -1,37 +1,49 @@
 import gradio as gr
-from PIL import Image  # For image handling
-# Replace with paths or loading functions for your specific models
-def load_model_1():
-    # ... load your first model
-    return model_1
-def load_model_2():
-    # ... load your second model
-    return model_2
-def load_model_3():
-    # ... load your third model
-    return model_3
-def generate_caption(model, image):
-    # ... perform inference with your model
     return caption
-# models = [load_model_1(), load_model_2(), load_model_3()]
 with gr.Blocks() as demo:
     with gr.Row():
-        image = gr.Image(label="Upload Chest X-ray")
-    with gr.Row():
-        gr.Radio(["Model 1", "Model 2", "Model 3"], label="Select Model")
     with gr.Row():
         caption = gr.Textbox(label="Generated Caption")
-    # image.change(
-    #     fn=generate_caption,
-    #     inputs=[image, gr.inputs.Radio],
-    #     outputs=caption
-    # )
 demo.launch()

 import gradio as gr
+from PIL import Image
+import clipGPT
+# Define model loading functions (if needed)
+def load_model_1():  # CLIP-GPT2
+    # Placeholder: returns None for now. The visible code never calls this function;
+    # caption generation goes through clipGPT.generate_caption_clipgpt instead.
+    return None
+# ... load_model_2(), load_model_3() - Define if and when needed
+# Caption generation functions
+def generate_caption_clipgpt(image):
+    """Return the caption that clipGPT.generate_caption_clipgpt produces for ``image``."""
+    return clipGPT.generate_caption_clipgpt(image)
+# ... Add more caption generation functions for future models
+# Sample image paths: these must match the PNG files uploaded next to app.py.
+sample_images = [
+    "CXR191_IM-0591-1001.png",
+    "CXR192_IM-0598-1001.png",
+    "CXR193_IM-0601-1001.png",
+    "CXR194_IM-0609-1001.png",
+    "CXR195_IM-0618-1001.png",
+]
+# Gradio interface
 with gr.Blocks() as demo:
     with gr.Row():
+        # type="filepath" makes Gradio pass predict() a path string, which is what
+        # clipGPT.generate_caption_clipgpt reads with skimage's io.imread().
+        image = gr.Image(label="Upload Chest X-ray", type="filepath")
+        # gr.Gallery is Gradio's gallery component; there is no gr.ImageGallery.
+        sample_image_gallery = gr.Gallery(value=sample_images, label="Sample Images")
+    with gr.Row():
+        model_choice = gr.Radio(["CLIP-GPT2", "ViT-GPT2", "ViT-CoAttention"], label="Select Model")
     with gr.Row():
         caption = gr.Textbox(label="Generated Caption")
+    def predict(img, model_name):
+        """Return a caption for ``img`` using the model chosen in the radio group.
+
+        ``img`` is an image file path, or None when the upload widget is empty.
+        ``model_name`` is the selected radio label (None if nothing is selected).
+        """
+        if img is None:
+            # The change event can fire when the image is cleared; nothing to caption.
+            return ""
+        if model_name == "CLIP-GPT2":
+            return generate_caption_clipgpt(img)
+        # Add branches for "ViT-GPT2", "ViT-CoAttention" as you implement them
+        return "Caption generation for this model is not yet implemented."
+    def predict_sample(model_name, evt: gr.SelectData):
+        """Caption the sample image the user clicked in the gallery."""
+        # evt.index is the position of the clicked item, which lines up with sample_images.
+        return predict(sample_images[evt.index], model_name)
+    # Event listeners must be bound on component instances, not on the gr.Image class.
+    image.change(predict, [image, model_choice], caption)
+    # A gallery reports which item was clicked through its select event.
+    sample_image_gallery.select(predict_sample, [model_choice], caption)
 demo.launch()

clipGPT.py ADDED Viewed

	@@ -0,0 +1,164 @@

+import gc
+import json
+import time
+import clip
+import nltk
+import numpy as np
+import pandas as pd
+import PIL.Image
+import skimage.io as io
+import torch
+from IPython.display import Image
+from torch import nn
+from transformers import AutoTokenizer, AutoModel
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
+nltk.download('punkt')  # Import-time side effect: fetches the NLTK 'punkt' resource; no other nltk call is visible in this module.
+class ClipGPT2Model(nn.Module):
+    """GPT-2 language model conditioned on a projected image-feature prefix.
+
+    ``clip_project`` (an ``Adapter``, defined outside the visible code) is built
+    with the sizes (img_feature_size, half prefix width, prefix width); its output
+    is reshaped into ``img_feature_length`` GPT-2-sized embeddings that are placed
+    in front of the caption token embeddings.
+    """
+    def __init__(self, img_feature_length, img_feature_size = 512):
+        super().__init__()
+        # Release cached GPU memory and run the garbage collector before loading GPT-2.
+        torch.cuda.empty_cache()
+        gc.collect()
+        self.img_feature_length = img_feature_length
+        self.gpt = GPT2LMHeadModel.from_pretrained('gpt2')
+        # Second dimension of GPT-2's token-embedding weight matrix.
+        self.gpt_embedding_size = self.gpt.transformer.wte.weight.shape[1]
+        prefix_width = self.gpt_embedding_size * img_feature_length
+        layer_sizes = (img_feature_size, prefix_width // 2, prefix_width)
+        self.clip_project = Adapter(layer_sizes)
+        torch.cuda.empty_cache()
+    def get_dummy_token(self,
+                        batch_size: int,
+                        device: torch.device) -> torch.Tensor:
+        """Return an int64 zero tensor of shape (batch_size, img_feature_length) on ``device``."""
+        shape = (batch_size, self.img_feature_length)
+        return torch.zeros(shape, dtype=torch.int64, device=device)
+    def forward(self,
+                tokens: torch.Tensor,
+                feature: torch.Tensor,
+                mask = None,
+                labels = None):
+        """Run GPT-2 on the image prefix followed by the token embeddings.
+
+        ``tokens`` are embedded with GPT-2's ``wte``; ``feature`` is projected by
+        ``clip_project`` and reshaped to (-1, img_feature_length, gpt_embedding_size).
+        ``mask`` is forwarded as ``attention_mask``. Returns the GPT-2 model output.
+        """
+        torch.cuda.empty_cache()
+        gc.collect()
+        token_embeds = self.gpt.transformer.wte(tokens)
+        prefix_embeds = self.clip_project(feature).view(
+            -1, self.img_feature_length, self.gpt_embedding_size
+        )
+        gpt_inputs = torch.cat((prefix_embeds, token_embeds), dim=1)
+        if labels is not None:
+            # The incoming ``labels`` only acts as a switch: the labels given to GPT-2
+            # are rebuilt as zero placeholders for the prefix followed by ``tokens``.
+            prefix_pad = self.get_dummy_token(tokens.shape[0], tokens.device)
+            labels = torch.cat((prefix_pad, tokens), dim=1)
+        return self.gpt(inputs_embeds=gpt_inputs, labels=labels, attention_mask=mask)
+def generate_beam(
+    model,
+    tokenizer,
+    beam_size: int = 10,
+    prompt=None,
+    embed=None,
+    entry_length=76,
+    temperature=0.9,
+    stop_token: str = ".",
+):
+    """Generate text with beam search and return the candidates best-first.
+
+    Args:
+        model: Object exposing ``gpt`` (called with ``inputs_embeds``, returning an
+            object with ``.logits``) and ``gpt.transformer.wte`` for token embeddings.
+        tokenizer: Provides ``encode`` and ``decode``.
+        beam_size: Number of beams kept after every step.
+        prompt: Text used as the starting context when ``embed`` is None.
+        embed: Pre-computed input embeddings used as the starting context;
+            takes precedence over ``prompt``.
+        entry_length: Maximum number of generation steps.
+        temperature: Divisor applied to the logits; values <= 0 fall back to 1.0.
+        stop_token: A beam is finished once it emits the first token id of this string.
+
+    Returns:
+        A list of decoded strings, one per beam, sorted by length-normalised
+        score in descending order.
+    """
+    model.eval()
+    # Only the first id of the encoded stop string is used as the stop marker.
+    stop_token_index = tokenizer.encode(stop_token)[0]
+    tokens = None
+    scores = None
+    device = next(model.parameters()).device
+    # Per-beam bookkeeping: generated length (starts at 1) and finished flag.
+    seq_lengths = torch.ones(beam_size, device=device)
+    is_stopped = torch.zeros(beam_size, device=device, dtype=torch.bool)
+    with torch.no_grad():
+        if embed is not None:
+            generated = embed
+        else:
+            if tokens is None:
+                # No embedding supplied: start from the embedded prompt tokens.
+                tokens = torch.tensor(tokenizer.encode(prompt))
+                tokens = tokens.unsqueeze(0).to(device)
+                generated = model.gpt.transformer.wte(tokens)
+        for i in range(entry_length):
+            outputs = model.gpt(inputs_embeds=generated)
+            logits = outputs.logits
+            # Keep only the last position, apply temperature, convert to log-probabilities.
+            logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
+            logits = logits.softmax(-1).log()
+            if scores is None:
+                # First step: seed the beams with the top-k tokens and replicate the
+                # context once per beam (assumes a single input sequence; TODO confirm).
+                scores, next_tokens = logits.topk(beam_size, -1)
+                generated = generated.expand(beam_size, *generated.shape[1:])
+                next_tokens, scores = next_tokens.permute(1, 0), scores.squeeze(0)
+                if tokens is None:
+                    tokens = next_tokens
+                else:
+                    tokens = tokens.expand(beam_size, *tokens.shape[1:])
+                    tokens = torch.cat((tokens, next_tokens), dim=1)
+            else:
+                # Finished beams get -inf everywhere except index 0 (set to 0), so each
+                # contributes exactly one candidate whose summed score is unchanged.
+                logits[is_stopped] = -float(np.inf)
+                logits[is_stopped, 0] = 0
+                scores_sum = scores[:, None] + logits
+                seq_lengths[~is_stopped] += 1
+                # Rank every (beam, token) candidate by its length-normalised score.
+                scores_sum_average = scores_sum / seq_lengths[:, None]
+                scores_sum_average, next_tokens = scores_sum_average.view(-1).topk(
+                    beam_size, -1
+                )
+                # Flat index -> (source beam, token id); shape[1] is the logits' last dimension.
+                next_tokens_source = next_tokens // scores_sum.shape[1]
+                seq_lengths = seq_lengths[next_tokens_source]
+                next_tokens = next_tokens % scores_sum.shape[1]
+                next_tokens = next_tokens.unsqueeze(1)
+                # Reorder all per-beam state to follow the surviving candidates.
+                tokens = tokens[next_tokens_source]
+                tokens = torch.cat((tokens, next_tokens), dim=1)
+                generated = generated[next_tokens_source]
+                # Convert back from average to summed score for the next step.
+                scores = scores_sum_average * seq_lengths
+                is_stopped = is_stopped[next_tokens_source]
+            # Append the embedding of the chosen tokens to each beam's context.
+            next_token_embed = model.gpt.transformer.wte(next_tokens.squeeze()).view(
+                generated.shape[0], 1, -1
+            )
+            generated = torch.cat((generated, next_token_embed), dim=1)
+            # Adding boolean tensors acts as a logical OR: once stopped, always stopped.
+            is_stopped = is_stopped + next_tokens.eq(stop_token_index).squeeze()
+            if is_stopped.all():
+                break
+    # Final ranking uses the length-normalised score.
+    scores = scores / seq_lengths
+    output_list = tokens.cpu().numpy()
+    # Decode each beam only up to its recorded length.
+    output_texts = [
+        tokenizer.decode(output[: int(length)])
+        for output, length in zip(output_list, seq_lengths)
+    ]
+    order = scores.argsort(descending=True)
+    output_texts = [output_texts[i] for i in order]
+    return output_texts
+# Loaded components keyed by (prefix_length, feature_dim), so the checkpoint, CLIP and
+# the tokenizer are read once per process instead of on every request.
+_CLIPGPT_CACHE = {}
+def _load_clipgpt_components(prefix_length, feature_dim):
+    """Load (once) and return (model, clip_model, preprocess, tokenizer, device)."""
+    key = (prefix_length, feature_dim)
+    if key not in _CLIPGPT_CACHE:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        model = ClipGPT2Model(prefix_length, img_feature_size = feature_dim)
+        # map_location lets a checkpoint saved from a GPU run load on a CPU-only host.
+        state_dict = torch.load('model_train_best_run_clipGPT.pt', map_location=device)
+        model.load_state_dict(state_dict)
+        model = model.eval().to(device)
+        clip_model, preprocess = clip.load('ViT-B/32', device, jit=False)
+        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        _CLIPGPT_CACHE[key] = (model, clip_model, preprocess, tokenizer, device)
+    return _CLIPGPT_CACHE[key]
+def generate_caption_clipgpt(img):
+    """Generate a caption for one chest X-ray with the CLIP + GPT-2 model.
+
+    Args:
+        img: Anything skimage's ``io.imread`` can read (e.g. a file path), or an
+            already-decoded numpy image array.
+
+    Returns:
+        The first (best-ranked) caption string returned by ``generate_beam``.
+    """
+    prefix_length = 10
+    # 512 is the image-embedding width of CLIP ViT-B/32 and also the default
+    # img_feature_size of ClipGPT2Model.
+    feature_dim = 512
+    model, clip_model, preprocess, tokenizer, device = _load_clipgpt_components(
+        prefix_length, feature_dim
+    )
+    start_time = time.time()
+    # Accept a decoded array as-is; otherwise read the image from disk.
+    image = img if isinstance(img, np.ndarray) else io.imread(img)
+    pil_image = PIL.Image.fromarray(image)
+    image = preprocess(pil_image).unsqueeze(0).to(device)
+    with torch.no_grad():
+        # CLIP image features -> float32 -> projected into prefix_length GPT-2 embeddings.
+        prefix = clip_model.encode_image(image).to(device, dtype=torch.float32)
+        prefix_embed = model.clip_project(prefix).reshape(1, prefix_length, -1)
+        beam_caption = generate_beam(model, tokenizer, embed=prefix_embed)[0]
+    end_time = time.time()
+    print("--- Time taken to generate: %s seconds ---" % (end_time - start_time))
+    return beam_caption

model_train_best_run_clipGPT.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0d75b4bf1a982290d2675a78b1f2bc39fa212178f5f609a555a1725150fe5275
+size 561159626