"""Gradio demo that classifies an (image, text) pair as fake or real news.

The module builds a ``MultimodalClassifier``, loads its weights from
``model_weights.pth`` using hyper-parameters from ``config.json`` and then
launches a Gradio interface around :func:`predict`.
"""

import json
import logging
from typing import Literal

import gradio as gr
import torch
from PIL import Image
from torch import nn
from torchvision import transforms
from transformers import AutoModel, AutoTokenizer

logger = logging.getLogger(__name__)


# Define Multimodal Classifier
class MultimodalClassifier(nn.Module):
    """Text + image classifier built on two pretrained encoders.

    Each modality is encoded, projected to ``projection_dim`` and the two
    projections are fused (concatenation or element-wise product) before a
    final linear classification layer.
    """

    def __init__(
        self,
        text_encoder_id_or_path: str,
        image_encoder_id_or_path: str,
        projection_dim: int,
        fusion_method: Literal["concat", "align", "cosine_similarity"] = "concat",
        proj_dropout: float = 0.1,
        fusion_dropout: float = 0.1,
        num_classes: int = 1,
    ) -> None:
        """Build the encoders, projection heads, fusion layer and classifier.

        Args:
            text_encoder_id_or_path: Identifier or path passed to
                ``AutoModel.from_pretrained`` for the text encoder.
            image_encoder_id_or_path: Identifier or path passed to
                ``AutoModel.from_pretrained`` for the image encoder.
            projection_dim: Output size of both projection heads and of the
                fusion layer.
            fusion_method: ``"concat"`` concatenates the two projections; any
                other value multiplies them element-wise.
            proj_dropout: Dropout probability applied after each projection.
            fusion_dropout: Dropout probability used inside the fusion layer.
            num_classes: Number of outputs of the classification layer.
        """
        super().__init__()
        self.fusion_method = fusion_method
        self.projection_dim = projection_dim
        self.num_classes = num_classes

        # Text Encoder
        self.text_encoder = AutoModel.from_pretrained(text_encoder_id_or_path)
        self.text_projection = nn.Sequential(
            nn.Linear(self.text_encoder.config.hidden_size, self.projection_dim),
            nn.Dropout(proj_dropout),
        )

        # Image Encoder
        self.image_encoder = AutoModel.from_pretrained(
            image_encoder_id_or_path, trust_remote_code=True
        )
        self.image_encoder.classifier = nn.Identity()  # Remove classification head
        # NOTE(review): 512 is hard-coded; presumably the channel count of the
        # final feature map of the ResNet-34 encoder used by load_model() --
        # confirm before swapping in a different image encoder.
        self.image_projection = nn.Sequential(
            nn.Linear(512, self.projection_dim),
            nn.Dropout(proj_dropout),
        )

        # Fusion Layer: concatenation doubles the input width, the
        # element-wise product keeps it at projection_dim.
        fusion_input_dim = (
            self.projection_dim * 2 if fusion_method == "concat" else self.projection_dim
        )
        self.fusion_layer = nn.Sequential(
            nn.Dropout(fusion_dropout),
            nn.Linear(fusion_input_dim, self.projection_dim),
            nn.GELU(),
            nn.Dropout(fusion_dropout),
        )

        # Classification Layer
        self.classifier = nn.Linear(self.projection_dim, self.num_classes)

    def forward(
        self,
        pixel_values: torch.Tensor,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Return the raw classifier outputs (no sigmoid/softmax applied).

        Args:
            pixel_values: Image batch forwarded to the image encoder.
            input_ids: Token ids forwarded to the text encoder.
            attention_mask: Attention mask forwarded to the text encoder.

        Returns:
            Output of the final linear layer, with ``num_classes`` values in
            its last dimension.
        """
        # Text Encoder Projection
        full_text_features = self.text_encoder(
            input_ids=input_ids, attention_mask=attention_mask, return_dict=True
        ).last_hidden_state
        full_text_features = full_text_features[:, 0, :]  # CLS token
        full_text_features = self.text_projection(full_text_features)

        # Image Encoder Projection
        resnet_image_features = self.image_encoder(pixel_values=pixel_values).last_hidden_state
        # Global average pooling over the last two dimensions
        # (assumes a (batch, channels, height, width) feature map -- TODO confirm).
        resnet_image_features = resnet_image_features.mean(dim=[-2, -1])
        resnet_image_features = self.image_projection(resnet_image_features)

        # Fusion
        if self.fusion_method == "concat":
            fused_features = torch.cat([full_text_features, resnet_image_features], dim=-1)
        else:
            # Both "align" and "cosine_similarity" take this element-wise product path.
            fused_features = full_text_features * resnet_image_features

        # Classification
        fused_features = self.fusion_layer(fused_features)
        classification_output = self.classifier(fused_features)
        return classification_output


# Load the model
def load_model():
    """Build the classifier from ``config.json`` and load ``model_weights.pth``.

    Returns:
        The ``MultimodalClassifier`` with the checkpoint applied, on CPU.
    """
    with open("config.json", "r", encoding="utf-8") as f:
        config = json.load(f)

    model = MultimodalClassifier(
        text_encoder_id_or_path=config["text_encoder_id_or_path"],
        image_encoder_id_or_path="microsoft/resnet-34",
        projection_dim=config["projection_dim"],
        fusion_method=config["fusion_method"],
        proj_dropout=config["proj_dropout"],
        fusion_dropout=config["fusion_dropout"],
        num_classes=config["num_classes"],
    )

    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # from a trusted source (consider weights_only=True where the installed
    # torch version supports it).
    checkpoint = torch.load("model_weights.pth", map_location=torch.device("cpu"))

    # strict=False tolerates key mismatches; report them instead of silently
    # running with partially initialised weights.
    load_result = model.load_state_dict(checkpoint, strict=False)
    if load_result.missing_keys:
        logger.warning("Checkpoint is missing keys: %s", load_result.missing_keys)
    if load_result.unexpected_keys:
        logger.warning("Checkpoint has unexpected keys: %s", load_result.unexpected_keys)
    return model


# Load model and tokenizer
model = load_model()
model.eval()
text_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Image transform pipeline
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


# Prediction function
def predict(image: Image.Image, text: str) -> str:
    """Classify an image/text pair as ``"Fake News"`` or ``"Real News"``.

    Args:
        image: PIL image supplied by the Gradio image component.
        text: News text supplied by the Gradio textbox.

    Returns:
        ``"Fake News"`` when the rounded sigmoid of the model output equals 1,
        otherwise ``"Real News"``.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Gradio passes None when the user submits without uploading an image.
        raise gr.Error("Please upload an image before submitting.")

    # Normalize expects exactly three channels; convert grayscale/RGBA/palette inputs.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Process text input
    text_inputs = text_tokenizer(
        text or "",
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )

    # Process image input
    image_input = image_transform(image).unsqueeze(0)  # Add batch dimension

    # Model inference
    with torch.no_grad():
        classification_output = model(
            pixel_values=image_input,
            input_ids=text_inputs["input_ids"],
            attention_mask=text_inputs["attention_mask"],
        )
        # .item() requires a single-element output, i.e. num_classes == 1.
        predicted_class = torch.sigmoid(classification_output).round().item()

    return "Fake News" if predicted_class == 1 else "Real News"


# Gradio Interface
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Upload Related Image"),
        gr.Textbox(lines=2, placeholder="Enter news text for classification...", label="Input Text"),
    ],
    outputs=gr.Label(label="Prediction"),
    title="Fake News Detector",
    description="Upload an image and provide text to classify the news as 'Fake' or 'Real'.",
)

interface.launch()