# Example inspired from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct # Import necessary libraries from PIL import Image import requests from transformers import AutoModelForCausalLM from transformers import AutoProcessor from transformers import BitsAndBytesConfig from transformers import TrOCRProcessor, VisionEncoderDecoderModel import torch import pandas as pd from torchmetrics.text import CharErrorRate from peft import PeftModel, PeftConfig # Define model ID model_id = "microsoft/Phi-3-vision-128k-instruct" peft_model_id = "hadrakey/alphapen_phi3" peft_model_id_new = "hadrakey/alphapen_new_large" # Load processor processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # phi3 finetuned # config = PeftConfig.from_pretrained(peft_model_id) # processor_fine = AutoProcessor.from_pretrained(config.base_model_name_or_path, trust_remote_code=True) # Finetuned model # config_new = PeftConfig.from_pretrained(peft_model_id_new) model_finetune = VisionEncoderDecoderModel.from_pretrained("hadrakey/alphapen_large") # model_new_finetune = AutoModelForCausalLM.from_pretrained(config_new.base_model_name_or_path, device_map="auto", trust_remote_code=True, torch_dtype="auto") # model_finetune_phi3 = AutoModelForCausalLM.from_pretrained("hadrakey/alphapen_phi3", trust_remote_code=True) #Baseline model_base = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten") processor_ocr = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") # processor_ocr_new = AutoProcessor.from_pretrained(config_new.base_model_name_or_path, device_map="auto", trust_remote_code=True, torch_dtype="auto") # Define BitsAndBytes configuration for 4-bit quantization nf4_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16, ) # Load model with 4-bit quantization and map to CUDA model = AutoModelForCausalLM.from_pretrained( model_id, device_map="cuda", trust_remote_code=True, torch_dtype="auto", quantization_config=nf4_config, ) # base_model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, device_map="auto", trust_remote_code=True, torch_dtype="auto") # model_finetune_phi3 = PeftModel.from_pretrained(base_model, peft_model_id) # Define initial chat message with image placeholder messages = [{"role": "user", "content": """<|image_1|>\nThis image contains handwritten French characters forming a complete or partial word. The image is blurred, which makes recognition challenging. Please analyze the image to the best of your ability and provide your best guess of the French word or partial word shown, even if you're not certain. Follow these guidelines: 1. Examine the overall shape and any discernible character features. 2. Consider common French letter combinations and word patterns. 3. If you can only identify some characters, provide those as a partial word. 4. Make an educated guess based on what you can see, even if it's just a few letters. 5. If you can see any characters at all, avoid responding with "indiscernible." Your response should be only the predicted French word or partial word, using lowercase letters unless capital letters are clearly visible. If you can see any characters or shapes at all, provide the OCR from the image. """}] # messages = [{"role": "user", "content": """<|image_1|>\nWhat is shown is this images ? You should only output only your guess otherwise output the OCR. # """}] # Download image from URL url = "https://images.unsplash.com/photo-1528834342297-fdefb9a5a92b?ixlib=rb-4.0.3&q=85&fm=jpg&crop=entropy&cs=srgb&dl=roonz-nl-vjDbHCjHlEY-unsplash.jpg&w=640" # image = Image.open(requests.get(url, stream=True).raw) df_path = "/mnt/data1/Datasets/AlphaPen/" + "testing_data.csv" data = pd.read_csv(df_path) data.dropna(inplace=True) data.reset_index(inplace=True) sample = data.iloc[:5000,:] root_dir = "/mnt/data1/Datasets/OCR/Alphapen/clean_data/" # Prepare prompt with image token prompt = processor.tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) cer_metric = CharErrorRate() phi_output=[] phi_finetune_output=[] inf_baseline = [] inf_finetune = [] inf_finetune_new = [] cer_phi = [] cer_phi_finetune = [] cer_trocr_fine_new = [] cer_trocr_fine = [] cer_trocr_base = [] for idx in range(len(sample)): # idx=30 # choose the image image = Image.open(root_dir + "final_cropped_rotated_" + data.filename[idx]).convert("RGB") # Process prompt and image for model input inputs = processor(prompt, [image], return_tensors="pt").to("cuda:0") # Generate text response using model generate_ids = model.generate( **inputs, eos_token_id=processor.tokenizer.eos_token_id, max_new_tokens=500, do_sample=False, ) # Remove input tokens from generated response generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :] # Decode generated IDs to text response = processor.batch_decode( generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False )[0] phi_output.append(response) cer_phi.append(cer_metric(response.lower(), data.text[idx].lower()).detach().numpy()) # Generate text response using model finetuned # generate_ids_fine = model_finetune_phi3.generate( # **inputs, # eos_token_id=processor.tokenizer.eos_token_id, # max_new_tokens=500, # do_sample=False, # ) # # Remove input tokens from generated response # inputs = processor_fine(prompt, [image], return_tensors="pt").to("cuda:0") # generate_ids_fine = generate_ids_fine[:, inputs["input_ids"].shape[1] :] # Decode generated IDs to text # response = processor.batch_decode( # generate_ids_fine, skip_special_tokens=True, clean_up_tokenization_spaces=False # )[0] # phi_finetune_output.append(response) # cer_phi_finetune.append(cer_metric(response, data.text[idx]).detach().numpy()) # Trocr pixel_values = processor_ocr(image, return_tensors="pt").pixel_values generated_ids_base = model_base.generate(pixel_values) generated_ids_fine = model_finetune.generate(pixel_values) # generated_ids_fine_new = model_finetune_new.generate(pixel_values) generated_text_base = processor_ocr.batch_decode(generated_ids_base, skip_special_tokens=True)[0] generated_text_fine= processor_ocr.batch_decode(generated_ids_fine, skip_special_tokens=True)[0] # generated_text_fine_new= processor_ocr_new.batch_decode(generated_ids_fine_new, skip_special_tokens=True)[0] inf_baseline.append(generated_text_base) inf_finetune.append(generated_text_fine) # inf_finetune_new.append(generated_text_fine_new) # cer_trocr_fine_new.append(cer_metric(generated_text_fine_new, data.text[idx]).detach().numpy()) cer_trocr_fine.append(cer_metric(generated_text_fine.lower(), data.text[idx].lower()).detach().numpy()) cer_trocr_base.append(cer_metric(generated_text_base.lower(), data.text[idx].lower()).detach().numpy()) # Print the generated response sample["phi3"]=phi_output # sample["phi3_fine"]=phi_finetune_output sample["Baseline"]=inf_baseline sample["Finetune"]=inf_finetune # sample["Finetune_new"]=inf_finetune_new sample["cer_phi"]=cer_phi # sample["cer_phi_fine"]=cer_phi_finetune sample["cer_trocr_base"]=cer_trocr_base sample["cer_trocr_fine"]=cer_trocr_fine # sample["cer_trocr_fine_new"]=cer_trocr_fine_new sample.to_csv("/mnt/data1/Datasets/AlphaPen/" + "sample_data.csv")