import gradio as gr
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from peft import PeftConfig, PeftModel

# The base model from the Hugging Face hub
model_name = "bigscience/bloomz-7b1"

# Fine-tuned model name
new_model = "bigscience/bloomz-7b1"

# ---------------------------------------------------------------------------
# LoRA / training hyperparameters carried over from the fine-tuning setup.
# They are not used at inference time and are kept here for reference only.
# ---------------------------------------------------------------------------

# LoRA attention dimension
lora_r = 16
lora_alpha = 16
# Dropout probability for LoRA layers
lora_dropout = 0.05

output_dir = "./results"
# Number of training epochs
num_train_epochs = 1
fp16 = False
bf16 = False
per_device_train_batch_size = 1
per_device_eval_batch_size = 4
gradient_accumulation_steps = 8
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 5e-5
weight_decay = 0.001
optim = "paged_adamw_8bit"
lr_scheduler_type = "constant"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 100
logging_steps = 25
max_seq_length = None
packing = False
# device_map = {"": 0}

# ---------------------------------------------------------------------------
# 4-bit quantization settings (used when loading the base model below)
# ---------------------------------------------------------------------------

use_4bit = True
# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load the quantized base model and attach the LoRA adapter
config = PeftConfig.from_pretrained("mohamedemam/essay_checker")
model = AutoModelForCausalLM.from_pretrained(
    "nfaheem/Marcoroni-7b-DPO-Merge",
    quantization_config=bnb_config,
    device_map="auto",
)
model = PeftModel.from_pretrained(model, "mohamedemam/essay_checker")
model.eval()

# Tokenizer for the base checkpoint loaded above
tokenizer = AutoTokenizer.from_pretrained("nfaheem/Marcoroni-7b-DPO-Merge")


def chat_format(context, question, answer):
    # Build the instruction prompt for the answer checker
    return (
        "Instruction:\n check answer is true or false of next quetion using context below:\n"
        + "context: " + context
        + "\nquetion:" + question
        + ".\n#student answer: " + answer
        + ".\n#response:"
    )


# Check a student answer, with a configurable generation length
def generate_qa(context, question, answer, max_new_token):
    input_text = chat_format(context, question, answer)
    inputs = tokenizer(text=input_text, return_tensors="pt").to(model.device)

    # Generate with configurable parameters
    output = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_token,
    )
    generated_text = tokenizer.batch_decode(output, skip_special_tokens=True)
    formatted_output = "\n\n".join(set(generated_text))
    return formatted_output


iface = gr.Interface(
    fn=generate_qa,
    inputs=[
        gr.Textbox(label="Context"),
        gr.Textbox(label="Question"),
        gr.Textbox(label="Answer"),
        gr.Slider(minimum=1, maximum=100, value=3, step=1, label="max token"),
    ],
    outputs=gr.Textbox(label="Generated Output"),
    title="check answers",
    description="Put your context, question and student answer to check whether the answer is correct.",
)

# Launch the interface
iface.launch()