import gradio as gr
from huggingface_hub import InferenceClient
import pytesseract
from PIL import Image
from pypdf import PdfReader
import ocrmypdf
import os


# Image to Text
# Note: pytesseract (and ocrmypdf below) require the Tesseract binary to be
# installed on the system, not just the Python packages.
def fn_image_to_text(input_image):
    return pytesseract.image_to_string(Image.open(input_image))


# PDF to Text
def fn_pdf_to_text(input_pdf):
    reader = PdfReader(input_pdf)
    output_pdf = ""
    for page in reader.pages:
        output_pdf += page.extract_text()

    image_count = 0
    for page in reader.pages:
        image_count += len(page.images)

    # If the PDF contains images but yielded little text, it is likely a
    # scanned document: OCR it to a temporary copy, re-extract, then clean up.
    if image_count > 0 and len(output_pdf) < 1000:
        input_pdf_ocr = input_pdf.replace(".pdf", " - OCR.pdf")
        ocrmypdf.ocr(input_pdf, input_pdf_ocr, force_ocr=True)
        reader = PdfReader(input_pdf_ocr)
        output_pdf = ""
        for page in reader.pages:
            output_pdf += page.extract_text()
        os.remove(input_pdf_ocr)

    return output_pdf


# Inference
model_text = "google/gemma-2-27b-it"
model_vision = "google/paligemma2-3b-pt-224"

client = InferenceClient()


def fn_text(
    prompt,
    history,
    input,
    #system_prompt,
    max_tokens,
    temperature,
    top_p,
):
    # Extract text from the uploaded file (if any) and append it to the prompt.
    # Initializing output up front avoids a NameError for unsupported file types.
    output = ""
    if input:
        extension = os.path.splitext(input)[1].lower()
        if extension in [".png", ".jpg", ".jpeg"]:
            output = fn_image_to_text(input)
        elif extension == ".pdf":
            output = fn_pdf_to_text(input)

    #messages = [{"role": "system", "content": system_prompt}]
    #history.append(messages[0])
    #messages.append({"role": "user", "content": prompt})
    #history.append(messages[1])

    messages = [{"role": "user", "content": prompt + " " + output}]
    history.append(messages[0])

    stream = client.chat.completions.create(
        model = model_text,
        messages = history,
        max_tokens = max_tokens,
        temperature = temperature,
        top_p = top_p,
        stream = True,
    )

    # Accumulate streamed deltas and yield the running text so the UI
    # updates token by token.
    chunks = []
    for chunk in stream:
        chunks.append(chunk.choices[0].delta.content or "")
        yield "".join(chunks)


app_text = gr.ChatInterface(
    fn = fn_text,
    type = "messages",
    additional_inputs = [
        gr.File(type="filepath", label="Input"),
        #gr.Textbox(value="You are a helpful assistant.", label="System Prompt"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
    ],
    title = "Google Gemma",
    description = model_text,
)


def fn_vision(
    prompt,
    image_url,
    #system_prompt,
    max_tokens,
    temperature,
    top_p,
):
    messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]

    if image_url:
        messages[0]["content"].append({"type": "image_url", "image_url": {"url": image_url}})

    stream = client.chat.completions.create(
        model = model_vision,
        messages = messages,
        max_tokens = max_tokens,
        temperature = temperature,
        top_p = top_p,
        stream = True,
    )

    chunks = []
    for chunk in stream:
        chunks.append(chunk.choices[0].delta.content or "")
        yield "".join(chunks)


app_vision = gr.Interface(
    fn = fn_vision,
    inputs = [
        gr.Textbox(label="Prompt"),
        gr.Textbox(label="Image URL")
    ],
    outputs = [
        gr.Textbox(label="Output")
    ],
    additional_inputs = [
        #gr.Textbox(value="You are a helpful assistant.", label="System Prompt"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
    ],
    title = "Google Gemma",
    description = model_vision,
)

# Keep the TabbedInterface object in `app` (rather than the return value of
# .launch()) and launch it under the standard main guard.
app = gr.TabbedInterface(
    [app_text, app_vision],
    ["Text", "Vision"]
)

if __name__ == "__main__":
    app.launch()