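"""Gradio demo that serves a Llama 3.2 based model loaded with Unsloth.

The user's prompt is translated to English, the model's reply is streamed back
token by token, and the reply is translated into the detected language of the
original prompt.
"""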
from threading import Thread

import gradio as gr
import torch
from unsloth import FastLanguageModel  # import before transformers so Unsloth's patches apply
from transformers import TextStreamer, TextIteratorStreamer
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
model_name = "Danielrahmai1991/llama32_ganjoor_adapt_basic_model_16bit_v1"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    trust_remote_code=True,
    # token = "hf_...",  # use one if using gated models like meta-llama/Llama-2-7b-hf
)
FastLanguageModel.for_inference(model)  # enable Unsloth's faster inference mode
print("model loaded")
import re

from deep_translator import (
    GoogleTranslator,
    PonsTranslator,
    LingueeTranslator,
    MyMemoryTranslator,
    YandexTranslator,
    DeeplTranslator,
    QcriTranslator,
    single_detection,
    batch_detection,
)
# from pyaspeller import YandexSpeller
#
# def error_correct_pyspeller(sample_text):
#     """Grammar correction of the input text."""
#     speller = YandexSpeller()
#     fixed = speller.spelled(sample_text)
#     return fixed
#
# def postprocessing(inp_text: str):
#     """Post-processing of the LLM response."""
#     inp_text = re.sub('<[^>]+>', '', inp_text)
#     inp_text = inp_text.split('##', 1)[0]
#     inp_text = error_correct_pyspeller(inp_text)
#     return inp_text
# streamer = TextStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens = True)
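# Running chat history shared across requests (a single global conversation).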
messages = []
def generate_text(prompt, max_length, top_p, top_k):
    global messages
    # Detect the prompt's language so the reply can be translated back to it.
    lang = single_detection(prompt, api_key='4ab77f25578d450f0902fb42c66d5e11')
    # if lang == 'en':
    #     prompt = error_correct_pyspeller(prompt)
    en_translated = GoogleTranslator(source='auto', target='en').translate(prompt)
    messages.append({"role": "user", "content": en_translated})
    # messages.append({"role": "user", "content": prompt})
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    # Stream tokens as they are generated instead of waiting for the full output.
    streamer = TextIteratorStreamer(
        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        max_length=int(max_length),  # caps prompt + generated tokens combined
        top_p=float(top_p),
        top_k=int(top_k),
        do_sample=True,
        temperature=0.6,
        repetition_penalty=1.2,
        streamer=streamer,
    )
    # _ = model.generate(input_ids, streamer=streamer, max_new_tokens=int(max_length),
    #                    pad_token_id=tokenizer.eos_token_id,
    #                    temperature=0.6,
    #                    top_k=int(top_k),
    #                    top_p=float(top_p),
    #                    repetition_penalty=1.2)
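    # Run generation in a background thread so tokens can be yielded as they arrive.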
    t = Thread(target=model.generate, args=(input_ids,), kwargs=generate_kwargs)
    t.start()

    generated_text = []
    for text in streamer:
        generated_text.append(text)
        # print(generated_text)
        # yield "".join(generated_text)
        # Translate the partial output back into the language of the original prompt.
        yield GoogleTranslator(source='auto', target=lang).translate("".join(generated_text))
    messages.append({"role": "assistant", "content": "".join(generated_text)})
description = """
# Deploy our LLM
"""
inputs = [
    gr.Textbox(label="Prompt text", lines=5),
    gr.Textbox(label="Max generation length", value=100),
    gr.Slider(0.0, 1.0, label="top-p value", value=0.95),
    gr.Textbox(label="top-k", value=50),
]
outputs = [gr.Textbox(label="Generated Text", lines= 10)]
demo = gr.Interface(fn=generate_text, inputs=inputs, outputs=outputs, description=description)
demo.launch(debug=True, share=True) |