eronariodito committed on
Commit dc706d8 · verified · 1 Parent(s): cc064b3

Update app.py

Files changed (1)
  1. app.py +33 -71
app.py CHANGED
@@ -1,77 +1,39 @@
  import gradio as gr
- # Load model directly
- from transformers import AutoTokenizer, AutoModelForCausalLM
+ from transformers import AutoModelForCausalLM, AutoTokenizer
  import torch
 
- tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-1B")
- model = AutoModelForCausalLM.from_pretrained("unsloth/Llama-3.2-1B")
-
- # Move the model to the appropriate device (GPU if available, else CPU)
- device = "cuda" if torch.cuda.is_available() else "cpu"
- model.to(device)
-
-
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     # Prepare prompt with history
-     messages = [{"role": "system", "content": system_message}]
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     # Convert conversation into a single input string
-     prompt = f"{system_message}\n"
-     for turn in messages[1:]:
-         if turn["role"] == "user":
-             prompt += f"User: {turn['content']}\n"
-         elif turn["role"] == "assistant":
-             prompt += f"Assistant: {turn['content']}\n"
-     prompt += "Assistant:"
-
-     # Tokenize input
-     inputs = tokenizer(prompt, return_tensors="pt").to(device)
-
-     # Generate response
-     output = model.generate(
-         inputs["input_ids"],
-         max_length=inputs["input_ids"].shape[1] + max_tokens,
-         temperature=temperature,
-         top_p=top_p,
-         pad_token_id=tokenizer.eos_token_id,
-     )
-
-     # Decode response and extract the new assistant message
+ # Load your model and tokenizer from Hugging Face Hub (forcing CPU usage)
+ model_name = "PierreJousselin/lora_model"  # Replace with the name you used on Hugging Face
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu")  # Force model to load on CPU
+
+ # Ensure pad_token_id is set to eos_token_id to avoid errors
+ model.config.pad_token_id = model.config.eos_token_id
+
+ # Function for generating responses using the model
+ def generate_response(prompt):
+     # Tokenize input prompt
+     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
+
+     # Ensure the inputs are moved to the CPU
+     input_ids = inputs["input_ids"].to("cpu")
+     print(input_ids)
+     # Generate output (ensure it's on CPU)
+     output = model.generate(input_ids, max_length=150, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
+
+     # Decode and return response
      response = tokenizer.decode(output[0], skip_special_tokens=True)
-     response = response[len(prompt):].strip()  # Strip the input part from the response
-
-     yield response
-
-
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
+     print(output)
+     return response
+
+ # Create a Gradio interface with a "Generate" button
+ iface = gr.Interface(
+     fn=generate_response,  # Function to call for generating response
+     inputs=gr.Textbox(label="Input Prompt"),  # Input type (text box for prompt)
+     outputs=gr.Textbox(label="Generated Response"),  # Output type (text box for response)
+     live=False,  # Disable live update; only update when button is clicked
+     allow_flagging="never"  # Prevent flagging (optional, if you don't need it)
  )
 
- if __name__ == "__main__":
-     demo.launch()
+ # Launch the interface with a "Generate" button
+ iface.launch(share=True)  # You can set share=True if you want a public link
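
This commit replaces the streaming gr.ChatInterface app built on unsloth/Llama-3.2-1B with a single-turn gr.Interface app that runs PierreJousselin/lora_model on CPU. For quick verification, the new generation path can be exercised outside Gradio with a minimal sketch like the one below, assuming torch, transformers, and accelerate (needed for device_map) are installed and the model repo is accessible; the prompt string is only an example:

from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "PierreJousselin/lora_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu")
model.config.pad_token_id = model.config.eos_token_id

# Mirrors the calls made by generate_response() in the updated app.py
inputs = tokenizer("Hello!", return_tensors="pt", padding=True, truncation=True, max_length=128)
output = model.generate(
    inputs["input_ids"].to("cpu"),
    max_length=150,  # total length cap, prompt tokens included
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))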