Ehsanedm committed on
Commit
9cceab9
·
verified ·
1 Parent(s): a153691

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -178
app.py CHANGED
@@ -1,188 +1,25 @@
1
- import json
2
- import subprocess
3
- import time
4
- from llama_cpp import Llama
5
- from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
6
- from llama_cpp_agent.providers import LlamaCppPythonProvider
7
- from llama_cpp_agent.chat_history import BasicChatHistory
8
- from llama_cpp_agent.chat_history.messages import Roles
9
- import gradio as gr
10
- from huggingface_hub import hf_hub_download
11
 
12
- llm = None
13
- llm_model = None
 
 
 
14
 
15
- # Download the new model
16
- hf_hub_download(
17
- repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
18
- filename="llama-3.2-1b-instruct-q4_k_m.gguf",
19
- local_dir="./models"
20
- )
21
-
22
- def get_messages_formatter_type(model_name):
23
- return MessagesFormatterType.LLAMA_3
24
-
25
- def respond(
26
- message,
27
- history: list[tuple[str, str]],
28
- model,
29
- system_message,
30
- max_tokens,
31
- temperature,
32
- top_p,
33
- top_k,
34
- repeat_penalty,
35
- ):
36
- global llm
37
- global llm_model
38
-
39
- chat_template = get_messages_formatter_type(model)
40
-
41
- if llm is None or llm_model != model:
42
- llm = Llama(
43
- model_path=f"models/{model}",
44
- n_gpu_layers=0, # Adjust based on your GPU
45
- n_batch=32398, # Adjust based on your RAM
46
- n_ctx=512, # Adjust based on your RAM and desired context length
47
- )
48
- llm_model = model
49
-
50
- provider = LlamaCppPythonProvider(llm)
51
-
52
- agent = LlamaCppAgent(
53
- provider,
54
- system_prompt=f"{system_message}",
55
- predefined_messages_formatter_type=chat_template,
56
- debug_output=True
57
- )
58
-
59
- settings = provider.get_provider_default_settings()
60
- settings.temperature = temperature
61
- settings.top_k = top_k
62
- settings.top_p = top_p
63
- settings.max_tokens = max_tokens
64
- settings.repeat_penalty = repeat_penalty
65
- settings.stream = True
66
-
67
- messages = BasicChatHistory()
68
-
69
- for msn in history:
70
- user = {
71
- 'role': Roles.user,
72
- 'content': msn[0]
73
- }
74
- assistant = {
75
- 'role': Roles.assistant,
76
- 'content': msn[1]
77
- }
78
- messages.add_message(user)
79
- messages.add_message(assistant)
80
-
81
- start_time = time.time()
82
- token_count = 0
83
-
84
- stream = agent.get_chat_response(
85
- message,
86
- llm_sampling_settings=settings,
87
- chat_history=messages,
88
- returns_streaming_generator=True,
89
- print_output=False
90
- )
91
-
92
- outputs = ""
93
- for output in stream:
94
- outputs += output
95
- token_count += len(output.split())
96
- yield outputs
97
-
98
- end_time = time.time()
99
- latency = end_time - start_time
100
- speed = token_count / (end_time - start_time)
101
- print(f"Latency: {latency} seconds")
102
- print(f"Speed: {speed} tokens/second")
103
-
104
- description = """<p><center>
105
- <a href="https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF" target="_blank">[Meta Llama 3.2 (1B)]</a>
106
- Meta Llama 3.2 (1B) is a multilingual large language model (LLM) optimized for conversational dialogue use cases, including agentic retrieval and summarization tasks. It outperforms many open-source and closed chat models on industry benchmarks, and is intended for commercial and research use in multiple languages.
107
- </center></p>
108
- """
109
-
110
- demo = gr.ChatInterface(
111
- respond,
112
- additional_inputs=[
113
- gr.Dropdown([
114
- "llama-3.2-1b-instruct-q4_k_m.gguf"
115
- ],
116
- value="llama-3.2-1b-instruct-q4_k_m.gguf",
117
- label="Model"
118
- ),
119
- gr.TextArea(value="""You are Meta Llama 3.2 (1B), an advanced AI assistant created by Meta. Your capabilities include:
120
  1. Complex reasoning and problem-solving
121
  2. Multilingual understanding and generation
122
  3. Creative and analytical writing
123
  4. Code understanding and generation
124
  5. Task decomposition and step-by-step guidance
125
  6. Summarization and information extraction
 
126
  Always strive for accuracy, clarity, and helpfulness in your responses. If you're unsure about something, express your uncertainty. Use the following format for your responses:
127
- """, label="System message"),
128
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"),
129
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
130
- gr.Slider(
131
- minimum=0.1,
132
- maximum=2.0,
133
- value=0.9,
134
- step=0.05,
135
- label="Top-p",
136
- ),
137
- gr.Slider(
138
- minimum=0,
139
- maximum=100,
140
- value=1,
141
- step=1,
142
- label="Top-k",
143
- ),
144
- gr.Slider(
145
- minimum=0.0,
146
- maximum=2.0,
147
- value=1.1,
148
- step=0.1,
149
- label="Repetition penalty",
150
- ),
151
- ],
152
- theme=gr.themes.Soft(primary_hue="violet", secondary_hue="violet", neutral_hue="gray",font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"]).set(
153
- body_background_fill_dark="#16141c",
154
- block_background_fill_dark="#16141c",
155
- block_border_width="1px",
156
- block_title_background_fill_dark="#1e1c26",
157
- input_background_fill_dark="#292733",
158
- button_secondary_background_fill_dark="#24212b",
159
- border_color_accent_dark="#343140",
160
- border_color_primary_dark="#343140",
161
- background_fill_secondary_dark="#16141c",
162
- color_accent_soft_dark="transparent",
163
- code_background_fill_dark="#292733",
164
- ),
165
- title="Meta Llama 3.2 (1B)",
166
- description=description,
167
- chatbot=gr.Chatbot(
168
- scale=1,
169
- likeable=True,
170
- show_copy_button=True
171
- ),
172
- examples=[
173
- ["Hello! Can you introduce yourself?"],
174
- ["What's the capital of France?"],
175
- ["Can you explain the concept of photosynthesis?"],
176
- ["Write a short story about a robot learning to paint."],
177
- ["Explain the difference between machine learning and deep learning."],
178
- ["Summarize the key points of climate change and its global impact."],
179
- ["Explain quantum computing to a 10-year-old."],
180
- ["Design a step-by-step meal plan for someone trying to lose weight and build muscle."]
181
- ],
182
- cache_examples=False,
183
- autofocus=False,
184
- concurrency_limit=None
185
  )
186
-
187
- if __name__ == "__main__":
188
- demo.launch()
 
1
+ from gradio_client import Client
 
 
 
 
 
 
 
 
 
2
 
3
+ client = Client("KingNish/Llama-3.2-1b-CPU")
4
+ result = client.predict(
5
+ message="Hello!!",
6
+ request="llama-3.2-1b-instruct-q4_k_m.gguf",
7
+ param_3="You are Meta Llama 3.2 (1B), an advanced AI assistant created by Meta. Your capabilities include:
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  1. Complex reasoning and problem-solving
10
  2. Multilingual understanding and generation
11
  3. Creative and analytical writing
12
  4. Code understanding and generation
13
  5. Task decomposition and step-by-step guidance
14
  6. Summarization and information extraction
15
+
16
  Always strive for accuracy, clarity, and helpfulness in your responses. If you're unsure about something, express your uncertainty. Use the following format for your responses:
17
+ ",
18
+ param_4=512,
19
+ param_5=0.7,
20
+ param_6=0.9,
21
+ param_7=1,
22
+ param_8=1.1,
23
+ api_name="/chat"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  )
25
+ print(result)