Spaces:

MadsGalsgaard
/

Project-W

Sleeping

App Files Files Community

MadsGalsgaard committed on Sep 4, 2024

Commit

7daf88a

verified ·

1 Parent(s): 9ae8177

Update app.py

Browse files

Files changed (1) hide show

app.py +182 -96

app.py CHANGED Viewed

@@ -441,80 +441,159 @@
 ###########new clientkey
 # import gradio as gr
-# from huggingface_hub import InferenceClient
-# # Hugging Face Inference Client setup
-# client = InferenceClient(
-#     model="meta-llama/Meta-Llama-3.1-8B-Instruct" # Replace with your actual token
-# )
-# # Function to interact with the Hugging Face model
-# def chat_with_model(message, history):
-#     # Prepare conversation history for the model
-#     conversation = [{"role": "system", "content": "You are a helpful assistant."}]
-#     for past_message, past_response in history:
-#         conversation.append({"role": "user", "content": past_message})
-#         conversation.append({"role": "assistant", "content": past_response})
-#     # Add new user message to the conversation
-#     conversation.append({"role": "user", "content": message})
-#     # Generate response using the Inference API
-#     responses = client.chat_completion(
-#         messages=conversation,
-#         max_tokens=500,
-#         stream=True
 #     )
-#     # Capture streamed response
-#     response_text = ""
-#     for response in responses:
-#         delta_content = response.choices[0].delta.content
-#         response_text += delta_content
-#     history.append((message, response_text))
-#     return history, history  # Update both chatbot history and visible chat
-# # Create Gradio interface
-# with gr.Blocks() as demo:
-#     chatbot = gr.Chatbot(height=600)
-#     msg_input = gr.Textbox(show_label=False, placeholder="Type your message...")
-#     with gr.Row():
-#         clear_btn = gr.Button("Clear Chat")
-#     # Setting up interaction between user input and the chatbot
-#     msg_input.submit(chat_with_model, [msg_input, chatbot], [chatbot, chatbot])
-#     clear_btn.click(lambda: None, None, chatbot, queue=False)
-#     gr.Markdown("## Llama 3.1 Chatbot")
-# # Launch Gradio demo
 # if __name__ == "__main__":
 #     demo.launch()
-import os
-import time
-import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import gradio as gr
 from threading import Thread
 MODEL = "THUDM/LongWriter-llama3.1-8b"
 TITLE = "<h1><center>AreaX LLC-llama3.1-8b</center></h1>"
 PLACEHOLDER = """
 <center>
 <p>Hi! I'm AreaX AI Agent, capable of generating 10,000+ words. How can I assist you today?</p>
 </center>
 """
 CSS = """
 .duplicate-button {
     margin: auto !important;
@@ -527,54 +606,61 @@ h3 {
 }
 """
 device = "cuda" if torch.cuda.is_available() else "cpu"
 tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto")
-model = model.eval()
-@spaces.GPU()
 def stream_chat(
     message: str,
     history: list,
     system_prompt: str,
     temperature: float = 0.5,
-    max_new_tokens: int = 32768,
     top_p: float = 1.0,
     top_k: int = 50,
 ):
-    print(f'message: {message}')
-    print(f'history: {history}')
-    full_prompt = f"<<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
-    for prompt, answer in history:
-        full_prompt += f"[INST]{prompt}[/INST]{answer}"
-    full_prompt += f"[INST]{message}[/INST]"
-    inputs = tokenizer(full_prompt, truncation=False, return_tensors="pt").to(device)
-    context_length = inputs.input_ids.shape[-1]
-    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
-    generate_kwargs = dict(
-        inputs=inputs.input_ids,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        top_p=top_p,
-        top_k=top_k,
-        temperature=temperature,
-        num_beams=1,
-        streamer=streamer,
-    )
-    thread = Thread(target=model.generate, kwargs=generate_kwargs)
-    thread.start()
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        yield buffer
 chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
 with gr.Blocks(css=CSS, theme="soft") as demo:
@@ -601,9 +687,9 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
             ),
             gr.Slider(
                 minimum=1024,
-                maximum=32768,
                 step=1024,
-                value=32768,
                 label="Max new tokens",
                 render=False,
             ),
@@ -624,12 +710,12 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
                 render=False,
             ),
         ],
-        examples=[
-            ["Write a 5000-word comprehensive guide on machine learning for beginners."],
-            ["Create a detailed 3000-word business plan for a sustainable energy startup."],
-            ["Compose a 2000-word short story set in a futuristic underwater city."],
-            ["Develop a 4000-word research proposal on the potential effects of climate change on global food security."],
-        ],
         cache_examples=False,
     )

 ###########new clientkey
+# import os
+# import time
+# import spaces
+# import torch
+# from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 # import gradio as gr
+# from threading import Thread
+# MODEL = "THUDM/LongWriter-llama3.1-8b"
+# TITLE = "<h1><center>AreaX LLC-llama3.1-8b</center></h1>"
+# PLACEHOLDER = """
+# <center>
+# <p>Hi! I'm AreaX AI Agent, capable of generating 10,000+ words. How can I assist you today?</p>
+# </center>
+# """
+# CSS = """
+# .duplicate-button {
+#     margin: auto !important;
+#     color: white !important;
+#     background: black !important;
+#     border-radius: 100vh !important;
+# }
+# h3 {
+#     text-align: center;
+# }
+# """
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+# tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
+# model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto")
+# model = model.eval()
+# @spaces.GPU()
+# def stream_chat(
+#     message: str,
+#     history: list,
+#     system_prompt: str,
+#     temperature: float = 0.5,
+#     max_new_tokens: int = 32768,
+#     top_p: float = 1.0,
+#     top_k: int = 50,
+# ):
+#     print(f'message: {message}')
+#     print(f'history: {history}')
+#     full_prompt = f"<<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
+#     for prompt, answer in history:
+#         full_prompt += f"[INST]{prompt}[/INST]{answer}"
+#     full_prompt += f"[INST]{message}[/INST]"
+#     inputs = tokenizer(full_prompt, truncation=False, return_tensors="pt").to(device)
+#     context_length = inputs.input_ids.shape[-1]
+#     streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
+#     generate_kwargs = dict(
+#         inputs=inputs.input_ids,
+#         max_new_tokens=max_new_tokens,
+#         do_sample=True,
+#         top_p=top_p,
+#         top_k=top_k,
+#         temperature=temperature,
+#         num_beams=1,
+#         streamer=streamer,
 #     )
+#     thread = Thread(target=model.generate, kwargs=generate_kwargs)
+#     thread.start()
+#     buffer = ""
+#     for new_text in streamer:
+#         buffer += new_text
+#         yield buffer
+# chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
+# with gr.Blocks(css=CSS, theme="soft") as demo:
+#     gr.HTML(TITLE)
+#     gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
+#     gr.ChatInterface(
+#         fn=stream_chat,
+#         chatbot=chatbot,
+#         fill_height=True,
+#         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
+#         additional_inputs=[
+#             gr.Textbox(
+#                 value="You are a helpful assistant capable of generating long-form content.",
+#                 label="System Prompt",
+#                 render=False,
+#             ),
+#             gr.Slider(
+#                 minimum=0,
+#                 maximum=1,
+#                 step=0.1,
+#                 value=0.5,
+#                 label="Temperature",
+#                 render=False,
+#             ),
+#             gr.Slider(
+#                 minimum=1024,
+#                 maximum=32768,
+#                 step=1024,
+#                 value=32768,
+#                 label="Max new tokens",
+#                 render=False,
+#             ),
+#             gr.Slider(
+#                 minimum=0.0,
+#                 maximum=1.0,
+#                 step=0.1,
+#                 value=1.0,
+#                 label="Top p",
+#                 render=False,
+#             ),
+#             gr.Slider(
+#                 minimum=1,
+#                 maximum=100,
+#                 step=1,
+#                 value=50,
+#                 label="Top k",
+#                 render=False,
+#             ),
+#         ],
+#         examples=[
+#             ["Write a 5000-word comprehensive guide on machine learning for beginners."],
+#             ["Create a detailed 3000-word business plan for a sustainable energy startup."],
+#             ["Compose a 2000-word short story set in a futuristic underwater city."],
+#             ["Develop a 4000-word research proposal on the potential effects of climate change on global food security."],
+#         ],
+#         cache_examples=False,
+#     )
 # if __name__ == "__main__":
 #     demo.launch()
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import gradio as gr
 from threading import Thread
+# Model and constants
 MODEL = "THUDM/LongWriter-llama3.1-8b"
 TITLE = "<h1><center>AreaX LLC-llama3.1-8b</center></h1>"
 PLACEHOLDER = """
 <center>
 <p>Hi! I'm AreaX AI Agent, capable of generating 10,000+ words. How can I assist you today?</p>
 </center>
 """
 CSS = """
 .duplicate-button {
     margin: auto !important;
 }
 """
+# Check device
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load model and tokenizer
 tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto").eval()
 def stream_chat(
     message: str,
     history: list,
     system_prompt: str,
     temperature: float = 0.5,
     max_new_tokens: int = 4096,  # Lowered max tokens for efficiency
     top_p: float = 1.0,
     top_k: int = 50,
 ):
     """Stream a model reply for ``message``, yielding the text generated so far.

     The prompt is built as a ``<<SYS>>`` block holding ``system_prompt``,
     followed by one ``[INST]...[/INST]answer`` segment per past turn and a
     final ``[INST]message[/INST]`` segment.

     Args:
         message: The new user message.
         history: Past turns; each item is unpacked as a (user, assistant) pair.
         system_prompt: Text placed inside the ``<<SYS>>`` block.
         temperature: Sampling temperature. Values <= 0 switch to greedy
             decoding, because sampling needs a strictly positive temperature.
         max_new_tokens: Upper bound on the number of generated tokens.
         top_p: Nucleus-sampling threshold (only used when sampling).
         top_k: Top-k sampling cutoff (only used when sampling).

     Yields:
         The accumulated response text after each streamed chunk, or a single
         ``"An error occurred: ..."`` string if anything fails.
     """
     # Prompt budget in tokens (same limit the tokenizer call used before).
     max_prompt_tokens = 2048
     try:
         past_turns = "".join(
             f"[INST]{prompt}[/INST]{answer}" for prompt, answer in history
         )
         full_prompt = (
             f"<<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
             f"{past_turns}[INST]{message}[/INST]"
         )
         # Tokenize without truncation and keep only the LAST tokens ourselves.
         # The tokenizer's default truncation trims the end of the prompt, which
         # would drop the newest user message and its closing [/INST].
         # NOTE(review): on very long conversations this drops the oldest
         # context first, including the system prompt - confirm that is the
         # preferred trade-off.
         inputs = tokenizer(full_prompt, truncation=False, return_tensors="pt").to(device)
         input_ids = inputs.input_ids[:, -max_prompt_tokens:]
         attention_mask = inputs.attention_mask[:, -max_prompt_tokens:]
         # Setup TextIteratorStreamer for streaming response
         streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
         # Generation parameters; sampling knobs are only passed when sampling,
         # since a temperature of 0 is rejected when do_sample=True.
         sampling = temperature > 0
         generate_kwargs = dict(
             inputs=input_ids,
             attention_mask=attention_mask,
             max_new_tokens=int(max_new_tokens),
             do_sample=sampling,
             num_beams=1,
             streamer=streamer,
         )
         if sampling:
             generate_kwargs.update(
                 top_p=top_p,
                 top_k=int(top_k),
                 temperature=temperature,
             )
         # Errors raised inside the worker thread never reach this generator
         # on their own, so capture them and close the stream right away
         # instead of waiting for the streamer timeout.
         worker_errors = []

         def _run_generation():
             try:
                 model.generate(**generate_kwargs)
             except Exception as exc:
                 worker_errors.append(exc)
                 streamer.end()

         # Generate text in a separate thread to avoid blocking
         thread = Thread(target=_run_generation)
         thread.start()
         # Stream response
         buffer = ""
         for new_text in streamer:
             buffer += new_text
             yield buffer
         thread.join()
         if worker_errors:
             # Re-raise here so the handler below reports the real cause.
             raise worker_errors[0]
     except Exception as e:
         # UI boundary: show the failure in the chat instead of crashing it.
         yield f"An error occurred: {str(e)}"
+# Gradio setup
 chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
 with gr.Blocks(css=CSS, theme="soft") as demo:
             ),
             gr.Slider(
                 minimum=1024,
+                maximum=4096,  # Reduced to a more manageable value
                 step=1024,
+                value=4096,
                 label="Max new tokens",
                 render=False,
             ),
                 render=False,
             ),
         ],
+        # examples=[
+        #     ["Write a 5000-word comprehensive guide on machine learning for beginners."],
+        #     ["Create a detailed 3000-word business plan for a sustainable energy startup."],
+        #     ["Compose a 2000-word short story set in a futuristic underwater city."],
+        #     ["Develop a 4000-word research proposal on the potential effects of climate change on global food security."],
+        # ],
         cache_examples=False,
     )