import gradio as gr
from huggingface_hub import InferenceClient

# Initialize the Hugging Face Inference Client
client = InferenceClient(model="meta-llama/Meta-Llama-3.1-405B-FP8")


# Define the response generation function
def respond(message, history, system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}]

    # Add previous messages to the conversation
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    # Add the new user message
    messages.append({"role": "user", "content": message})

    response = ""

    # Generate the response using the model, streaming tokens as they arrive.
    # (The loop variable is renamed from `message` to `chunk` so it no longer
    # shadows the user-message parameter above.)
    for chunk in client.chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        # The final streamed chunk can carry an empty delta; skip it
        if token:
            response += token
        yield response


# Define the ChatGPT-like interface
with gr.Blocks(css=".gradio-container {max-width: 900px; margin: auto;}") as demo:
    # NOTE: the original snippet was cut off mid-line at `gr.Markdown("`.
    # Everything from the Markdown title onward is a plausible reconstruction
    # matching respond()'s signature; the heading text, input labels, and
    # default values below are assumptions, not from the original.
    gr.Markdown("# Chat with Llama 3.1")
    gr.ChatInterface(
        respond,
        additional_inputs=[
            gr.Textbox(value="You are a helpful assistant.", label="System message"),
            gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"),
            gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
            gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
        ],
    )

if __name__ == "__main__":
    demo.launch()
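
# --- Usage sketch (not part of the original snippet) ---
# The Llama 3.1 405B repo is gated, so the Inference API call generally needs
# a Hugging Face access token. One way to supply it, assuming the token is
# exported as the HF_TOKEN environment variable (the variable name and this
# wiring are illustrative assumptions):
#
#   import os
#   client = InferenceClient(
#       model="meta-llama/Meta-Llama-3.1-405B-FP8",
#       token=os.environ.get("HF_TOKEN"),
#   )
#
# Saved as app.py (filename assumed), the demo starts with `python app.py`
# and serves the chat UI on http://127.0.0.1:7860, Gradio's default port.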