# app_8B.py: Gradio chat demo for Magpie-Align/MagpieLM-8B-Chat-v0.1
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import spaces
from threading import Thread
from typing import Iterator
# Markdown header shown at the top of the demo page
header = """
# 🐦‍⬛ MagpieLMs: Open LLMs with Fully Transparent Alignment Recipes
💬 We've aligned Llama-3.1-8B and a 4B version (distilled by NVIDIA) using purely synthetic data generated by our [Magpie](https://arxiv.org/abs/2406.08464) method. Our open-source post-training recipe includes the SFT and DPO data plus all training configs and logs, so anyone can reproduce the alignment process for their own research. Note that our data contains no GPT-generated content and carries a much friendlier license for both commercial and academic use.
- **Magpie Collection**: [Magpie on Hugging Face](https://lnkd.in/g_pgX5Y2)
- **Magpie Paper**: [Read the research paper](https://arxiv.org/abs/2406.08464)
Contact: [Zhangchen Xu](https://zhangchenxu.com) and [Bill Yuchen Lin](https://yuchenlin.xyz).
---
"""
# Load model and tokenizer
model_name = "Magpie-Align/MagpieLM-8B-Chat-v0.1"
device = "cuda" # the device to load the model onto
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    ignore_mismatched_sizes=True,
)
model.to(device)
MAX_INPUT_TOKEN_LENGTH = 4096 # You may need to adjust this value
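# Note: on Hugging Face ZeroGPU Spaces, the @spaces.GPU decorator below requests a GPU
# only for the duration of each call; on a dedicated GPU Space it is effectively a no-op.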
@spaces.GPU
def respond(
    message: str,
    chat_history: list[tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    # Rebuild the full conversation: optional system prompt, prior turns, then the new message.
    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})

    # Apply the chat template and append the assistant header so generation starts a new reply.
    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        # Keep only the most recent tokens if the conversation grew too long.
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    # Run generation in a background thread and stream partial text back to the UI.
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    # Accumulate streamed chunks and yield the growing response for live display.
    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)
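
# Sketch (not executed here): because respond() is a generator, it can also be driven
# outside of Gradio for a quick smoke test, e.g.:
#
#     for partial in respond("Who are you?", [], "You are Magpie, a helpful AI assistant."):
#         pass
#     print(partial)  # final accumulated reply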
demo = gr.ChatInterface(
    respond,
    # Additional inputs are passed to respond() positionally after (message, chat_history),
    # so their order here must match the function signature above.
    additional_inputs=[
        gr.Textbox(value="You are Magpie, a helpful AI assistant. For simple queries, try to answer them directly; for complex questions, try to think step-by-step before providing an answer.", label="System message"),
        gr.Slider(minimum=128, maximum=2048, value=512, step=64, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.9,
            step=0.1,
            label="Top-p (nucleus sampling)",
        ),
        gr.Slider(minimum=1, maximum=100, value=50, step=1, label="Top-k"),
        gr.Slider(minimum=0.5, maximum=1.5, value=1.0, step=0.1, label="Repetition Penalty"),
    ],
    description=header,  # Markdown header shown above the chat box
    title="MagpieLM-8B Chat (v0.1)",
    theme=gr.themes.Soft(),
)
if __name__ == "__main__":
    demo.queue()
    demo.launch(share=True)
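
# Sketch for querying the running demo programmatically with gradio_client. The Space id
# below is a placeholder/assumption, and the "/chat" endpoint name plus the argument order
# (message followed by the additional inputs above) may vary across Gradio versions.
#
#     from gradio_client import Client
#     client = Client("Magpie-Align/MagpieLM-8B-Chat")  # placeholder Space id
#     reply = client.predict(
#         "Who are you?",                             # message
#         "You are Magpie, a helpful AI assistant.",  # system message
#         512, 0.7, 0.9, 50, 1.0,                     # max new tokens, temperature, top-p, top-k, repetition penalty
#         api_name="/chat",
#     )
#     print(reply)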