---
library_name: transformers
tags:
- mergekit
- merge
- llama-3.1
- roleplay
- function calling
base_model:
- T145/ZEUS-8B-V2
license: llama3.1
model-index:
- name: ZEUS-8B-V2-abliterated
  results:
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: IFEval (0-Shot)
      type: wis-k/instruction-following-eval
      split: train
      args:
        num_few_shot: 0
    metrics:
    - type: inst_level_strict_acc and prompt_level_strict_acc
      value: 78.95
      name: averaged accuracy
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
      name: Open LLM Leaderboard
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: BBH (3-Shot)
      type: SaylorTwift/bbh
      split: test
      args:
        num_few_shot: 3
    metrics:
    - type: acc_norm
      value: 30.98
      name: normalized accuracy
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
      name: Open LLM Leaderboard
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: MATH Lvl 5 (4-Shot)
      type: lighteval/MATH-Hard
      split: test
      args:
        num_few_shot: 4
    metrics:
    - type: exact_match
      value: 20.62
      name: exact match
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
      name: Open LLM Leaderboard
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: GPQA (0-shot)
      type: Idavidrein/gpqa
      split: train
      args:
        num_few_shot: 0
    metrics:
    - type: acc_norm
      value: 8.39
      name: acc_norm
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
      name: Open LLM Leaderboard
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: MuSR (0-shot)
      type: TAUR-Lab/MuSR
      args:
        num_few_shot: 0
    metrics:
    - type: acc_norm
      value: 7.92
      name: acc_norm
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
      name: Open LLM Leaderboard
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: MMLU-PRO (5-shot)
      type: TIGER-Lab/MMLU-Pro
      config: main
      split: test
      args:
        num_few_shot: 5
    metrics:
    - type: acc
      value: 31.39
      name: accuracy
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
      name: Open LLM Leaderboard
---

# ZEUS 8B 🌩️ V2 - ABLITERATED

V2 abliterated using the following script:

```python
import gc
import random

import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_ID = "T145/ZEUS-8B-V2"

# More samples can help find the direction better.
NUM_PROMPT_SAMPLES = 32

# Used to skip the first and last layers for the modifications.
SKIP_BEGIN_LAYERS = 1
SKIP_END_LAYERS = 1

# The layer we will use for the refusal_dir calculation will be floor(LAYER_FRACTION_TO_USE * model.layers).
LAYER_FRACTION_TO_USE = 0.6

# Use a negative scale_factor to "induce" and a positive scale_factor of < 1 to "ablate" less.
SCALE_FACTOR = 1.0

# NOTE: a bare `torch.inference_mode()` call only constructs a context manager
# and never enters it, so it had no effect and has been dropped. Autograd is
# switched off for the whole script by `torch.set_grad_enabled(False)` below.
torch.set_default_device("cpu")
torch.set_grad_enabled(False)

# Load the model on the GPU in quantized type if we can.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16),
    low_cpu_mem_usage=True,
    device_map='auto',
)
model.requires_grad_(False)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

layer_idx = int(len(model.model.layers) * LAYER_FRACTION_TO_USE)
print("Layer index for refusal direction: " + str(layer_idx))


def _read_prompts(path):
    """Return the non-blank lines of *path*, stripped of surrounding whitespace.

    `readlines()` would keep the trailing newline on every prompt (and admit
    empty lines), which then ends up inside the rendered chat prompt.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]


def _render_prompt(instruction):
    """Render one user instruction as chat-template text ending in the generation prompt."""
    # `return_tensors` is not passed: it has no effect when tokenize=False.
    return tokenizer.apply_chat_template(
        conversation=[{"role": "user", "content": instruction}],
        add_generation_prompt=True,
        tokenize=False,
    )


harmful = _read_prompts("harmful.txt")
harmless = _read_prompts("harmless.txt")

harmful_instructions = random.sample(harmful, min(NUM_PROMPT_SAMPLES, len(harmful)))
harmless_instructions = random.sample(harmless, min(NUM_PROMPT_SAMPLES, len(harmless)))

# Despite the names, these are rendered prompt *strings*; tokenization happens in generate().
harmful_toks = [_render_prompt(insn) for insn in harmful_instructions]
harmless_toks = [_render_prompt(insn) for insn in harmless_instructions]

bar_generate = tqdm(total=len(harmful_instructions) + len(harmless_instructions), desc="Generating samples")


# Only return the final hidden state of the layer we care about, and use 'cpu' to save VRAM.
def generate(toks):
    """Run one prompt through the model and return a hidden state at its last position.

    Args:
        toks: A chat-template-rendered prompt string (as stored in
            `harmful_toks` / `harmless_toks`).

    Returns:
        The hidden state taken from entry `layer_idx` of the first generation
        step, sliced at the last sequence position and moved to the CPU.
    """
    # The chat template has already written the special tokens into the text,
    # so the tokenizer must not prepend them a second time (e.g. a double BOS).
    inputs = tokenizer(toks, return_tensors="pt", padding=True, add_special_tokens=False)
    inputs = inputs.to(model.device)
    output = model.generate(
        inputs['input_ids'],
        use_cache=False,
        max_new_tokens=1,  # a single step is enough: only the prompt's hidden states are read
        return_dict_in_generate=True,
        output_hidden_states=True,
        attention_mask=inputs["attention_mask"],
        pad_token_id=tokenizer.eos_token_id,
    )
    bar_generate.update(n=1)
    # NOTE(review): the script calls this with one prompt at a time, so no padding
    # is added and index -1 is the last prompt token; confirm the padding side
    # before ever batching several prompts here.
    return output.hidden_states[0][layer_idx][:, -1, :].to('cpu')  # Final hidden state = -1.
harmful_hidden = [generate(toks) for toks in harmful_toks]
harmless_hidden = [generate(toks) for toks in harmless_toks]

bar_generate.close()

# Average and normalize in float32: the hidden states come from a float16 model,
# and a direction that is only approximately unit-length is ablated imperfectly.
harmful_mean = torch.stack(harmful_hidden).float().mean(dim=0)
harmless_mean = torch.stack(harmless_hidden).float().mean(dim=0)

refusal_dir = harmful_mean - harmless_mean
refusal_dir = refusal_dir.squeeze() / refusal_dir.norm()

torch.save(refusal_dir, MODEL_ID.replace("/", "_") + "_refusal_dir.pt")

# Free memory
del model
gc.collect()
torch.cuda.empty_cache()

# Reload the model in CPU memory with bfloat16 data type
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map='cpu',
)
model.requires_grad_(False)

# Make sure it's on the 'cpu' device.
if refusal_dir.device != model.device:
    refusal_dir = refusal_dir.to(model.device)

# Get the language model component and check it's as expected.
lm_model = model.model
if not hasattr(lm_model, 'layers'):
    raise AttributeError("The model does not have the expected structure.")

# Check the ranges are valid. Explicit raises are used instead of `assert`
# so the checks still run under `python -O`.
num_layers = len(lm_model.layers)
if SKIP_BEGIN_LAYERS < 0:
    raise ValueError("SKIP_BEGIN_LAYERS must be >= 0.")
if SKIP_END_LAYERS < 0:
    raise ValueError("SKIP_END_LAYERS must be >= 0.")
if SKIP_BEGIN_LAYERS + SKIP_END_LAYERS >= num_layers:
    raise ValueError("SKIP_BEGIN_LAYERS + SKIP_END_LAYERS must be < num_layers.")

# Two tensors (o_proj and down_proj) are modified per selected layer.
bar_layers = tqdm(total=(num_layers - (SKIP_BEGIN_LAYERS + SKIP_END_LAYERS)) * 2, desc="Modifying tensors")


# NOTE: Use a negative scale_factor to "induce" and a positive scale_factor of < 1 to "ablate" less.
def modify_tensor(tensor_data, refusal_dir, scale_factor: float = 1.0):
    """Project the refusal direction out of a weight matrix.

    Computes ``W - scale_factor * (r rᵀ) W`` where ``r`` is `refusal_dir` and
    ``W`` is `tensor_data`. The arithmetic is done in float32 on a copy, so the
    input tensor is left untouched, and the result is cast to bfloat16.

    Args:
        tensor_data: 2-D weight whose first dimension matches `refusal_dir`.
        refusal_dir: 1-D direction to remove (normalized by the caller).
        scale_factor: Fraction of the direction to remove; must be <= 1.0.
            Negative values add the direction instead.

    Returns:
        A `torch.nn.Parameter` (bfloat16, no grad) holding the modified weight.

    Raises:
        ValueError: If `scale_factor` is greater than 1.0 (or NaN).
    """
    if not scale_factor <= 1.0:
        raise ValueError("Using a scale_factor of > 1 doesn't make sense...")

    tensor_float = tensor_data.to(torch.float32)
    refusal_dir_float = refusal_dir.to(torch.float32)
    # (r rᵀ) W == r (rᵀ W): a vector-matrix product plus one outer product,
    # instead of materializing the d×d projector and doing a full matmul.
    projection = torch.outer(refusal_dir_float, refusal_dir_float @ tensor_float)
    tensor_modified = (tensor_float - scale_factor * projection).to(torch.bfloat16)

    bar_layers.update(1)
    return torch.nn.Parameter(tensor_modified, requires_grad=False)


# Modify the 'self_attn.o_proj.weight' and 'mlp.down_proj.weight' in each chosen layer.
# NOTE: These tensor names are specific to "llama" and may need changing.
#       - See here for others: https://github.com/arcee-ai/mergekit/tree/main/mergekit/_data/architectures
# A separate loop variable is used so the global `layer_idx` read by generate() is not overwritten.
for target_idx in range(SKIP_BEGIN_LAYERS, num_layers - SKIP_END_LAYERS):
    layer = lm_model.layers[target_idx]
    layer.self_attn.o_proj.weight = modify_tensor(
        layer.self_attn.o_proj.weight.data, refusal_dir, SCALE_FACTOR
    )
    layer.mlp.down_proj.weight = modify_tensor(
        layer.mlp.down_proj.weight.data, refusal_dir, SCALE_FACTOR
    )

bar_layers.close()

print("Saving modified model (with original tokenizer)...")

FIXED_ID = f"{MODEL_ID}-abliterated"
model.save_pretrained(FIXED_ID)
tokenizer.save_pretrained(FIXED_ID)
```

According to the script, **layer 19** is the primary target for abliteration: the refusal direction is computed from the hidden state at that layer index and then projected out of the `o_proj` and `down_proj` weights of every layer except the first and last.

# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/T145__ZEUS-8B-V2-abliterated-details)!
Summarized results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/contents/viewer/default/train?q=T145%2FZEUS-8B-V2-abliterated&sort[column]=Average%20%E2%AC%86%EF%B8%8F&sort[direction]=desc)!

|      Metric       |Value (%)|
|-------------------|--------:|
|**Average**        |    29.71|
|IFEval (0-Shot)    |    78.95|
|BBH (3-Shot)       |    30.98|
|MATH Lvl 5 (4-Shot)|    20.62|
|GPQA (0-shot)      |     8.39|
|MuSR (0-shot)      |     7.92|
|MMLU-PRO (5-shot)  |    31.39|