""" ChatGLM model configuration """ import torch from collections import OrderedDict from typing import List, Mapping, Optional, Any from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging from transformers.onnx import OnnxConfigWithPast, PatchingSpec from transformers import PreTrainedTokenizer, TensorType, is_torch_available logger = logging.get_logger(__name__) class ChatGLMConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`~ChatGLMModel`]. It is used to instantiate an ChatGLM model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the ChatGLM-6B [THUDM/ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. Args: vocab_size (`int`, *optional*, defaults to 150528): Vocabulary size of the ChatGLM-6B model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`~ChatGLMModel`] or [`~TFChatGLMModel`]. hidden_size (`int`, *optional*, defaults to 4096): Dimension of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 28): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 32): Number of attention heads for each attention layer in the Transformer encoder. inner_hidden_size (`int`, *optional*, defaults to 16384): Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. max_sequence_length (`int`, *optional*, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). layernorm_epsilon (`float`, *optional*, defaults to 1e-5): The epsilon used by the layer normalization layers. use_cache (`bool`, *optional*, defaults to `True`): Whether the model should return the last key/values attentions (not used by all models). Example: ```python >>> from configuration_chatglm import ChatGLMConfig >>> from modeling_chatglm import ChatGLMModel >>> # Initializing a ChatGLM-6B THUDM/ChatGLM-6B style configuration >>> configuration = ChatGLMConfig() >>> # Initializing a model from the THUDM/ChatGLM-6B style configuration >>> model = ChatGLMModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config ``` """ model_type = "chatglm" def __init__( self, vocab_size=150528, hidden_size=4096, num_layers=28, num_attention_heads=32, layernorm_epsilon=1e-5, use_cache=False, bos_token_id=150004, eos_token_id=150005, mask_token_id=150000, gmask_token_id=150001, pad_token_id=0, max_sequence_length=2048, inner_hidden_size=16384, position_encoding_2d=True, quantization_bit=0, pre_seq_len=None, prefix_projection=False, **kwargs ): self.num_layers = num_layers self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_attention_heads = num_attention_heads self.max_sequence_length = max_sequence_length self.layernorm_epsilon = layernorm_epsilon self.inner_hidden_size = inner_hidden_size self.use_cache = use_cache self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id self.pad_token_id = pad_token_id self.mask_token_id = mask_token_id self.gmask_token_id = gmask_token_id self.position_encoding_2d = position_encoding_2d self.quantization_bit = quantization_bit self.pre_seq_len = pre_seq_len self.prefix_projection = prefix_projection super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs ) class ChatGLMOnnxConfig(OnnxConfigWithPast): r""" This class is the custom configuration of a ChatGLMModel needed in exporting model to ONNX. Currently this need to pre-fix several model struct in modeling_chatglm.py Also there is still a TODO list of current ChatGLMOnnxConfig: 1. add support for batch_size > 1 2. add support for use_past in modeling_chatglm.py and its attention_fn function,we need to change several view into torch tensor action since reshape param may get frozen into constant in onnx model. here is the code: ```python >>> def attention_fn( >>> self, >>> query_layer, >>> key_layer, >>> value_layer, >>> attention_mask, >>> hidden_size_per_partition, >>> layer_id, >>> layer_past=None, >>> scaling_attention_score=True, >>> use_cache=False, >>> ): >>> if layer_past is not None: >>> past_key, past_value = layer_past[0], layer_past[1] >>> key_layer = torch.cat((past_key, key_layer), dim=0) >>> value_layer = torch.cat((past_value, value_layer), dim=0) >>> >>> # seqlen, batch, num_attention_heads, hidden_size_per_attention_head >>> seq_len, b, nh, hidden_size = key_layer.shape >>> >>> if use_cache: >>> present = (key_layer, value_layer) >>> else: >>> present = None >>> >>> query_key_layer_scaling_coeff = float(layer_id + 1) >>> if scaling_attention_score: >>> query_layer = query_layer / (math.sqrt(hidden_size) * query_key_layer_scaling_coeff) >>> >>> # =================================== >>> # Raw attention scores. [b, np, s, s] >>> # =================================== >>> >>> # [b, np, sq, sk] >>> # # output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) >>> >>> # [sq, b, np, hn] -> [sq, b * np, hn] >>> # query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) >>> query_layer = query_layer.flatten(start_dim=1, end_dim=2) >>> >>> # [sk, b, np, hn] -> [sk, b * np, hn] >>> # key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) >>> key_layer = key_layer.flatten(start_dim=1, end_dim=2) >>> >>> matmul_result = torch.zeros( >>> 1, 1, 1, >>> dtype=query_layer.dtype, >>> device=query_layer.device, >>> ) >>> >>> matmul_result = torch.baddbmm( >>> matmul_result, >>> query_layer.transpose(0, 1), # [b * np, sq, hn] >>> key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] >>> beta=0.0, >>> alpha=1.0, >>> ) >>> >>> # [b * np, sq, sk] -> [b, np, sq, sk] >>> # attention_scores = matmul_result.view(*output_size) >>> attention_scores = matmul_result.unsqueeze(0) >>> >>> if self.scale_mask_softmax: >>> self.scale_mask_softmax.scale = query_key_layer_scaling_coeff >>> attention_probs = self.scale_mask_softmax(attention_scores, attention_mask.contiguous()) >>> else: >>> # if not (attention_mask == 0).all(): >>> # # if auto-regressive, skip >>> attention_scores.masked_fill_(attention_mask, -10000.0) >>> dtype = attention_scores.dtype >>> attention_scores = attention_scores.float() >>> attention_scores = attention_scores * query_key_layer_scaling_coeff >>> >>> attention_probs = F.softmax(attention_scores, dim=-1) >>> >>> attention_probs = attention_probs.type(dtype) >>> >>> # ========================= >>> # Context layer. [sq, b, hp] >>> # ========================= >>> >>> # value_layer -> context layer. >>> # [sk, b, np, hn] --> [b, np, sq, hn] >>> >>> # context layer shape: [b, np, sq, hn] >>> # output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) >>> >>> # change view [sk, b * np, hn] >>> # value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) >>> value_layer = value_layer.flatten(start_dim=1, end_dim=2) >>> >>> # change view [b * np, sq, sk] >>> # attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) >>> attention_probs = attention_probs.flatten(start_dim=0, end_dim=1) >>> >>> # matmul: [b * np, sq, hn] >>> context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) >>> >>> # change view [b, np, sq, hn] >>> # context_layer = context_layer.reshape(b, np, sq, hidden_size) >>> context_layer = context_layer.unsqueeze(0) >>> >>> # [b, np, sq, hn] --> [sq, b, np, hn] >>> context_layer = context_layer.permute(2, 0, 1, 3).contiguous() >>> >>> # [sq, b, np, hn] --> [sq, b, hp] >>> # new_context_layer_shape = context_layer.size()[:-2] + (hidden_size_per_partition,) >>> # context_layer = context_layer.view(*new_context_layer_shape) >>> context_layer = context_layer.flatten(start_dim=2) >>> >>> outputs = (context_layer, present, attention_probs) >>> >>> return outputs ''' mainly aviod using view with dynamic size after change the modeling_chatglm.py, you can simply use following code to export and test the onnx model ```python >>> from pathlib import Path >>> from transformers import AutoTokenizer, AutoModel >>> from transformers.onnx import export, validate_model_outputs >>> >>> # load model >>> tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True) >>> pt_model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True) >>> pt_model = pt_model.float() # only tested in CPU for now >>> pt_model.eval() >>> # define path for saving onnx model >>> onnx_path = Path(f"model/chatglm-6b.onnx") >>> onnx_path.parent.mkdir(exist_ok=True) >>> # convert model to onnx >>> onnx_config_chatglm = ChatGLMOnnxConfig(pt_model.config, task="causal-lm") >>> onnx_inputs, onnx_outputs = export(tokenizer, pt_model, >>> onnx_config_chatglm, onnx_config_chatglm.default_onnx_opset, >>> onnx_path) >>> # test onnx model >>> validate_model_outputs(onnx_config_chatglm, tokenizer, pt_model, onnx_path, onnx_outputs, atol=1e-4) ``` """ # TODO support dynamic batch size default_fixed_batch = 1 def __init__( self, config: PretrainedConfig, task: str = "default", patching_specs: List[PatchingSpec] = None, use_past: bool = False, ): super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past) @property def inputs(self) -> Mapping[str, Mapping[int, str]]: common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}}) if self.use_past: # TODO support use_past # self.fill_with_past_key_values_(common_inputs, direction="inputs") # common_inputs["attention_mask"] = \ # {0: "batch", 1: "past_sequence + sequence", 2: "past_sequence + sequence"} raise NotImplementedError('position_ids do not support past_key_values yet.') else: # remind the order common_inputs["position_ids"] = {0: "batch", 2: "sequence"} common_inputs["attention_mask"] = {0: "batch", 2: "sequence", 3: "sequence"} return common_inputs @property def num_layers(self) -> int: return self._config.n_layer @property def num_attention_heads(self) -> int: return self._config.n_head def get_masks(self, input_ids, device=None): """ reference from modeling_chatglm.get_masks """ batch_size, seq_length = input_ids.shape context_lengths = [seq.tolist().index(self._config.bos_token_id) for seq in input_ids] if device: attention_mask = torch.ones((batch_size, seq_length, seq_length), device=device) else: attention_mask = torch.ones((batch_size, seq_length, seq_length), device=input_ids.device) attention_mask.tril_() for i, context_length in enumerate(context_lengths): attention_mask[i, :, :context_length] = 1 attention_mask.unsqueeze_(1) attention_mask = (attention_mask < 0.5).bool() # print("attention_mask", attention_mask.shape) return attention_mask def get_position_ids(self, input_ids, mask_positions, device=None, use_gmasks=None): batch_size, seq_length = input_ids.shape if device is None: device = input_ids.device if use_gmasks is None: use_gmasks = [False] * batch_size context_lengths = [seq.tolist().index(self._config.bos_token_id) for seq in input_ids] if self._config.position_encoding_2d: position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) for i, context_length in enumerate(context_lengths): position_ids[i, context_length:] = mask_positions[i] block_position_ids = [torch.cat(( torch.zeros(context_length, dtype=torch.long, device=device), torch.arange(seq_length - context_length, dtype=torch.long, device=device) + 1 )) for context_length in context_lengths] block_position_ids = torch.stack(block_position_ids, dim=0) position_ids = torch.stack((position_ids, block_position_ids), dim=1) else: position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) for i, context_length in enumerate(context_lengths): if not use_gmasks[i]: position_ids[context_length:] = mask_positions[i] # print("position_ids", position_ids.shape) return position_ids def generate_dummy_inputs( self, tokenizer: PreTrainedTokenizer, batch_size: int = default_fixed_batch, seq_length: int = -1, is_pair: bool = False, framework: Optional[TensorType] = None, ) -> Mapping[str, Any]: common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs( tokenizer, batch_size=self.default_fixed_batch, seq_length=seq_length, is_pair=is_pair, framework=framework ) # check if the mode is using fixed batch size if batch_size != self.default_fixed_batch: logger.warning('batch size is not fixed, force change into fixed batch size: %d.' % self.default_fixed_batch) # We need to order the input in the way they appears in the forward() ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]}) # Need to add the past_keys if self.use_past: if not is_torch_available(): raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") else: # TODO support use_past # import torch # # batch, seqlen = common_inputs["input_ids"].shape # # Not using the same length for past_key_values # past_key_values_length = seqlen + 2 # past_shape = ( # batch, # self.num_attention_heads, # past_key_values_length, # self._config.hidden_size // self.num_attention_heads, # ) # ordered_inputs["past_key_values"] = [ # (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers) # ] raise NotImplementedError('position_ids do not support past_key_values yet.') # Need to add the attention_mask manually # 1. add attention_mask ordered_inputs["attention_mask"] = self.get_masks(common_inputs["input_ids"]) # 2. add position_ids MASK, gMASK = self._config.mask_token_id, self._config.gmask_token_id seqs = common_inputs["input_ids"].tolist() mask_positions, use_gmasks = [], [] for seq in seqs: mask_token = gMASK if gMASK in seq else MASK use_gmask = mask_token == gMASK mask_positions.append(seq.index(mask_token)) use_gmasks.append(use_gmask) ordered_inputs["position_ids"] = self.get_position_ids(common_inputs["input_ids"], mask_positions, use_gmasks=use_gmasks) if self.use_past: # mask_dtype = ordered_inputs["attention_mask"].dtype # ordered_inputs["attention_mask"] = torch.cat( # [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1 # ) raise NotImplementedError('position_ids do not support past_key_values yet.') return ordered_inputs @property def default_onnx_opset(self) -> int: return 13