From edb41b343375369ba0b943100177d67a12d987a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Thu, 29 Aug 2024 02:41:56 +0000
Subject: [PATCH 001/110] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20T5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 MindIE/MindIE-Torch/built-in/T5/.keep | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 MindIE/MindIE-Torch/built-in/T5/.keep

diff --git a/MindIE/MindIE-Torch/built-in/T5/.keep b/MindIE/MindIE-Torch/built-in/T5/.keep
new file mode 100644
index 0000000000..e69de29bb2
-- 
Gitee

From 11d9724cff613b632f2bd41e67db0f624f1fd26d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Thu, 29 Aug 2024 02:44:45 +0000
Subject: [PATCH 002/110] add MindIE/MindIE-Torch/built-in/T5/export_t5.py.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 甄文奇
---
 MindIE/MindIE-Torch/built-in/T5/export_t5.py | 181 +++++++++++++++++++
 1 file changed, 181 insertions(+)
 create mode 100644 MindIE/MindIE-Torch/built-in/T5/export_t5.py

diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5.py b/MindIE/MindIE-Torch/built-in/T5/export_t5.py
new file mode 100644
index 0000000000..2b421aff68
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/T5/export_t5.py
@@ -0,0 +1,181 @@
+
+import torch
+import torch_npu
+import argparse
+import os
+import mindietorch
+from transformers import T5ForConditionalGeneration
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="./models",
+        help="save dir"
+    )
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default="./DeepFloyd--t5-v1_1-xxl",
+        help="encoder model path"
+    )
+    parser.add_argument(
+        "--max_batchsize",
+        type=int,
+        default=1,
+        help="max batchsize when running"
+    )
+
+    parser.add_argument(
+        "--max_input_seq_len",
+        type=int,
+        default=256,
+        help="max input_sequence length when running"
+    )
+
+
+    parser.add_argument(
+        "--device_id",
+        type=int,
+        default=0,
+        help="npu device id"
+    )
+    return parser.parse_args()
+
+
+class TextEncoderExport(torch.nn.Module):
+    def __init__(self, textencoder_model):
+        super(TextEncoderExport, self).__init__()
+        self.textencoder_model = textencoder_model
+
+    def forward(self, input_ids):
+        return self.textencoder_model(input_ids=input_ids)
+
+class TextDecoderExport(torch.nn.Module):
+    def __init__(self, textdecoder_model):
+        super(TextDecoderExport, self).__init__()
+        self.textdecoder_model = textdecoder_model
+
+    def forward(self,
+                input_ids,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                past_key_values,
+                past_cross_key_values):
+        return self.textdecoder_model(input_ids=input_ids,
+                                      encoder_hidden_states=encoder_hidden_states,
+                                      encoder_attention_mask=encoder_attention_mask,
+                                      past_key_values=past_key_values,
+                                      past_cross_key_values=past_cross_key_values,
+                                      return_dict=True)
+
+def export_textencoder(args, model, save_dir, batch_size):
+    encoder_path = os.path.join(save_dir, "encoder")
+    if not os.path.exists(encoder_path):
+        os.makedirs(encoder_path, mode=0o750)
+    traced_path = os.path.join(encoder_path, "encoder.pt")
+    compiled_path = os.path.join(encoder_path, "encoder_compiled.pt")
+    if not os.path.exists(traced_path):
+        text_encoder = model.encoder
+        dummy_input = (
+            torch.ones([1, 128], dtype=torch.int64).npu()
+        )
+        encoder = TextEncoderExport(text_encoder)
+        encoder.eval()
+        torch.jit.trace(encoder, dummy_input, strict=False).save(traced_path)
+    if not os.path.exists(compiled_path):
+        model = torch.jit.load(traced_path).eval()
+
+        inputs0 = []
+        # inputs1 = []
+        inputs0.append(mindietorch.Input(min_shape=(1, 1), max_shape=(args.max_batchsize, args.max_input_seq_len), dtype=torch.int64))
+        print("compiling encoder")
+        compiled_model = mindietorch.compile(
+            model,
+            inputs=inputs0,
+            allow_tensor_replace_int=True,
+            require_full_compilation=False,
+            truncate_long_and_double=True,
+            precision_policy=mindietorch.PrecisionPolicy.FP16,
+            soc_version="Ascend910B4",
+            optimization_level=0
+        )
+        compiled_model.save(compiled_path)
+
+def export_textdecoder(args, model, save_dir, batch_size):
+    decoder_path = os.path.join(save_dir, "decoder")
+    if not os.path.exists(decoder_path):
+        os.makedirs(decoder_path, mode=0o750)
+    traced_path = os.path.join(decoder_path, "decoder.pt")
+    compiled_path = os.path.join(decoder_path, "decoder_compiled.pt")
+    model_path = args.model_path
+    max_length = 120
+    if not os.path.exists(traced_path):
+        text_decoder = model.decoder
+        dummy_input = (
+            torch.ones([1, 1], dtype=torch.int64).npu(),
+            torch.randn(1, 16, 512).to(torch.float16).npu(),
+            torch.ones(1, 16).npu(),
+            torch.randn(6, 2, 1, 8, 1, 64).to(torch.float16).npu(),
+            torch.randn(6, 2, 1, 8, 24, 64).to(torch.float16).npu()
+        )
+        decoder = TextDecoderExport(text_decoder).npu()
+        decoder.eval()
+        torch.jit.trace(decoder, dummy_input, strict=False).save(traced_path)
+    if not os.path.exists(compiled_path):
+        model = torch.jit.load(traced_path).eval()
+        print("compiling decoder")
+        compiled_model = mindietorch.compile(
+            model,
+            inputs=[mindietorch.Input(min_shape=(1, 1),
+                                      max_shape=(args.max_batchsize, 1),
+                                      dtype=mindietorch.dtype.INT64),
+
+                    mindietorch.Input(min_shape=(1, 1, 512),
+                                      max_shape=(args.max_batchsize, args.max_input_seq_len, 512),
+                                      dtype=mindietorch.dtype.FLOAT16),
+
+                    mindietorch.Input(min_shape=(1, 1),
+                                      max_shape=(args.max_batchsize, args.max_input_seq_len),
+                                      dtype=mindietorch.dtype.INT64),
+                    mindietorch.Input(min_shape=(6, 2, 1, 8, 0, 64),
+                                      max_shape=(6, 2, args.max_batchsize, 8, args.max_input_seq_len, 64),
+                                      dtype=mindietorch.dtype.FLOAT16),
+
+                    mindietorch.Input(min_shape=(6, 2, 1, 8, 1, 64),
+                                      max_shape=(6, 2, args.max_batchsize, 8, args.max_input_seq_len, 64),
+                                      dtype=mindietorch.dtype.FLOAT16)],
+            allow_tensor_replace_int=True,
+            require_full_compilation=False,
+            truncate_long_and_double=True,
+            precision_policy=mindietorch.PrecisionPolicy.FP16,
+            soc_version="Ascend910B4",
+            optimization_level=0
+        )
+        compiled_model.save(compiled_path)
+
+def main():
+    args = parse_arguments()
+    device_id = args.device_id
+    save_dir = args.output_dir
+    torch.npu.set_device(device_id)
+    batch_size = 1
+    model = T5ForConditionalGeneration.from_pretrained(args.model_path, torch_dtype=torch.float).npu()
+    encoder_path = os.path.join(save_dir, "encoder")
+    compiled_path = os.path.join(encoder_path, "encoder_compiled.pt")
+    if not os.path.exists(compiled_path):
+        export_textencoder(args, model, save_dir, batch_size)
+        print("export encoder_model done!")
+
+    decoder_path = os.path.join(save_dir, "decoder")
+    compiled_path = os.path.join(decoder_path, "decoder_compiled.pt")
+    if not os.path.exists(compiled_path):
+        export_textdecoder(args, model, save_dir, batch_size)
+        print("export decoder_model done!")
+
+
+
+
+if __name__ == "__main__":
+    main()
-- 
Gitee

From 978760fe5e3cc2deaaa3641b4000c5c346385d67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Thu, 29 Aug 2024 03:28:31 +0000
Subject: [PATCH 003/110] transformers patch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/modeling_outputs.patch | 0 .../MindIE-Torch/built-in/modeling_t5.patch | 819 ++++++++++++++++++ .../built-in/modeling_utils.patch | 0 MindIE/MindIE-Torch/built-in/utils.patch | 0 4 files changed, 819 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/modeling_outputs.patch create mode 100644 MindIE/MindIE-Torch/built-in/modeling_t5.patch create mode 100644 MindIE/MindIE-Torch/built-in/modeling_utils.patch create mode 100644 MindIE/MindIE-Torch/built-in/utils.patch diff --git a/MindIE/MindIE-Torch/built-in/modeling_outputs.patch b/MindIE/MindIE-Torch/built-in/modeling_outputs.patch new file mode 100644 index 0000000000..e69de29bb2 diff --git a/MindIE/MindIE-Torch/built-in/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/modeling_t5.patch new file mode 100644 index 0000000000..4a376cf5eb --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/modeling_t5.patch @@ -0,0 +1,819 @@ +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py 2024-08-29 11:11:23.852000000 +0800 ++++ modeling_t5.py 2024-08-29 11:19:34.572000000 +0800 +@@ -23,8 +23,6 @@ from typing import List, Optional, Tuple + import torch + from torch import nn + from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +-import torch_npu +-import mindietorch + + from ...activations import ACT2FN + from ...modeling_outputs import ( +@@ -246,7 +244,7 @@ class T5LayerNorm(nn.Module): + + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) +- # print("self.weight.dtype=",self.weight.dtype) ++ + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) +@@ -451,7 +449,6 @@ class T5Attention(nn.Module): + key_value_states=None, + position_bias=None, + past_key_value=None, +- past_cross_key_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, +@@ -468,8 +465,7 @@ class T5Attention(nn.Module): + real_seq_length = seq_length + + if past_key_value is not None: +- if past_key_value.shape[0] != 2: +- # if len(past_key_value) != 2: ++ if len(past_key_value) != 2: + raise ValueError( + f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" + ) +@@ -497,7 +493,6 @@ class T5Attention(nn.Module): + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: +- past_key_value = shape(past_key_value) + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) +@@ -571,261 +566,7 @@ class T5Attention(nn.Module): + + present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- # print("output_attentions=",output_attentions) +- if output_attentions: +- outputs = outputs + (attn_weights,) +- return outputs +- +- +-class T5SelfAttention(T5Attention): +- def __init__(self, config: T5Config, has_relative_attention_bias=False): +- super().__init__(config, has_relative_attention_bias) +- +- def forward( +- self, +- hidden_states, +- mask=None, +- position_bias=None, +- past_key_value=None, +- layer_head_mask=None, +- use_cache=False, +- output_attentions=False, +- ): +- """ +- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). +- """ +- # Input is (batch_size, seq_length, dim) +- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) +- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) +- batch_size, seq_length = hidden_states.shape[:2] +- +- real_seq_length = seq_length +- +- if past_key_value is not None: +- if past_key_value.shape[0] != 2: +- # if len(past_key_value) != 2: +- raise ValueError( +- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" +- ) +- real_seq_length += past_key_value[0].shape[2] +- # print("key_value_states=",real_seq_length) +- key_length = real_seq_length +- +- def shape(states): +- """projection""" +- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) +- +- def unshape(states): +- """reshape""" +- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) +- +- def project(hidden_states, proj_layer, past_key_value): +- """projects hidden states correctly to key/query states""" +- if past_key_value is None: +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(hidden_states)) +- +- if past_key_value is not None: +- hidden_states = shape(proj_layer(hidden_states)) +- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) +- return hidden_states +- +- # get query states +- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) +- +- # get key/value states +- key_states = project( +- hidden_states, self.k, past_key_value[0] if past_key_value is not None else None +- ) +- value_states = project( +- hidden_states, self.v, past_key_value[1] if past_key_value is not None else None +- ) +- # print("key_states=",hidden_states.dtype,key_states.dtype) +- # compute scores +- scores = torch.matmul( +- query_states, key_states.transpose(3, 2) +- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 +- # print("scores=",scores.dtype) +- if position_bias is None: +- if not self.has_relative_attention_bias: +- position_bias = torch.zeros( +- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype +- ) +- if self.gradient_checkpointing and self.training: +- position_bias.requires_grad = True +- else: +- position_bias 
= self.compute_bias(real_seq_length, key_length, device=scores.device) +- +- # if key and values are already calculated +- # we want only the last query position bias +- if past_key_value is not None: +- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] +- +- if mask is not None: +- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) +- +- if self.pruned_heads: +- mask = torch.ones(position_bias.shape[1]) +- mask[list(self.pruned_heads)] = 0 +- position_bias_masked = position_bias[:, mask.bool()] +- else: +- position_bias_masked = position_bias +- +- scores += position_bias_masked +- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( +- scores +- ) # (batch_size, n_heads, seq_length, key_length) +- attn_weights = nn.functional.dropout( +- attn_weights, p=self.dropout, training=self.training +- ) # (batch_size, n_heads, seq_length, key_length) +- +- # Mask heads if we want to +- if layer_head_mask is not None: +- attn_weights = attn_weights * layer_head_mask +- +- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) +- attn_output = self.o(attn_output) + +- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None +- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- # print("output_attentions=",output_attentions) +- if output_attentions: +- outputs = outputs + (attn_weights,) +- return outputs +- +- +-class T5CrossAttention(T5Attention): +- def __init__(self, config: T5Config, has_relative_attention_bias=False): +- super().__init__(config, has_relative_attention_bias) +- +- def forward( +- self, +- hidden_states, +- mask=None, +- key_value_states=None, +- position_bias=None, +- past_cross_key_value=None, +- layer_head_mask=None, +- query_length=None, +- use_cache=False, +- output_attentions=False, +- ): +- """ +- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). +- """ +- # Input is (batch_size, seq_length, dim) +- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) +- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) +- batch_size, seq_length = hidden_states.shape[:2] +- +- real_seq_length = seq_length +- +- if past_key_value is not None: +- if past_key_value.shape[0] != 2: +- # if len(past_key_value) != 2: +- raise ValueError( +- f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" +- ) +- real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length +- # print("key_value_states=",key_value_states, real_seq_length) +- key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] +- +- def shape(states): +- """projection""" +- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) +- +- def unshape(states): +- """reshape""" +- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) +- +- def project(hidden_states, proj_layer, key_value_states, past_key_value): +- """projects hidden states correctly to key/query states""" +- if key_value_states is None: +- # self-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(hidden_states)) +- elif past_key_value is None: +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(key_value_states)) +- +- if past_key_value is not None: +- if key_value_states is None: +- # self-attn +- # (batch_size, n_heads, key_length, dim_per_head) +- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) +- # print("hidden_states=",hidden_states.shape) +- elif past_key_value.shape[2] != key_value_states.shape[1]: +- # checking that the `sequence_length` of the `past_key_value` is the same as +- # the provided `key_value_states` to support prefix tuning +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(key_value_states)) +- else: +- # cross-attn +- hidden_states = past_key_value +- return hidden_states +- +- # get query states +- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) +- +- # get key/value states +- key_states = project( +- hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None +- ) +- value_states = project( +- hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None +- ) +- +- # compute scores +- scores = torch.matmul( +- query_states, key_states.transpose(3, 2) +- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 +- +- if position_bias is None: +- if not self.has_relative_attention_bias: +- position_bias = torch.zeros( +- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype +- ) +- if self.gradient_checkpointing and self.training: +- position_bias.requires_grad = True +- else: +- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) +- +- # if key and values are already calculated +- # we want only the last query position bias +- if past_key_value is not None: +- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] +- +- if mask is not None: +- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) +- +- if self.pruned_heads: +- mask = torch.ones(position_bias.shape[1]) +- mask[list(self.pruned_heads)] = 0 +- position_bias_masked = position_bias[:, mask.bool()] +- else: +- position_bias_masked = position_bias +- +- scores += position_bias_masked +- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( +- scores +- ) # (batch_size, n_heads, seq_length, key_length) +- attn_weights = nn.functional.dropout( +- attn_weights, p=self.dropout, training=self.training +- ) # (batch_size, n_heads, seq_length, key_length) +- +- # Mask heads 
if we want to +- if layer_head_mask is not None: +- attn_weights = attn_weights * layer_head_mask +- +- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) +- attn_output = self.o(attn_output) +- +- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None +- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- # print("output_attentions=",output_attentions) + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs +@@ -834,7 +575,7 @@ class T5CrossAttention(T5Attention): + class T5LayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() +- self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) ++ self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + +@@ -921,7 +662,6 @@ class T5Block(nn.Module): + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, +- past_cross_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, +@@ -931,17 +671,15 @@ class T5Block(nn.Module): + logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + +- # if len(past_key_value) != expected_num_past_key_values: +- # raise ValueError( +- # f"There should be {expected_num_past_key_values} past states. " +- # f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" +- # f"Got {len(past_key_value)} past key / value states" +- # ) +- +- self_attn_past_key_value = past_key_value +- # print("self_attn_past_key_value=",self_attn_past_key_value.dtype) +- cross_attn_past_key_value = past_cross_key_value +- # cross_attn_past_key_value = past_key_value[2:] ++ if len(past_key_value) != expected_num_past_key_values: ++ raise ValueError( ++ f"There should be {expected_num_past_key_values} past states. " ++ f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" ++ f"Got {len(past_key_value)} past key / value states" ++ ) ++ ++ self_attn_past_key_value = past_key_value[:2] ++ cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + +@@ -955,8 +693,6 @@ class T5Block(nn.Module): + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] +- # if self.is_decoder: +- # print("present_key_value_state=",present_key_value_state[0].dtype) + attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training +@@ -967,7 +703,7 @@ class T5Block(nn.Module): + torch.finfo(hidden_states.dtype).max, + ) + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) +- present_cross_key_value_state = () ++ + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: + # the actual query length is unknown for cross attention +@@ -1000,10 +736,9 @@ class T5Block(nn.Module): + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states +- # if present_key_value_state is not None: +- # present_key_value_state = present_key_value_state + cross_attention_outputs[1] +- cross_attn_past_key_values = cross_attention_outputs[1] +- # print("cross_attn_past_key_values=",cross_attn_past_key_values) ++ if present_key_value_state is not None: ++ present_key_value_state = present_key_value_state + cross_attention_outputs[1] ++ + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + +@@ -1022,7 +757,7 @@ class T5Block(nn.Module): + outputs = (hidden_states,) + + if use_cache: +- outputs = outputs + (present_key_value_state,) +(cross_attn_past_key_values,)+ attention_outputs ++ outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + +@@ -1162,14 +897,11 @@ class T5PreTrainedModel(PreTrainedModel) + + + class T5Stack(T5PreTrainedModel): +- def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskeyvalue=None): ++ def __init__(self, config, embed_tokens=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder +- self.lm_head=lm_head +- self.encodecrosskeyvalue = encodecrosskeyvalue +- self.model_dim = config.d_model + + self.block = nn.ModuleList( + [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] +@@ -1237,21 +969,19 @@ class T5Stack(T5PreTrainedModel): + def forward( + self, + input_ids=None, ++ attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, +- past_key_values=None, +- past_cross_key_values=None, +- attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, ++ past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + # Model parallel +- # print("aaaaaaaaaaaaaaaaa") + if self.model_parallel: + torch.cuda.set_device(self.first_device) + self.embed_tokens = self.embed_tokens.to(self.first_device) +@@ -1291,13 +1021,9 @@ class T5Stack(T5PreTrainedModel): + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") + + # initialize past_key_values with `None` if past does not exist +- #modified +- # if 
past_key_values is None: +- # past_key_values = [None] * len(self.block) +- #added +- if not self.is_decoder: ++ if past_key_values is None: + past_key_values = [None] * len(self.block) +- past_cross_key_values = [None] * len(self.block) ++ + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + +@@ -1328,10 +1054,7 @@ class T5Stack(T5PreTrainedModel): + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) +- # present_key_value_states = () if use_cache else None +- # present_cross_key_value_states = () if use_cache else None +- present_key_value_states = [] if use_cache else None +- # present_cross_key_value_states = [] if use_cache else None ++ present_key_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None +@@ -1339,10 +1062,8 @@ class T5Stack(T5PreTrainedModel): + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) +- for i, layer_module in enumerate(self.block): +- # for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): +- past_key_value = past_key_values[i] +- past_cross_key_value = past_cross_key_values[i] ++ ++ for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel +@@ -1392,7 +1113,6 @@ class T5Stack(T5PreTrainedModel): + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, + past_key_value=past_key_value, +- past_cross_key_value=past_cross_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -1400,22 +1120,19 @@ class T5Stack(T5PreTrainedModel): + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: +- layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] ++ layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] + +- hidden_states, present_key_value_state, present_cross_key_value_state = layer_outputs[:3] ++ hidden_states, present_key_value_state = layer_outputs[:2] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) +- position_bias = layer_outputs[3] ++ position_bias = layer_outputs[2] + if self.is_decoder and encoder_hidden_states is not None: +- encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] ++ encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] + # append next layer key value states + if use_cache: +- present_key_value_states.extend(present_key_value_state) +- # present_cross_key_value_states.extend(present_cross_key_value_state) +- # present_key_value_states = present_key_value_states + (present_key_value_state,) +- # present_cross_key_value_states = present_cross_key_value_states + (present_cross_key_value_state,) ++ present_key_value_states = present_key_value_states + (present_key_value_state,) + + 
if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) +@@ -1429,52 +1146,31 @@ class T5Stack(T5PreTrainedModel): + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) +- hidden_states = self.dropout(hidden_states).half() ++ hidden_states = self.dropout(hidden_states) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) +- # print("return_dict=",return_dict) ++ + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + present_key_value_states, +- # present_cross_key_value_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] + if v is not None + ) +- present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None +- # present_cross_key_value_states = torch.concat(present_cross_key_value_states).reshape(len(self.block), 2, +- # *present_cross_key_value_states[0].shape) if use_cache else None +- # print("dddddddddddd") +- # if use_cache: +- # print("present_key_value_states.shape=",present_key_value_states.shape,present_key_value_states.dtype) +- # return BaseModelOutputWithPastAndCrossAttentions( +- # last_hidden_state=hidden_states, +- # past_key_values=present_key_value_states, +- # past_cross_key_values=present_cross_key_value_states +- # ) +- if not self.is_decoder and self.encodecrosskeyvalue: +- res = self.encodecrosskeyvalue(hidden_states) +- return tuple((hidden_states, res)) +- # return BaseModelOutputWithPastAndCrossAttentions( +- # last_hidden_state=hidden_states, +- # past_key_values=present_key_value_states, +- # # past_cross_key_values=past_cross_key_values, +- # hidden_states=all_hidden_states, +- # attentions=all_attentions, +- # cross_attentions=all_cross_attentions, +- # ) +- if self.is_decoder: +- if self.config.tie_word_embeddings: +- hidden_states_1 = hidden_states * (self.model_dim ** -0.5) +- lm_logits = self.lm_head(hidden_states_1) +- return tuple((lm_logits, present_key_value_states)) ++ return BaseModelOutputWithPastAndCrossAttentions( ++ last_hidden_state=hidden_states, ++ past_key_values=present_key_value_states, ++ hidden_states=all_hidden_states, ++ attentions=all_attentions, ++ cross_attentions=all_cross_attentions, ++ ) + + + T5_START_DOCSTRING = r""" +@@ -1845,28 +1541,6 @@ class T5Model(T5PreTrainedModel): + ) + + +- +-class EncoderToCrossKeyValue(nn.Module): +- def __init__(self, cross_key, cross_value, num_heads, d_kv): +- super().__init__() +- self.cross_key = cross_key +- self.cross_value = cross_value +- self.num_heads = num_heads +- self.d_kv = d_kv +- +- +- def forward(self, hidden_states): +- batch_size = hidden_states.shape[0] +- encoder_hidden_states_kvs = [] +- for i in range(len(self.cross_value)): +- encoder_hidden_states_kvs.append( +- torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), +- self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) +- +- past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0) +- return past_cross_key_values +- +- + @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) + class T5ForConditionalGeneration(T5PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [ +@@ -1874,47 +1548,28 @@ class T5ForConditionalGeneration(T5PreTr + ] + _tied_weights_keys = 
["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + +- def __init__(self, config: T5Config, encoder_path=None, decoder_path=None, device_id=0): ++ def __init__(self, config: T5Config): + super().__init__(config) +- self.encoder_path = encoder_path +- self.decoder_path = decoder_path +- if not self.encoder_path or not self.decoder_path: +- self.model_dim = config.d_model +- +- self.shared = nn.Embedding(config.vocab_size, config.d_model) +- +- decoder_config = copy.deepcopy(config) +- decoder_config.is_decoder = True +- decoder_config.is_encoder_decoder = False +- decoder_config.num_layers = config.num_decoder_layers +- +- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) +- self.decoder = T5Stack(decoder_config, self.shared, self.lm_head) +- +- cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) +- cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) +- encodecrosskeyvalue = EncoderToCrossKeyValue(cross_key, cross_value, config.num_heads, config.d_kv) +- +- encoder_config = copy.deepcopy(config) +- encoder_config.is_decoder = False +- encoder_config.use_cache = False +- encoder_config.is_encoder_decoder = False +- self.encoder = T5Stack(encoder_config, self.shared, encodecrosskeyvalue=encodecrosskeyvalue) +- self.encoder_mindie = None +- self.decoder_mindie = None +- if self.encoder_path: +- self.encoder_mindie = torch.jit.load(self.encoder_path) +- if self.decoder_path: +- self.decoder_mindie = torch.jit.load(self.decoder_path) +- self.stream = torch.npu.Stream(f"npu:{device_id}") +- self.device_id = device_id +- +- +- def get_device(self): +- return f"npu:{self.device_id}" ++ self.model_dim = config.d_model ++ ++ self.shared = nn.Embedding(config.vocab_size, config.d_model) ++ ++ encoder_config = copy.deepcopy(config) ++ encoder_config.is_decoder = False ++ encoder_config.use_cache = False ++ encoder_config.is_encoder_decoder = False ++ self.encoder = T5Stack(encoder_config, self.shared) ++ ++ decoder_config = copy.deepcopy(config) ++ decoder_config.is_decoder = True ++ decoder_config.is_encoder_decoder = False ++ decoder_config.num_layers = config.num_decoder_layers ++ self.decoder = T5Stack(decoder_config, self.shared) ++ ++ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + + # Initialize weights and apply final processing +- # self.post_init() ++ self.post_init() + + # Model parallel + self.model_parallel = False +@@ -1993,7 +1648,6 @@ class T5ForConditionalGeneration(T5PreTr + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- past_cross_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, +@@ -2041,25 +1695,76 @@ class T5ForConditionalGeneration(T5PreTr + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask +- +- hidden_states = encoder_outputs["last_hidden_state"] +- # import pdb +- # pdb.set_trace() + +- # if self.model_parallel: +- # torch.cuda.set_device(self.decoder.first_device) ++ # Encode if needed (training, first prediction pass) ++ if encoder_outputs is None: ++ # Convert encoder 
inputs in embeddings if needed ++ encoder_outputs = self.encoder( ++ input_ids=input_ids, ++ attention_mask=attention_mask, ++ inputs_embeds=inputs_embeds, ++ head_mask=head_mask, ++ output_attentions=output_attentions, ++ output_hidden_states=output_hidden_states, ++ return_dict=return_dict, ++ ) ++ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): ++ encoder_outputs = BaseModelOutput( ++ last_hidden_state=encoder_outputs[0], ++ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, ++ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, ++ ) ++ ++ hidden_states = encoder_outputs[0] ++ ++ if self.model_parallel: ++ torch.cuda.set_device(self.decoder.first_device) + + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + +- import time +- start_time = time.time() +- with torch.npu.stream(self.stream): # set stream +- +- decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) +- self.stream.synchronize() # synchronize +- print("time is", time.time() - start_time) ++ # Set device for model parallelism ++ if self.model_parallel: ++ torch.cuda.set_device(self.decoder.first_device) ++ hidden_states = hidden_states.to(self.decoder.first_device) ++ if decoder_input_ids is not None: ++ decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) ++ if attention_mask is not None: ++ attention_mask = attention_mask.to(self.decoder.first_device) ++ if decoder_attention_mask is not None: ++ decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) ++ ++ # Decode ++ decoder_outputs = self.decoder( ++ input_ids=decoder_input_ids, ++ attention_mask=decoder_attention_mask, ++ inputs_embeds=decoder_inputs_embeds, ++ past_key_values=past_key_values, ++ encoder_hidden_states=hidden_states, ++ encoder_attention_mask=attention_mask, ++ head_mask=decoder_head_mask, ++ cross_attn_head_mask=cross_attn_head_mask, ++ use_cache=use_cache, ++ output_attentions=output_attentions, ++ output_hidden_states=output_hidden_states, ++ return_dict=return_dict, ++ ) ++ ++ sequence_output = decoder_outputs[0] ++ ++ # Set device for model parallelism ++ if self.model_parallel: ++ torch.cuda.set_device(self.encoder.first_device) ++ self.lm_head = self.lm_head.to(self.encoder.first_device) ++ sequence_output = sequence_output.to(self.lm_head.weight.device) ++ ++ if self.config.tie_word_embeddings: ++ # Rescale output before projecting on vocab ++ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 ++ sequence_output = sequence_output * (self.model_dim**-0.5) ++ ++ lm_logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: +@@ -2072,17 +1777,23 @@ class T5ForConditionalGeneration(T5PreTr + if not return_dict: + output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs + return ((loss,) + output) if loss is not None else output ++ + return Seq2SeqLMOutput( + loss=loss, +- logits=decoder_outputs[0], +- past_key_values=decoder_outputs[1] ++ logits=lm_logits, ++ past_key_values=decoder_outputs.past_key_values, ++ decoder_hidden_states=decoder_outputs.hidden_states, ++ decoder_attentions=decoder_outputs.attentions, ++ cross_attentions=decoder_outputs.cross_attentions, ++ 
encoder_last_hidden_state=encoder_outputs.last_hidden_state, ++ encoder_hidden_states=encoder_outputs.hidden_states, ++ encoder_attentions=encoder_outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, +- past_cross_key_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, +@@ -2108,7 +1819,6 @@ class T5ForConditionalGeneration(T5PreTr + return { + "decoder_input_ids": input_ids, + "past_key_values": past_key_values, +- "past_cross_key_values": past_cross_key_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, +@@ -2168,9 +1878,6 @@ class T5EncoderModel(T5PreTrainedModel): + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) +- self.encoder_mindie = torch.jit.load(r"/opt/naie/zhenwenqi/T5/encoder/encoder_compiled.pt") +- # self.decoder_mindie = torch.jit.load(r"/opt/naie/zhenwenqi/T5/decoder/decoder_compiled.pt") +- self.stream = torch.npu.Stream("npu:2") + + # Initialize weights and apply final processing + self.post_init() +@@ -2260,6 +1967,7 @@ class T5EncoderModel(T5PreTrainedModel): + >>> last_hidden_states = outputs.last_hidden_state + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict ++ + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, diff --git a/MindIE/MindIE-Torch/built-in/modeling_utils.patch b/MindIE/MindIE-Torch/built-in/modeling_utils.patch new file mode 100644 index 0000000000..e69de29bb2 diff --git a/MindIE/MindIE-Torch/built-in/utils.patch b/MindIE/MindIE-Torch/built-in/utils.patch new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 575815494b5d89ed682b83353806888aae398671 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 03:28:45 +0000 Subject: [PATCH 004/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/modeling=5Foutputs.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/modeling_outputs.patch | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/modeling_outputs.patch diff --git a/MindIE/MindIE-Torch/built-in/modeling_outputs.patch b/MindIE/MindIE-Torch/built-in/modeling_outputs.patch deleted file mode 100644 index e69de29bb2..0000000000 -- Gitee From a4e490eadbd112d032d6bdac5aa4942a9cf70a24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 03:28:56 +0000 Subject: [PATCH 005/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/modeling=5Ft5.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../MindIE-Torch/built-in/modeling_t5.patch | 819 ------------------ 1 file changed, 819 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/modeling_t5.patch diff --git a/MindIE/MindIE-Torch/built-in/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/modeling_t5.patch deleted file mode 100644 index 4a376cf5eb..0000000000 --- a/MindIE/MindIE-Torch/built-in/modeling_t5.patch +++ /dev/null @@ -1,819 +0,0 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py 2024-08-29 11:11:23.852000000 +0800 -+++ modeling_t5.py 2024-08-29 11:19:34.572000000 +0800 -@@ -23,8 +23,6 @@ from typing 
import List, Optional, Tuple - import torch - from torch import nn - from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss --import torch_npu --import mindietorch - - from ...activations import ACT2FN - from ...modeling_outputs import ( -@@ -246,7 +244,7 @@ class T5LayerNorm(nn.Module): - - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) -- # print("self.weight.dtype=",self.weight.dtype) -+ - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) -@@ -451,7 +449,6 @@ class T5Attention(nn.Module): - key_value_states=None, - position_bias=None, - past_key_value=None, -- past_cross_key_value=None, - layer_head_mask=None, - query_length=None, - use_cache=False, -@@ -468,8 +465,7 @@ class T5Attention(nn.Module): - real_seq_length = seq_length - - if past_key_value is not None: -- if past_key_value.shape[0] != 2: -- # if len(past_key_value) != 2: -+ if len(past_key_value) != 2: - raise ValueError( - f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" - ) -@@ -497,7 +493,6 @@ class T5Attention(nn.Module): - hidden_states = shape(proj_layer(key_value_states)) - - if past_key_value is not None: -- past_key_value = shape(past_key_value) - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) -@@ -571,261 +566,7 @@ class T5Attention(nn.Module): - - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- # print("output_attentions=",output_attentions) -- if output_attentions: -- outputs = outputs + (attn_weights,) -- return outputs -- -- --class T5SelfAttention(T5Attention): -- def __init__(self, config: T5Config, has_relative_attention_bias=False): -- super().__init__(config, has_relative_attention_bias) -- -- def forward( -- self, -- hidden_states, -- mask=None, -- position_bias=None, -- past_key_value=None, -- layer_head_mask=None, -- use_cache=False, -- output_attentions=False, -- ): -- """ -- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). -- """ -- # Input is (batch_size, seq_length, dim) -- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) -- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) -- batch_size, seq_length = hidden_states.shape[:2] -- -- real_seq_length = seq_length -- -- if past_key_value is not None: -- if past_key_value.shape[0] != 2: -- # if len(past_key_value) != 2: -- raise ValueError( -- f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" -- ) -- real_seq_length += past_key_value[0].shape[2] -- # print("key_value_states=",real_seq_length) -- key_length = real_seq_length -- -- def shape(states): -- """projection""" -- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) -- -- def unshape(states): -- """reshape""" -- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) -- -- def project(hidden_states, proj_layer, past_key_value): -- """projects hidden states correctly to key/query states""" -- if past_key_value is None: -- # cross-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(hidden_states)) -- -- if past_key_value is not None: -- hidden_states = shape(proj_layer(hidden_states)) -- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) -- return hidden_states -- -- # get query states -- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) -- -- # get key/value states -- key_states = project( -- hidden_states, self.k, past_key_value[0] if past_key_value is not None else None -- ) -- value_states = project( -- hidden_states, self.v, past_key_value[1] if past_key_value is not None else None -- ) -- # print("key_states=",hidden_states.dtype,key_states.dtype) -- # compute scores -- scores = torch.matmul( -- query_states, key_states.transpose(3, 2) -- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 -- # print("scores=",scores.dtype) -- if position_bias is None: -- if not self.has_relative_attention_bias: -- position_bias = torch.zeros( -- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype -- ) -- if self.gradient_checkpointing and self.training: -- position_bias.requires_grad = True -- else: -- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) -- -- # if key and values are already calculated -- # we want only the last query position bias -- if past_key_value is not None: -- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] -- -- if mask is not None: -- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) -- -- if self.pruned_heads: -- mask = torch.ones(position_bias.shape[1]) -- mask[list(self.pruned_heads)] = 0 -- position_bias_masked = position_bias[:, mask.bool()] -- else: -- position_bias_masked = position_bias -- -- scores += position_bias_masked -- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( -- scores -- ) # (batch_size, n_heads, seq_length, key_length) -- attn_weights = nn.functional.dropout( -- attn_weights, p=self.dropout, training=self.training -- ) # (batch_size, n_heads, seq_length, key_length) -- -- # Mask heads if we want to -- if layer_head_mask is not None: -- attn_weights = attn_weights * layer_head_mask -- -- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) -- attn_output = self.o(attn_output) - -- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None -- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- # print("output_attentions=",output_attentions) -- if output_attentions: -- outputs = outputs + (attn_weights,) -- return outputs -- -- --class T5CrossAttention(T5Attention): -- def __init__(self, config: T5Config, has_relative_attention_bias=False): -- super().__init__(config, 
has_relative_attention_bias) -- -- def forward( -- self, -- hidden_states, -- mask=None, -- key_value_states=None, -- position_bias=None, -- past_cross_key_value=None, -- layer_head_mask=None, -- query_length=None, -- use_cache=False, -- output_attentions=False, -- ): -- """ -- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). -- """ -- # Input is (batch_size, seq_length, dim) -- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) -- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) -- batch_size, seq_length = hidden_states.shape[:2] -- -- real_seq_length = seq_length -- -- if past_key_value is not None: -- if past_key_value.shape[0] != 2: -- # if len(past_key_value) != 2: -- raise ValueError( -- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" -- ) -- real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length -- # print("key_value_states=",key_value_states, real_seq_length) -- key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] -- -- def shape(states): -- """projection""" -- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) -- -- def unshape(states): -- """reshape""" -- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) -- -- def project(hidden_states, proj_layer, key_value_states, past_key_value): -- """projects hidden states correctly to key/query states""" -- if key_value_states is None: -- # self-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(hidden_states)) -- elif past_key_value is None: -- # cross-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(key_value_states)) -- -- if past_key_value is not None: -- if key_value_states is None: -- # self-attn -- # (batch_size, n_heads, key_length, dim_per_head) -- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) -- # print("hidden_states=",hidden_states.shape) -- elif past_key_value.shape[2] != key_value_states.shape[1]: -- # checking that the `sequence_length` of the `past_key_value` is the same as -- # the provided `key_value_states` to support prefix tuning -- # cross-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(key_value_states)) -- else: -- # cross-attn -- hidden_states = past_key_value -- return hidden_states -- -- # get query states -- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) -- -- # get key/value states -- key_states = project( -- hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None -- ) -- value_states = project( -- hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None -- ) -- -- # compute scores -- scores = torch.matmul( -- query_states, key_states.transpose(3, 2) -- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 -- -- if position_bias is None: -- if not self.has_relative_attention_bias: -- position_bias = torch.zeros( -- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype -- ) -- if self.gradient_checkpointing and self.training: -- position_bias.requires_grad = True -- else: -- position_bias = self.compute_bias(real_seq_length, key_length, 
device=scores.device) -- -- # if key and values are already calculated -- # we want only the last query position bias -- if past_key_value is not None: -- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] -- -- if mask is not None: -- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) -- -- if self.pruned_heads: -- mask = torch.ones(position_bias.shape[1]) -- mask[list(self.pruned_heads)] = 0 -- position_bias_masked = position_bias[:, mask.bool()] -- else: -- position_bias_masked = position_bias -- -- scores += position_bias_masked -- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( -- scores -- ) # (batch_size, n_heads, seq_length, key_length) -- attn_weights = nn.functional.dropout( -- attn_weights, p=self.dropout, training=self.training -- ) # (batch_size, n_heads, seq_length, key_length) -- -- # Mask heads if we want to -- if layer_head_mask is not None: -- attn_weights = attn_weights * layer_head_mask -- -- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) -- attn_output = self.o(attn_output) -- -- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None -- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- # print("output_attentions=",output_attentions) - if output_attentions: - outputs = outputs + (attn_weights,) - return outputs -@@ -834,7 +575,7 @@ class T5CrossAttention(T5Attention): - class T5LayerSelfAttention(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() -- self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) -+ self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - -@@ -921,7 +662,6 @@ class T5Block(nn.Module): - layer_head_mask=None, - cross_attn_layer_head_mask=None, - past_key_value=None, -- past_cross_key_value=None, - use_cache=False, - output_attentions=False, - return_dict=True, -@@ -931,17 +671,15 @@ class T5Block(nn.Module): - logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - -- # if len(past_key_value) != expected_num_past_key_values: -- # raise ValueError( -- # f"There should be {expected_num_past_key_values} past states. " -- # f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" -- # f"Got {len(past_key_value)} past key / value states" -- # ) -- -- self_attn_past_key_value = past_key_value -- # print("self_attn_past_key_value=",self_attn_past_key_value.dtype) -- cross_attn_past_key_value = past_cross_key_value -- # cross_attn_past_key_value = past_key_value[2:] -+ if len(past_key_value) != expected_num_past_key_values: -+ raise ValueError( -+ f"There should be {expected_num_past_key_values} past states. " -+ f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" -+ f"Got {len(past_key_value)} past key / value states" -+ ) -+ -+ self_attn_past_key_value = past_key_value[:2] -+ cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - -@@ -955,8 +693,6 @@ class T5Block(nn.Module): - output_attentions=output_attentions, - ) - hidden_states, present_key_value_state = self_attention_outputs[:2] -- # if self.is_decoder: -- # print("present_key_value_state=",present_key_value_state[0].dtype) - attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights - - # clamp inf values to enable fp16 training -@@ -967,7 +703,7 @@ class T5Block(nn.Module): - torch.finfo(hidden_states.dtype).max, - ) - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) -- present_cross_key_value_state = () -+ - do_cross_attention = self.is_decoder and encoder_hidden_states is not None - if do_cross_attention: - # the actual query length is unknown for cross attention -@@ -1000,10 +736,9 @@ class T5Block(nn.Module): - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - # Combine self attn and cross attn key value states -- # if present_key_value_state is not None: -- # present_key_value_state = present_key_value_state + cross_attention_outputs[1] -- cross_attn_past_key_values = cross_attention_outputs[1] -- # print("cross_attn_past_key_values=",cross_attn_past_key_values) -+ if present_key_value_state is not None: -+ present_key_value_state = present_key_value_state + cross_attention_outputs[1] -+ - # Keep cross-attention outputs and relative position weights - attention_outputs = attention_outputs + cross_attention_outputs[2:] - -@@ -1022,7 +757,7 @@ class T5Block(nn.Module): - outputs = (hidden_states,) - - if use_cache: -- outputs = outputs + (present_key_value_state,) +(cross_attn_past_key_values,)+ attention_outputs -+ outputs = outputs + (present_key_value_state,) + attention_outputs - else: - outputs = outputs + attention_outputs - -@@ -1162,14 +897,11 @@ class T5PreTrainedModel(PreTrainedModel) - - - class T5Stack(T5PreTrainedModel): -- def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskeyvalue=None): -+ def __init__(self, config, embed_tokens=None): - super().__init__(config) - - self.embed_tokens = embed_tokens - self.is_decoder = config.is_decoder -- self.lm_head=lm_head -- self.encodecrosskeyvalue = encodecrosskeyvalue -- self.model_dim = config.d_model - - self.block = nn.ModuleList( - [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -1237,21 +969,19 @@ class T5Stack(T5PreTrainedModel): - def forward( - self, - input_ids=None, -+ attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, -- past_key_values=None, -- past_cross_key_values=None, -- attention_mask=None, - inputs_embeds=None, - head_mask=None, - cross_attn_head_mask=None, -+ past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - # Model parallel -- # print("aaaaaaaaaaaaaaaaa") - if self.model_parallel: - torch.cuda.set_device(self.first_device) - self.embed_tokens = self.embed_tokens.to(self.first_device) -@@ -1291,13 +1021,9 @@ class T5Stack(T5PreTrainedModel): - raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") - - # initialize past_key_values with `None` if past does not exist -- #modified -- # if 
past_key_values is None: -- # past_key_values = [None] * len(self.block) -- #added -- if not self.is_decoder: -+ if past_key_values is None: - past_key_values = [None] * len(self.block) -- past_cross_key_values = [None] * len(self.block) -+ - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - -@@ -1328,10 +1054,7 @@ class T5Stack(T5PreTrainedModel): - # Prepare head mask if needed - head_mask = self.get_head_mask(head_mask, self.config.num_layers) - cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) -- # present_key_value_states = () if use_cache else None -- # present_cross_key_value_states = () if use_cache else None -- present_key_value_states = [] if use_cache else None -- # present_cross_key_value_states = [] if use_cache else None -+ present_key_value_states = () if use_cache else None - all_hidden_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1339,10 +1062,8 @@ class T5Stack(T5PreTrainedModel): - encoder_decoder_position_bias = None - - hidden_states = self.dropout(inputs_embeds) -- for i, layer_module in enumerate(self.block): -- # for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): -- past_key_value = past_key_values[i] -- past_cross_key_value = past_cross_key_values[i] -+ -+ for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): - layer_head_mask = head_mask[i] - cross_attn_layer_head_mask = cross_attn_head_mask[i] - # Model parallel -@@ -1392,7 +1113,6 @@ class T5Stack(T5PreTrainedModel): - layer_head_mask=layer_head_mask, - cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key_value=past_key_value, -- past_cross_key_value=past_cross_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) -@@ -1400,22 +1120,19 @@ class T5Stack(T5PreTrainedModel): - # layer_outputs is a tuple with: - # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - if use_cache is False: -- layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] -+ layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] - -- hidden_states, present_key_value_state, present_cross_key_value_state = layer_outputs[:3] -+ hidden_states, present_key_value_state = layer_outputs[:2] - - # We share the position biases between the layers - the first layer store them - # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), - # (cross-attention position bias), (cross-attention weights) -- position_bias = layer_outputs[3] -+ position_bias = layer_outputs[2] - if self.is_decoder and encoder_hidden_states is not None: -- encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] -+ encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] - # append next layer key value states - if use_cache: -- present_key_value_states.extend(present_key_value_state) -- # present_cross_key_value_states.extend(present_cross_key_value_state) -- # present_key_value_states = present_key_value_states + (present_key_value_state,) -- # present_cross_key_value_states = present_cross_key_value_states + (present_cross_key_value_state,) -+ present_key_value_states = present_key_value_states + (present_key_value_state,) - - 
if output_attentions: - all_attentions = all_attentions + (layer_outputs[3],) -@@ -1429,52 +1146,31 @@ class T5Stack(T5PreTrainedModel): - hidden_states = hidden_states.to("cuda:" + str(k + 1)) - - hidden_states = self.final_layer_norm(hidden_states) -- hidden_states = self.dropout(hidden_states).half() -+ hidden_states = self.dropout(hidden_states) - - # Add last layer - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) -- # print("return_dict=",return_dict) -+ - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - present_key_value_states, -- # present_cross_key_value_states, - all_hidden_states, - all_attentions, - all_cross_attentions, - ] - if v is not None - ) -- present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None -- # present_cross_key_value_states = torch.concat(present_cross_key_value_states).reshape(len(self.block), 2, -- # *present_cross_key_value_states[0].shape) if use_cache else None -- # print("dddddddddddd") -- # if use_cache: -- # print("present_key_value_states.shape=",present_key_value_states.shape,present_key_value_states.dtype) -- # return BaseModelOutputWithPastAndCrossAttentions( -- # last_hidden_state=hidden_states, -- # past_key_values=present_key_value_states, -- # past_cross_key_values=present_cross_key_value_states -- # ) -- if not self.is_decoder and self.encodecrosskeyvalue: -- res = self.encodecrosskeyvalue(hidden_states) -- return tuple((hidden_states, res)) -- # return BaseModelOutputWithPastAndCrossAttentions( -- # last_hidden_state=hidden_states, -- # past_key_values=present_key_value_states, -- # # past_cross_key_values=past_cross_key_values, -- # hidden_states=all_hidden_states, -- # attentions=all_attentions, -- # cross_attentions=all_cross_attentions, -- # ) -- if self.is_decoder: -- if self.config.tie_word_embeddings: -- hidden_states_1 = hidden_states * (self.model_dim ** -0.5) -- lm_logits = self.lm_head(hidden_states_1) -- return tuple((lm_logits, present_key_value_states)) -+ return BaseModelOutputWithPastAndCrossAttentions( -+ last_hidden_state=hidden_states, -+ past_key_values=present_key_value_states, -+ hidden_states=all_hidden_states, -+ attentions=all_attentions, -+ cross_attentions=all_cross_attentions, -+ ) - - - T5_START_DOCSTRING = r""" -@@ -1845,28 +1541,6 @@ class T5Model(T5PreTrainedModel): - ) - - -- --class EncoderToCrossKeyValue(nn.Module): -- def __init__(self, cross_key, cross_value, num_heads, d_kv): -- super().__init__() -- self.cross_key = cross_key -- self.cross_value = cross_value -- self.num_heads = num_heads -- self.d_kv = d_kv -- -- -- def forward(self, hidden_states): -- batch_size = hidden_states.shape[0] -- encoder_hidden_states_kvs = [] -- for i in range(len(self.cross_value)): -- encoder_hidden_states_kvs.append( -- torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), -- self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) -- -- past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0) -- return past_cross_key_values -- -- - @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) - class T5ForConditionalGeneration(T5PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [ -@@ -1874,47 +1548,28 @@ class T5ForConditionalGeneration(T5PreTr - ] - _tied_weights_keys = 
["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] - -- def __init__(self, config: T5Config, encoder_path=None, decoder_path=None, device_id=0): -+ def __init__(self, config: T5Config): - super().__init__(config) -- self.encoder_path = encoder_path -- self.decoder_path = decoder_path -- if not self.encoder_path or not self.decoder_path: -- self.model_dim = config.d_model -- -- self.shared = nn.Embedding(config.vocab_size, config.d_model) -- -- decoder_config = copy.deepcopy(config) -- decoder_config.is_decoder = True -- decoder_config.is_encoder_decoder = False -- decoder_config.num_layers = config.num_decoder_layers -- -- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) -- self.decoder = T5Stack(decoder_config, self.shared, self.lm_head) -- -- cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) -- cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) -- encodecrosskeyvalue = EncoderToCrossKeyValue(cross_key, cross_value, config.num_heads, config.d_kv) -- -- encoder_config = copy.deepcopy(config) -- encoder_config.is_decoder = False -- encoder_config.use_cache = False -- encoder_config.is_encoder_decoder = False -- self.encoder = T5Stack(encoder_config, self.shared, encodecrosskeyvalue=encodecrosskeyvalue) -- self.encoder_mindie = None -- self.decoder_mindie = None -- if self.encoder_path: -- self.encoder_mindie = torch.jit.load(self.encoder_path) -- if self.decoder_path: -- self.decoder_mindie = torch.jit.load(self.decoder_path) -- self.stream = torch.npu.Stream(f"npu:{device_id}") -- self.device_id = device_id -- -- -- def get_device(self): -- return f"npu:{self.device_id}" -+ self.model_dim = config.d_model -+ -+ self.shared = nn.Embedding(config.vocab_size, config.d_model) -+ -+ encoder_config = copy.deepcopy(config) -+ encoder_config.is_decoder = False -+ encoder_config.use_cache = False -+ encoder_config.is_encoder_decoder = False -+ self.encoder = T5Stack(encoder_config, self.shared) -+ -+ decoder_config = copy.deepcopy(config) -+ decoder_config.is_decoder = True -+ decoder_config.is_encoder_decoder = False -+ decoder_config.num_layers = config.num_decoder_layers -+ self.decoder = T5Stack(decoder_config, self.shared) -+ -+ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) - - # Initialize weights and apply final processing -- # self.post_init() -+ self.post_init() - - # Model parallel - self.model_parallel = False -@@ -1993,7 +1648,6 @@ class T5ForConditionalGeneration(T5PreTr - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, -- past_cross_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, -@@ -2041,25 +1695,76 @@ class T5ForConditionalGeneration(T5PreTr - if self.config.num_layers == self.config.num_decoder_layers: - warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) - decoder_head_mask = head_mask -- -- hidden_states = encoder_outputs["last_hidden_state"] -- # import pdb -- # pdb.set_trace() - -- # if self.model_parallel: -- # torch.cuda.set_device(self.decoder.first_device) -+ # Encode if needed (training, first prediction pass) -+ if encoder_outputs is None: -+ # Convert encoder 
inputs in embeddings if needed -+ encoder_outputs = self.encoder( -+ input_ids=input_ids, -+ attention_mask=attention_mask, -+ inputs_embeds=inputs_embeds, -+ head_mask=head_mask, -+ output_attentions=output_attentions, -+ output_hidden_states=output_hidden_states, -+ return_dict=return_dict, -+ ) -+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): -+ encoder_outputs = BaseModelOutput( -+ last_hidden_state=encoder_outputs[0], -+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, -+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, -+ ) -+ -+ hidden_states = encoder_outputs[0] -+ -+ if self.model_parallel: -+ torch.cuda.set_device(self.decoder.first_device) - - if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: - # get decoder inputs from shifting lm labels to the right - decoder_input_ids = self._shift_right(labels) - -- import time -- start_time = time.time() -- with torch.npu.stream(self.stream): # set stream -- -- decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) -- self.stream.synchronize() # synchronize -- print("time is", time.time() - start_time) -+ # Set device for model parallelism -+ if self.model_parallel: -+ torch.cuda.set_device(self.decoder.first_device) -+ hidden_states = hidden_states.to(self.decoder.first_device) -+ if decoder_input_ids is not None: -+ decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) -+ if attention_mask is not None: -+ attention_mask = attention_mask.to(self.decoder.first_device) -+ if decoder_attention_mask is not None: -+ decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) -+ -+ # Decode -+ decoder_outputs = self.decoder( -+ input_ids=decoder_input_ids, -+ attention_mask=decoder_attention_mask, -+ inputs_embeds=decoder_inputs_embeds, -+ past_key_values=past_key_values, -+ encoder_hidden_states=hidden_states, -+ encoder_attention_mask=attention_mask, -+ head_mask=decoder_head_mask, -+ cross_attn_head_mask=cross_attn_head_mask, -+ use_cache=use_cache, -+ output_attentions=output_attentions, -+ output_hidden_states=output_hidden_states, -+ return_dict=return_dict, -+ ) -+ -+ sequence_output = decoder_outputs[0] -+ -+ # Set device for model parallelism -+ if self.model_parallel: -+ torch.cuda.set_device(self.encoder.first_device) -+ self.lm_head = self.lm_head.to(self.encoder.first_device) -+ sequence_output = sequence_output.to(self.lm_head.weight.device) -+ -+ if self.config.tie_word_embeddings: -+ # Rescale output before projecting on vocab -+ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 -+ sequence_output = sequence_output * (self.model_dim**-0.5) -+ -+ lm_logits = self.lm_head(sequence_output) - - loss = None - if labels is not None: -@@ -2072,17 +1777,23 @@ class T5ForConditionalGeneration(T5PreTr - if not return_dict: - output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs - return ((loss,) + output) if loss is not None else output -+ - return Seq2SeqLMOutput( - loss=loss, -- logits=decoder_outputs[0], -- past_key_values=decoder_outputs[1] -+ logits=lm_logits, -+ past_key_values=decoder_outputs.past_key_values, -+ decoder_hidden_states=decoder_outputs.hidden_states, -+ decoder_attentions=decoder_outputs.attentions, -+ cross_attentions=decoder_outputs.cross_attentions, -+ 
encoder_last_hidden_state=encoder_outputs.last_hidden_state, -+ encoder_hidden_states=encoder_outputs.hidden_states, -+ encoder_attentions=encoder_outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, -- past_cross_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, -@@ -2108,7 +1819,6 @@ class T5ForConditionalGeneration(T5PreTr - return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, -- "past_cross_key_values": past_cross_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, -@@ -2168,9 +1878,6 @@ class T5EncoderModel(T5PreTrainedModel): - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False - self.encoder = T5Stack(encoder_config, self.shared) -- self.encoder_mindie = torch.jit.load(r"/opt/naie/zhenwenqi/T5/encoder/encoder_compiled.pt") -- # self.decoder_mindie = torch.jit.load(r"/opt/naie/zhenwenqi/T5/decoder/decoder_compiled.pt") -- self.stream = torch.npu.Stream("npu:2") - - # Initialize weights and apply final processing - self.post_init() -@@ -2260,6 +1967,7 @@ class T5EncoderModel(T5PreTrainedModel): - >>> last_hidden_states = outputs.last_hidden_state - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict -+ - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, -- Gitee From ba6268d1d3922bea5f0c0e443aaf6251f442816a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 03:29:04 +0000 Subject: [PATCH 006/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/modeling=5Futils.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/modeling_utils.patch | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/modeling_utils.patch diff --git a/MindIE/MindIE-Torch/built-in/modeling_utils.patch b/MindIE/MindIE-Torch/built-in/modeling_utils.patch deleted file mode 100644 index e69de29bb2..0000000000 -- Gitee From 93ef152147c62f305265ec275f12c358315970be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 03:29:12 +0000 Subject: [PATCH 007/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/utils.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/utils.patch | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/utils.patch diff --git a/MindIE/MindIE-Torch/built-in/utils.patch b/MindIE/MindIE-Torch/built-in/utils.patch deleted file mode 100644 index e69de29bb2..0000000000 -- Gitee From 0c06288203ab056ddcb8442914db41a5cec65d64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 03:29:32 +0000 Subject: [PATCH 008/110] transformers patch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_outputs.patch | 0 .../built-in/T5/modeling_t5.patch | 819 ++++++++++++++++++ .../built-in/T5/modeling_utils.patch | 0 MindIE/MindIE-Torch/built-in/T5/utils.patch | 0 4 files changed, 819 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch create mode 100644 
MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch create mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch create mode 100644 MindIE/MindIE-Torch/built-in/T5/utils.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch new file mode 100644 index 0000000000..e69de29bb2 diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch new file mode 100644 index 0000000000..4a376cf5eb --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch @@ -0,0 +1,819 @@ +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py 2024-08-29 11:11:23.852000000 +0800 ++++ modeling_t5.py 2024-08-29 11:19:34.572000000 +0800 +@@ -23,8 +23,6 @@ from typing import List, Optional, Tuple + import torch + from torch import nn + from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +-import torch_npu +-import mindietorch + + from ...activations import ACT2FN + from ...modeling_outputs import ( +@@ -246,7 +244,7 @@ class T5LayerNorm(nn.Module): + + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) +- # print("self.weight.dtype=",self.weight.dtype) ++ + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) +@@ -451,7 +449,6 @@ class T5Attention(nn.Module): + key_value_states=None, + position_bias=None, + past_key_value=None, +- past_cross_key_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, +@@ -468,8 +465,7 @@ class T5Attention(nn.Module): + real_seq_length = seq_length + + if past_key_value is not None: +- if past_key_value.shape[0] != 2: +- # if len(past_key_value) != 2: ++ if len(past_key_value) != 2: + raise ValueError( + f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" + ) +@@ -497,7 +493,6 @@ class T5Attention(nn.Module): + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: +- past_key_value = shape(past_key_value) + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) +@@ -571,261 +566,7 @@ class T5Attention(nn.Module): + + present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- # print("output_attentions=",output_attentions) +- if output_attentions: +- outputs = outputs + (attn_weights,) +- return outputs +- +- +-class T5SelfAttention(T5Attention): +- def __init__(self, config: T5Config, has_relative_attention_bias=False): +- super().__init__(config, has_relative_attention_bias) +- +- def forward( +- self, +- hidden_states, +- mask=None, +- position_bias=None, +- past_key_value=None, +- layer_head_mask=None, +- use_cache=False, +- output_attentions=False, +- ): +- """ +- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). 
+- """ +- # Input is (batch_size, seq_length, dim) +- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) +- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) +- batch_size, seq_length = hidden_states.shape[:2] +- +- real_seq_length = seq_length +- +- if past_key_value is not None: +- if past_key_value.shape[0] != 2: +- # if len(past_key_value) != 2: +- raise ValueError( +- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" +- ) +- real_seq_length += past_key_value[0].shape[2] +- # print("key_value_states=",real_seq_length) +- key_length = real_seq_length +- +- def shape(states): +- """projection""" +- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) +- +- def unshape(states): +- """reshape""" +- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) +- +- def project(hidden_states, proj_layer, past_key_value): +- """projects hidden states correctly to key/query states""" +- if past_key_value is None: +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(hidden_states)) +- +- if past_key_value is not None: +- hidden_states = shape(proj_layer(hidden_states)) +- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) +- return hidden_states +- +- # get query states +- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) +- +- # get key/value states +- key_states = project( +- hidden_states, self.k, past_key_value[0] if past_key_value is not None else None +- ) +- value_states = project( +- hidden_states, self.v, past_key_value[1] if past_key_value is not None else None +- ) +- # print("key_states=",hidden_states.dtype,key_states.dtype) +- # compute scores +- scores = torch.matmul( +- query_states, key_states.transpose(3, 2) +- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 +- # print("scores=",scores.dtype) +- if position_bias is None: +- if not self.has_relative_attention_bias: +- position_bias = torch.zeros( +- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype +- ) +- if self.gradient_checkpointing and self.training: +- position_bias.requires_grad = True +- else: +- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) +- +- # if key and values are already calculated +- # we want only the last query position bias +- if past_key_value is not None: +- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] +- +- if mask is not None: +- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) +- +- if self.pruned_heads: +- mask = torch.ones(position_bias.shape[1]) +- mask[list(self.pruned_heads)] = 0 +- position_bias_masked = position_bias[:, mask.bool()] +- else: +- position_bias_masked = position_bias +- +- scores += position_bias_masked +- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( +- scores +- ) # (batch_size, n_heads, seq_length, key_length) +- attn_weights = nn.functional.dropout( +- attn_weights, p=self.dropout, training=self.training +- ) # (batch_size, n_heads, seq_length, key_length) +- +- # Mask heads if we want to +- if layer_head_mask is not None: +- attn_weights = attn_weights * layer_head_mask +- +- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) +- attn_output = 
self.o(attn_output) + +- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None +- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- # print("output_attentions=",output_attentions) +- if output_attentions: +- outputs = outputs + (attn_weights,) +- return outputs +- +- +-class T5CrossAttention(T5Attention): +- def __init__(self, config: T5Config, has_relative_attention_bias=False): +- super().__init__(config, has_relative_attention_bias) +- +- def forward( +- self, +- hidden_states, +- mask=None, +- key_value_states=None, +- position_bias=None, +- past_cross_key_value=None, +- layer_head_mask=None, +- query_length=None, +- use_cache=False, +- output_attentions=False, +- ): +- """ +- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). +- """ +- # Input is (batch_size, seq_length, dim) +- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) +- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) +- batch_size, seq_length = hidden_states.shape[:2] +- +- real_seq_length = seq_length +- +- if past_key_value is not None: +- if past_key_value.shape[0] != 2: +- # if len(past_key_value) != 2: +- raise ValueError( +- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" +- ) +- real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length +- # print("key_value_states=",key_value_states, real_seq_length) +- key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] +- +- def shape(states): +- """projection""" +- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) +- +- def unshape(states): +- """reshape""" +- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) +- +- def project(hidden_states, proj_layer, key_value_states, past_key_value): +- """projects hidden states correctly to key/query states""" +- if key_value_states is None: +- # self-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(hidden_states)) +- elif past_key_value is None: +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(key_value_states)) +- +- if past_key_value is not None: +- if key_value_states is None: +- # self-attn +- # (batch_size, n_heads, key_length, dim_per_head) +- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) +- # print("hidden_states=",hidden_states.shape) +- elif past_key_value.shape[2] != key_value_states.shape[1]: +- # checking that the `sequence_length` of the `past_key_value` is the same as +- # the provided `key_value_states` to support prefix tuning +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(key_value_states)) +- else: +- # cross-attn +- hidden_states = past_key_value +- return hidden_states +- +- # get query states +- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) +- +- # get key/value states +- key_states = project( +- hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None +- ) +- value_states = project( +- hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None +- ) +- +- # compute scores +- scores = torch.matmul( +- query_states, key_states.transpose(3, 2) +- ) 
# equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 +- +- if position_bias is None: +- if not self.has_relative_attention_bias: +- position_bias = torch.zeros( +- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype +- ) +- if self.gradient_checkpointing and self.training: +- position_bias.requires_grad = True +- else: +- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) +- +- # if key and values are already calculated +- # we want only the last query position bias +- if past_key_value is not None: +- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] +- +- if mask is not None: +- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) +- +- if self.pruned_heads: +- mask = torch.ones(position_bias.shape[1]) +- mask[list(self.pruned_heads)] = 0 +- position_bias_masked = position_bias[:, mask.bool()] +- else: +- position_bias_masked = position_bias +- +- scores += position_bias_masked +- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( +- scores +- ) # (batch_size, n_heads, seq_length, key_length) +- attn_weights = nn.functional.dropout( +- attn_weights, p=self.dropout, training=self.training +- ) # (batch_size, n_heads, seq_length, key_length) +- +- # Mask heads if we want to +- if layer_head_mask is not None: +- attn_weights = attn_weights * layer_head_mask +- +- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) +- attn_output = self.o(attn_output) +- +- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None +- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- # print("output_attentions=",output_attentions) + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs +@@ -834,7 +575,7 @@ class T5CrossAttention(T5Attention): + class T5LayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() +- self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) ++ self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + +@@ -921,7 +662,6 @@ class T5Block(nn.Module): + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, +- past_cross_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, +@@ -931,17 +671,15 @@ class T5Block(nn.Module): + logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + +- # if len(past_key_value) != expected_num_past_key_values: +- # raise ValueError( +- # f"There should be {expected_num_past_key_values} past states. " +- # f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" +- # f"Got {len(past_key_value)} past key / value states" +- # ) +- +- self_attn_past_key_value = past_key_value +- # print("self_attn_past_key_value=",self_attn_past_key_value.dtype) +- cross_attn_past_key_value = past_cross_key_value +- # cross_attn_past_key_value = past_key_value[2:] ++ if len(past_key_value) != expected_num_past_key_values: ++ raise ValueError( ++ f"There should be {expected_num_past_key_values} past states. " ++ f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" ++ f"Got {len(past_key_value)} past key / value states" ++ ) ++ ++ self_attn_past_key_value = past_key_value[:2] ++ cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + +@@ -955,8 +693,6 @@ class T5Block(nn.Module): + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] +- # if self.is_decoder: +- # print("present_key_value_state=",present_key_value_state[0].dtype) + attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training +@@ -967,7 +703,7 @@ class T5Block(nn.Module): + torch.finfo(hidden_states.dtype).max, + ) + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) +- present_cross_key_value_state = () ++ + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: + # the actual query length is unknown for cross attention +@@ -1000,10 +736,9 @@ class T5Block(nn.Module): + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states +- # if present_key_value_state is not None: +- # present_key_value_state = present_key_value_state + cross_attention_outputs[1] +- cross_attn_past_key_values = cross_attention_outputs[1] +- # print("cross_attn_past_key_values=",cross_attn_past_key_values) ++ if present_key_value_state is not None: ++ present_key_value_state = present_key_value_state + cross_attention_outputs[1] ++ + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + +@@ -1022,7 +757,7 @@ class T5Block(nn.Module): + outputs = (hidden_states,) + + if use_cache: +- outputs = outputs + (present_key_value_state,) +(cross_attn_past_key_values,)+ attention_outputs ++ outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + +@@ -1162,14 +897,11 @@ class T5PreTrainedModel(PreTrainedModel) + + + class T5Stack(T5PreTrainedModel): +- def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskeyvalue=None): ++ def __init__(self, config, embed_tokens=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder +- self.lm_head=lm_head +- self.encodecrosskeyvalue = encodecrosskeyvalue +- self.model_dim = config.d_model + + self.block = nn.ModuleList( + [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] +@@ -1237,21 +969,19 @@ class T5Stack(T5PreTrainedModel): + def forward( + self, + input_ids=None, ++ attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, +- past_key_values=None, +- past_cross_key_values=None, +- attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, ++ 
past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + # Model parallel +- # print("aaaaaaaaaaaaaaaaa") + if self.model_parallel: + torch.cuda.set_device(self.first_device) + self.embed_tokens = self.embed_tokens.to(self.first_device) +@@ -1291,13 +1021,9 @@ class T5Stack(T5PreTrainedModel): + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") + + # initialize past_key_values with `None` if past does not exist +- #modified +- # if past_key_values is None: +- # past_key_values = [None] * len(self.block) +- #added +- if not self.is_decoder: ++ if past_key_values is None: + past_key_values = [None] * len(self.block) +- past_cross_key_values = [None] * len(self.block) ++ + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + +@@ -1328,10 +1054,7 @@ class T5Stack(T5PreTrainedModel): + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) +- # present_key_value_states = () if use_cache else None +- # present_cross_key_value_states = () if use_cache else None +- present_key_value_states = [] if use_cache else None +- # present_cross_key_value_states = [] if use_cache else None ++ present_key_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None +@@ -1339,10 +1062,8 @@ class T5Stack(T5PreTrainedModel): + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) +- for i, layer_module in enumerate(self.block): +- # for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): +- past_key_value = past_key_values[i] +- past_cross_key_value = past_cross_key_values[i] ++ ++ for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel +@@ -1392,7 +1113,6 @@ class T5Stack(T5PreTrainedModel): + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, + past_key_value=past_key_value, +- past_cross_key_value=past_cross_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -1400,22 +1120,19 @@ class T5Stack(T5PreTrainedModel): + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: +- layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] ++ layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] + +- hidden_states, present_key_value_state, present_cross_key_value_state = layer_outputs[:3] ++ hidden_states, present_key_value_state = layer_outputs[:2] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) +- position_bias = layer_outputs[3] ++ position_bias = layer_outputs[2] + if self.is_decoder and encoder_hidden_states is not None: +- encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] ++ 
encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] + # append next layer key value states + if use_cache: +- present_key_value_states.extend(present_key_value_state) +- # present_cross_key_value_states.extend(present_cross_key_value_state) +- # present_key_value_states = present_key_value_states + (present_key_value_state,) +- # present_cross_key_value_states = present_cross_key_value_states + (present_cross_key_value_state,) ++ present_key_value_states = present_key_value_states + (present_key_value_state,) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) +@@ -1429,52 +1146,31 @@ class T5Stack(T5PreTrainedModel): + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) +- hidden_states = self.dropout(hidden_states).half() ++ hidden_states = self.dropout(hidden_states) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) +- # print("return_dict=",return_dict) ++ + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + present_key_value_states, +- # present_cross_key_value_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] + if v is not None + ) +- present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None +- # present_cross_key_value_states = torch.concat(present_cross_key_value_states).reshape(len(self.block), 2, +- # *present_cross_key_value_states[0].shape) if use_cache else None +- # print("dddddddddddd") +- # if use_cache: +- # print("present_key_value_states.shape=",present_key_value_states.shape,present_key_value_states.dtype) +- # return BaseModelOutputWithPastAndCrossAttentions( +- # last_hidden_state=hidden_states, +- # past_key_values=present_key_value_states, +- # past_cross_key_values=present_cross_key_value_states +- # ) +- if not self.is_decoder and self.encodecrosskeyvalue: +- res = self.encodecrosskeyvalue(hidden_states) +- return tuple((hidden_states, res)) +- # return BaseModelOutputWithPastAndCrossAttentions( +- # last_hidden_state=hidden_states, +- # past_key_values=present_key_value_states, +- # # past_cross_key_values=past_cross_key_values, +- # hidden_states=all_hidden_states, +- # attentions=all_attentions, +- # cross_attentions=all_cross_attentions, +- # ) +- if self.is_decoder: +- if self.config.tie_word_embeddings: +- hidden_states_1 = hidden_states * (self.model_dim ** -0.5) +- lm_logits = self.lm_head(hidden_states_1) +- return tuple((lm_logits, present_key_value_states)) ++ return BaseModelOutputWithPastAndCrossAttentions( ++ last_hidden_state=hidden_states, ++ past_key_values=present_key_value_states, ++ hidden_states=all_hidden_states, ++ attentions=all_attentions, ++ cross_attentions=all_cross_attentions, ++ ) + + + T5_START_DOCSTRING = r""" +@@ -1845,28 +1541,6 @@ class T5Model(T5PreTrainedModel): + ) + + +- +-class EncoderToCrossKeyValue(nn.Module): +- def __init__(self, cross_key, cross_value, num_heads, d_kv): +- super().__init__() +- self.cross_key = cross_key +- self.cross_value = cross_value +- self.num_heads = num_heads +- self.d_kv = d_kv +- +- +- def forward(self, hidden_states): +- batch_size = hidden_states.shape[0] +- encoder_hidden_states_kvs = [] +- for i in range(len(self.cross_value)): +- encoder_hidden_states_kvs.append( +- torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), 
+- self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) +- +- past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0) +- return past_cross_key_values +- +- + @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) + class T5ForConditionalGeneration(T5PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [ +@@ -1874,47 +1548,28 @@ class T5ForConditionalGeneration(T5PreTr + ] + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + +- def __init__(self, config: T5Config, encoder_path=None, decoder_path=None, device_id=0): ++ def __init__(self, config: T5Config): + super().__init__(config) +- self.encoder_path = encoder_path +- self.decoder_path = decoder_path +- if not self.encoder_path or not self.decoder_path: +- self.model_dim = config.d_model +- +- self.shared = nn.Embedding(config.vocab_size, config.d_model) +- +- decoder_config = copy.deepcopy(config) +- decoder_config.is_decoder = True +- decoder_config.is_encoder_decoder = False +- decoder_config.num_layers = config.num_decoder_layers +- +- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) +- self.decoder = T5Stack(decoder_config, self.shared, self.lm_head) +- +- cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) +- cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) +- encodecrosskeyvalue = EncoderToCrossKeyValue(cross_key, cross_value, config.num_heads, config.d_kv) +- +- encoder_config = copy.deepcopy(config) +- encoder_config.is_decoder = False +- encoder_config.use_cache = False +- encoder_config.is_encoder_decoder = False +- self.encoder = T5Stack(encoder_config, self.shared, encodecrosskeyvalue=encodecrosskeyvalue) +- self.encoder_mindie = None +- self.decoder_mindie = None +- if self.encoder_path: +- self.encoder_mindie = torch.jit.load(self.encoder_path) +- if self.decoder_path: +- self.decoder_mindie = torch.jit.load(self.decoder_path) +- self.stream = torch.npu.Stream(f"npu:{device_id}") +- self.device_id = device_id +- +- +- def get_device(self): +- return f"npu:{self.device_id}" ++ self.model_dim = config.d_model ++ ++ self.shared = nn.Embedding(config.vocab_size, config.d_model) ++ ++ encoder_config = copy.deepcopy(config) ++ encoder_config.is_decoder = False ++ encoder_config.use_cache = False ++ encoder_config.is_encoder_decoder = False ++ self.encoder = T5Stack(encoder_config, self.shared) ++ ++ decoder_config = copy.deepcopy(config) ++ decoder_config.is_decoder = True ++ decoder_config.is_encoder_decoder = False ++ decoder_config.num_layers = config.num_decoder_layers ++ self.decoder = T5Stack(decoder_config, self.shared) ++ ++ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + + # Initialize weights and apply final processing +- # self.post_init() ++ self.post_init() + + # Model parallel + self.model_parallel = False +@@ -1993,7 +1648,6 @@ class T5ForConditionalGeneration(T5PreTr + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- past_cross_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] 
= None, +@@ -2041,25 +1695,76 @@ class T5ForConditionalGeneration(T5PreTr + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask +- +- hidden_states = encoder_outputs["last_hidden_state"] +- # import pdb +- # pdb.set_trace() + +- # if self.model_parallel: +- # torch.cuda.set_device(self.decoder.first_device) ++ # Encode if needed (training, first prediction pass) ++ if encoder_outputs is None: ++ # Convert encoder inputs in embeddings if needed ++ encoder_outputs = self.encoder( ++ input_ids=input_ids, ++ attention_mask=attention_mask, ++ inputs_embeds=inputs_embeds, ++ head_mask=head_mask, ++ output_attentions=output_attentions, ++ output_hidden_states=output_hidden_states, ++ return_dict=return_dict, ++ ) ++ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): ++ encoder_outputs = BaseModelOutput( ++ last_hidden_state=encoder_outputs[0], ++ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, ++ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, ++ ) ++ ++ hidden_states = encoder_outputs[0] ++ ++ if self.model_parallel: ++ torch.cuda.set_device(self.decoder.first_device) + + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + +- import time +- start_time = time.time() +- with torch.npu.stream(self.stream): # set stream +- +- decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) +- self.stream.synchronize() # synchronize +- print("time is", time.time() - start_time) ++ # Set device for model parallelism ++ if self.model_parallel: ++ torch.cuda.set_device(self.decoder.first_device) ++ hidden_states = hidden_states.to(self.decoder.first_device) ++ if decoder_input_ids is not None: ++ decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) ++ if attention_mask is not None: ++ attention_mask = attention_mask.to(self.decoder.first_device) ++ if decoder_attention_mask is not None: ++ decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) ++ ++ # Decode ++ decoder_outputs = self.decoder( ++ input_ids=decoder_input_ids, ++ attention_mask=decoder_attention_mask, ++ inputs_embeds=decoder_inputs_embeds, ++ past_key_values=past_key_values, ++ encoder_hidden_states=hidden_states, ++ encoder_attention_mask=attention_mask, ++ head_mask=decoder_head_mask, ++ cross_attn_head_mask=cross_attn_head_mask, ++ use_cache=use_cache, ++ output_attentions=output_attentions, ++ output_hidden_states=output_hidden_states, ++ return_dict=return_dict, ++ ) ++ ++ sequence_output = decoder_outputs[0] ++ ++ # Set device for model parallelism ++ if self.model_parallel: ++ torch.cuda.set_device(self.encoder.first_device) ++ self.lm_head = self.lm_head.to(self.encoder.first_device) ++ sequence_output = sequence_output.to(self.lm_head.weight.device) ++ ++ if self.config.tie_word_embeddings: ++ # Rescale output before projecting on vocab ++ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 ++ sequence_output = sequence_output * (self.model_dim**-0.5) ++ ++ lm_logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: +@@ -2072,17 +1777,23 @@ class T5ForConditionalGeneration(T5PreTr + if not return_dict: + output = 
(lm_logits,) + decoder_outputs[1:] + encoder_outputs + return ((loss,) + output) if loss is not None else output ++ + return Seq2SeqLMOutput( + loss=loss, +- logits=decoder_outputs[0], +- past_key_values=decoder_outputs[1] ++ logits=lm_logits, ++ past_key_values=decoder_outputs.past_key_values, ++ decoder_hidden_states=decoder_outputs.hidden_states, ++ decoder_attentions=decoder_outputs.attentions, ++ cross_attentions=decoder_outputs.cross_attentions, ++ encoder_last_hidden_state=encoder_outputs.last_hidden_state, ++ encoder_hidden_states=encoder_outputs.hidden_states, ++ encoder_attentions=encoder_outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, +- past_cross_key_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, +@@ -2108,7 +1819,6 @@ class T5ForConditionalGeneration(T5PreTr + return { + "decoder_input_ids": input_ids, + "past_key_values": past_key_values, +- "past_cross_key_values": past_cross_key_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, +@@ -2168,9 +1878,6 @@ class T5EncoderModel(T5PreTrainedModel): + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) +- self.encoder_mindie = torch.jit.load(r"/opt/naie/zhenwenqi/T5/encoder/encoder_compiled.pt") +- # self.decoder_mindie = torch.jit.load(r"/opt/naie/zhenwenqi/T5/decoder/decoder_compiled.pt") +- self.stream = torch.npu.Stream("npu:2") + + # Initialize weights and apply final processing + self.post_init() +@@ -2260,6 +1967,7 @@ class T5EncoderModel(T5PreTrainedModel): + >>> last_hidden_states = outputs.last_hidden_state + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict ++ + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch new file mode 100644 index 0000000000..e69de29bb2 diff --git a/MindIE/MindIE-Torch/built-in/T5/utils.patch b/MindIE/MindIE-Torch/built-in/T5/utils.patch new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 59bbe4d5c5e66a61b9bd3065e86c01aa03e6f20f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 05:59:44 +0000 Subject: [PATCH 009/110] add MindIE/MindIE-Torch/built-in/T5. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/T5/readme.md diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From d15ef4b1938a88d76022499f95a03b9dbeec2256 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 06:02:43 +0000 Subject: [PATCH 010/110] update MindIE/MindIE-Torch/built-in/T5/export_t5.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/export_t5.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5.py b/MindIE/MindIE-Torch/built-in/T5/export_t5.py index 2b421aff68..cdb7631c82 100644 --- a/MindIE/MindIE-Torch/built-in/T5/export_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/export_t5.py @@ -17,8 +17,8 @@ def parse_arguments(): parser.add_argument( "--model_path", type=str, - default="./DeepFloyd--t5-v1_1-xxl", - help="encoder model path" + default="./T5-Small", + help="T5 model path" ) parser.add_argument( "--max_batchsize", -- Gitee From 11a6d322502e2b580ae035a5eb7dd1a31d626d2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 06:54:23 +0000 Subject: [PATCH 011/110] add MindIE/MindIE-Torch/built-in/T5. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py diff --git a/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py b/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From fd06db61299ee5ae48455a317c17b4a80962b4f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 06:54:38 +0000 Subject: [PATCH 012/110] update MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../MindIE-Torch/built-in/T5/perf_test_aie.py | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py b/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py index e69de29bb2..97c02916fe 100644 --- a/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py +++ b/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py @@ -0,0 +1,115 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
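+
+# Benchmarks the MindIE-compiled T5 encoder/decoder TorchScript modules:
+# each test loads the compiled .pt module, builds dummy NPU inputs, runs a
+# short warmup, then reports average latency and throughput over 100 timed
+# runs on a dedicated NPU stream.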
+
+
+import time
+import argparse
+import json
+
+import numpy as np
+import torch
+import torch_npu
+import mindietorch
+from tqdm import tqdm
+
+def test_encoder(aie_path, args, device_id = 0):
+    batch_size = args.batch_size
+    device_id = args.device_id
+    seq_len = args.seq_len
+    device = f'npu:{device_id}'
+    stream = torch.npu.Stream(f"npu:{device_id}")
+    print("Start loading ts module...")
+    ts = torch.jit.load(aie_path)
+    print("Ts module loaded.")
+    ts.eval()
+    dummy_input = (torch.ones([batch_size, seq_len], dtype=torch.int64).npu())
+    print("Start inferring...")
+    # warmup
+    for _ in range(10):
+        with torch.npu.stream(stream):
+            ts(dummy_input)
+        stream.synchronize()
+
+    # performance test
+    num_infer = 100
+
+    start = time.time()
+    for _ in tqdm(range(num_infer)):
+        with torch.npu.stream(stream):
+            ts(dummy_input)
+        stream.synchronize()
+    end = time.time()
+    print(f"Encoder latency: {(end - start) / num_infer * 1000:.2f} ms")
+    print(f"Encoder throughput: {num_infer * batch_size / (end - start):.2f} fps")
+
+
+def test_decoder(aie_path, args):
+    batch_size = args.batch_size
+    device_id = args.device_id
+    seq_len = args.seq_len
+    device = f'npu:{device_id}'
+    stream = torch.npu.Stream(f"npu:{device_id}")
+    print("Start loading ts module...")
+    ts = torch.jit.load(aie_path)
+    print("Ts module loaded.")
+    ts.eval()
+    dummy_input = (
+        torch.ones([batch_size, 1], dtype=torch.int64).npu(),
+        torch.randn(batch_size,seq_len,512).to(torch.float16).npu(),
+        torch.ones(batch_size,seq_len, dtype=torch.int64).npu(),
+        torch.randn(6,2,batch_size,8,1,64).to(torch.float16).npu(),
+        torch.randn(6,2,batch_size,8,24,64).to(torch.float16).npu()
+    )
+
+    # warmup
+    for _ in range(10):
+        with torch.npu.stream(stream):
+            ts.forward(dummy_input[0],dummy_input[1],dummy_input[2],dummy_input[3],dummy_input[4])
+        stream.synchronize()
+
+    # performance test
+    num_infer = 100
+    start = time.time()
+    for _ in tqdm(range(num_infer)):
+        with torch.npu.stream(stream):
+            ts.forward(dummy_input[0],dummy_input[1],dummy_input[2],dummy_input[3],dummy_input[4])
+        stream.synchronize()
+    end = time.time()
+
+    print(f"Decoder latency: {(end - start) / num_infer * 1000:.2f} ms")
+    print(f"Decoder throughput: {num_infer * batch_size / (end - start):.2f} fps")
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--encoder_aie_path", type=str, required=True)
+    parser.add_argument("--decoder_aie_path", type=str, required=True)
+    parser.add_argument("--batch_size", type=int, help="batch size", default=1)
+    parser.add_argument("--seq_len", type=int, help="max input sequence length", default=128)
+
+    parser.add_argument("--device_id", type=int, help="NPU device id", default=0)
+
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    torch.npu.set_device(args.device_id)
+    test_encoder(args.encoder_aie_path, args)
+    test_decoder(args.decoder_aie_path, args)
+
+
+if __name__ == "__main__":
+    main()
--
Gitee

From f974576d601c3bdc1820aef67834fd274e3ce2ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Thu, 29 Aug 2024 07:04:47 +0000
Subject: [PATCH 013/110] add MindIE/MindIE-Torch/built-in/T5/main.py.
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/main.py | 43 +++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/main.py diff --git a/MindIE/MindIE-Torch/built-in/T5/main.py b/MindIE/MindIE-Torch/built-in/T5/main.py new file mode 100644 index 0000000000..e1ec51d66a --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/main.py @@ -0,0 +1,43 @@ +import torch +import time +import argparse +import torch_npu +from transformers import T5ForConditionalGeneration, AutoTokenizer, T5Config + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--hf_model_path", type=str, required=True) + + parser.add_argument("--encoder_aie_path", type=str, required=True) + parser.add_argument("--decoder_aie_path", type=str, required=True) + + parser.add_argument("--device_id", type=int, help="NPU device id", default=0) + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + torch.npu.set_device(args.device_id) + tokenizer = AutoTokenizer.from_pretrained(args.hf_model_path) + text = [ + "translate English to German: The house is wonderful.", + "summarize: I am a high-performance inference optimizer and runtime.", + "During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world", + ] + t5_config = T5Config.from_pretrained(args.hf_model_path) + model = T5ForConditionalGeneration(config=t5_config, + encoder_path=args.encoder_aie_path, + decoder_path=args.decoder_aie_path, + device_id=args.device_id).half().npu() + input_ids = tokenizer(text, return_tensors = "pt", padding=True).input_ids + outputs = model.generate(input_ids.npu(),max_new_tokens=24) + start_time = time.time() + outputs = model.generate(input_ids.npu(),max_new_tokens=24) + print("time_cost=", time.time()-start_time) + print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) + + +if __name__ == "__main__": + main() + -- Gitee From 135e51f350773ea57bdce3c5f22b5211589f64de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 07:40:39 +0000 Subject: [PATCH 014/110] add MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/T5_modeling_t5_patch.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py new file mode 100644 index 0000000000..3922ae56a8 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -0,0 +1,28 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import transformers + + +def main(): + transformers_path = transformers.__path__ + transformers_version = transformers.__version__ + + assert transformers_version is not '4.42.0', "expectation transformers==4.42.0" + os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') + + +if __name__ == '__main__': + main() -- Gitee From d691a8c3f840018a71f0fdca845efb1dda030a6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 07:42:31 +0000 Subject: [PATCH 015/110] add MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/T5_modeling_outputs_patch.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py new file mode 100644 index 0000000000..7569722529 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py @@ -0,0 +1,28 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import transformers + + +def main(): + transformers_path = transformers.__path__ + transformers_version = transformers.__version__ + + assert transformers_version is not '4.42.0', "expectation transformers==4.42.0" + os.system(f'patch -p0 {transformers_path[0]}/modeling_outputs.py modeling_outputs.patch') + + +if __name__ == '__main__': + main() -- Gitee From abb3fe7763bd8b36523281114778321dab5ab2a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 07:43:40 +0000 Subject: [PATCH 016/110] add MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/T5_modeling_utils_patch.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py new file mode 100644 index 0000000000..743c7a1f00 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py @@ -0,0 +1,28 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import transformers + + +def main(): + transformers_path = transformers.__path__ + transformers_version = transformers.__version__ + + assert transformers_version is not '4.42.0', "expectation transformers==4.42.0" + os.system(f'patch -p0 {transformers_path[0]}/modeling_utils.py modeling_utils.patch') + + +if __name__ == '__main__': + main() -- Gitee From 43c14e121e44859d51093a03d5340f01ef4c00bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 07:44:32 +0000 Subject: [PATCH 017/110] add MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/T5_utils_patch.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py new file mode 100644 index 0000000000..993a4b6789 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py @@ -0,0 +1,28 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import transformers + + +def main(): + transformers_path = transformers.__path__ + transformers_version = transformers.__version__ + + assert transformers_version is not '4.42.0', "expectation transformers==4.42.0" + os.system(f'patch -p0 {transformers_path[0]}/generation/utils.py utils.patch') + + +if __name__ == '__main__': + main() -- Gitee From 1b7477a08815d7aa77447efc44e2f41b77ba379f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:06:55 +0000 Subject: [PATCH 018/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. 
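One note on the version guard shared by the helper scripts above (T5_modeling_t5_patch.py, T5_modeling_outputs_patch.py, T5_modeling_utils_patch.py, T5_utils_patch.py): `assert transformers_version is not '4.42.0'` tests object identity rather than string content, so it passes for essentially any installed version (recent CPython even emits a SyntaxWarning for `is` against a literal), and it does not check what its message claims. The follow-up commits below first swap the operator and then settle on the equality check the message describes. A minimal illustration of the difference, assuming nothing beyond standard CPython behaviour:

```python
# Identity (`is`) versus equality (`==`) for strings: why the original assert is unreliable.
required = "4.42.0"
installed = "".join(["4.42", ".0"])   # same text, but built at runtime as a distinct object

print(installed == required)          # True: compares the characters
print(installed is required)          # False here: compares object identity, not content

# The check the patch helpers actually intend (matching the later commits):
assert installed == required, "expectation transformers==4.42.0"
```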
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py index 3922ae56a8..6a64343800 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -20,7 +20,7 @@ def main(): transformers_path = transformers.__path__ transformers_version = transformers.__version__ - assert transformers_version is not '4.42.0', "expectation transformers==4.42.0" + assert transformers_version !='4.42.0', "expectation transformers==4.42.0" os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') -- Gitee From b214ebc3fccf5fa553c3b5b81eada31058aadc6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:07:15 +0000 Subject: [PATCH 019/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py index 7569722529..1cbad93665 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py @@ -20,7 +20,7 @@ def main(): transformers_path = transformers.__path__ transformers_version = transformers.__version__ - assert transformers_version is not '4.42.0', "expectation transformers==4.42.0" + assert transformers_version != '4.42.0', "expectation transformers==4.42.0" os.system(f'patch -p0 {transformers_path[0]}/modeling_outputs.py modeling_outputs.patch') -- Gitee From 0b60438572729ee8ed45a9d881ba379534ecaedc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:07:39 +0000 Subject: [PATCH 020/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py index 743c7a1f00..80cba46f9c 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py @@ -20,7 +20,7 @@ def main(): transformers_path = transformers.__path__ transformers_version = transformers.__version__ - assert transformers_version is not '4.42.0', "expectation transformers==4.42.0" + assert transformers_version != '4.42.0', "expectation transformers==4.42.0" os.system(f'patch -p0 {transformers_path[0]}/modeling_utils.py modeling_utils.patch') -- Gitee From be1acb7d45cbbf68bc26f724f6de6a50a967d6ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:08:01 +0000 Subject: [PATCH 021/110] update MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py index 993a4b6789..18b0475a14 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py @@ -20,7 +20,7 @@ def main(): transformers_path = transformers.__path__ transformers_version = transformers.__version__ - assert transformers_version is not '4.42.0', "expectation transformers==4.42.0" + assert transformers_version != '4.42.0', "expectation transformers==4.42.0" os.system(f'patch -p0 {transformers_path[0]}/generation/utils.py utils.patch') -- Gitee From 249cb2430953c3284fc718ef5c8584c87850f307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:12:00 +0000 Subject: [PATCH 022/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py index 1cbad93665..21cd251b95 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py @@ -20,7 +20,7 @@ def main(): transformers_path = transformers.__path__ transformers_version = transformers.__version__ - assert transformers_version != '4.42.0', "expectation transformers==4.42.0" + assert transformers_version == '4.42.0', "expectation transformers==4.42.0" os.system(f'patch -p0 {transformers_path[0]}/modeling_outputs.py modeling_outputs.patch') -- Gitee From 67ba1eac70c2ac44bf8bf4626c78857922a14dfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:12:18 +0000 Subject: [PATCH 023/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py index 6a64343800..e304f4f9f2 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -20,7 +20,7 @@ def main(): transformers_path = transformers.__path__ transformers_version = transformers.__version__ - assert transformers_version !='4.42.0', "expectation transformers==4.42.0" + assert transformers_version =='4.42.0', "expectation transformers==4.42.0" os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') -- Gitee From e6e0fd40a9de20e813cd3b7728b4664436f374b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:12:33 +0000 Subject: [PATCH 024/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py index 80cba46f9c..b3ad7bc20b 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py @@ -20,7 +20,7 @@ def main(): transformers_path = transformers.__path__ transformers_version = transformers.__version__ - assert transformers_version != '4.42.0', "expectation transformers==4.42.0" + assert transformers_version == '4.42.0', "expectation transformers==4.42.0" os.system(f'patch -p0 {transformers_path[0]}/modeling_utils.py modeling_utils.patch') -- Gitee From 13ead6a9d36c959d87c8ce113c29439685dafea6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:12:56 +0000 Subject: [PATCH 025/110] update MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py index 18b0475a14..046b6e6b85 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py @@ -20,7 +20,7 @@ def main(): transformers_path = transformers.__path__ transformers_version = transformers.__version__ - assert transformers_version != '4.42.0', "expectation transformers==4.42.0" + assert transformers_version == '4.42.0', "expectation transformers==4.42.0" os.system(f'patch -p0 {transformers_path[0]}/generation/utils.py utils.patch') -- Gitee From 37a9a62070eb6df465715e4bf19c726f60b922b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:31:08 +0000 Subject: [PATCH 026/110] update MindIE/MindIE-Torch/built-in/T5/utils.patch. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/utils.patch | 103 ++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/utils.patch b/MindIE/MindIE-Torch/built-in/T5/utils.patch index e69de29bb2..811327bbc6 100644 --- a/MindIE/MindIE-Torch/built-in/T5/utils.patch +++ b/MindIE/MindIE-Torch/built-in/T5/utils.patch @@ -0,0 +1,103 @@ +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/generation/utils.py 2024-08-29 11:22:09.280000000 +0800 ++++ utils.py 2024-08-29 16:28:18.360000000 +0800 +@@ -507,7 +507,7 @@ class GenerationMixin: + generation_config: GenerationConfig, + ) -> Dict[str, Any]: + # 1. get encoder +- encoder = self.encoder_mindie ++ encoder = self.get_encoder() + # Compatibility with Accelerate big model inference: we need the encoder to outputs stuff on the same device + # as the inputs. 
+ if hasattr(self, "hf_device_map"): +@@ -523,12 +523,12 @@ class GenerationMixin: + for argument, value in model_kwargs.items() + if not any(argument.startswith(p) for p in irrelevant_prefix) + } +- # encoder_signature = set(inspect.signature(encoder.forward).parameters) +- # encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature +- # if not encoder_accepts_wildcard: +- # encoder_kwargs = { +- # argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature +- # } ++ encoder_signature = set(inspect.signature(encoder.forward).parameters) ++ encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature ++ if not encoder_accepts_wildcard: ++ encoder_kwargs = { ++ argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature ++ } + encoder_kwargs["output_attentions"] = generation_config.output_attentions + encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states + +@@ -536,13 +536,8 @@ class GenerationMixin: + model_input_name = model_input_name if model_input_name is not None else self.main_input_name + encoder_kwargs["return_dict"] = True + encoder_kwargs[model_input_name] = inputs_tensor +- with torch.npu.stream(self.stream): # set stream +- encoder_outputs=encoder.forward(encoder_kwargs["input_ids"]) +- self.stream.synchronize() # synchronize +- model_kwargs["encoder_outputs"]: ModelOutput = {"last_hidden_state":encoder_outputs[0], "past_cross_key_values":encoder_outputs[1]} +- # import pdb +- # pdb.set_trace() +- # print("encoder_finished") ++ model_kwargs["encoder_outputs"]: ModelOutput = encoder(**encoder_kwargs) ++ + return model_kwargs + + def _prepare_decoder_input_ids_for_generation( +@@ -667,9 +662,6 @@ class GenerationMixin: + outputs, standardize_cache_format=standardize_cache_format + ) + model_kwargs[cache_name] = cache +- if "past_cross_key_values" in outputs: +- past_cross_key_values = outputs.past_cross_key_values +- model_kwargs["past_cross_key_values"] = past_cross_key_values + if getattr(outputs, "state", None) is not None: + model_kwargs["state"] = outputs.state + +@@ -1801,16 +1793,16 @@ class GenerationMixin: + "`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1." + ) + +- # if self.device.type != input_ids.device.type: +- # warnings.warn( +- # "You are calling .generate() with the `input_ids` being on a device type different" +- # f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model" +- # f" is on {self.device.type}. You may experience unexpected behaviors or slower generation." +- # " Please make sure that you have put `input_ids` to the" +- # f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before" +- # " running `.generate()`.", +- # UserWarning, +- # ) ++ if self.device.type != input_ids.device.type: ++ warnings.warn( ++ "You are calling .generate() with the `input_ids` being on a device type different" ++ f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model" ++ f" is on {self.device.type}. You may experience unexpected behaviors or slower generation." ++ " Please make sure that you have put `input_ids` to the" ++ f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before" ++ " running `.generate()`.", ++ UserWarning, ++ ) + + # 8. 
prepare distribution pre_processing samplers + prepared_logits_processor = self._get_logits_processor( +@@ -2650,10 +2642,7 @@ class GenerationMixin: + this_peer_finished = False + unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) + model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) +- num_layers = self.config.num_layers +- num_heads = self.config.num_heads +- d_kv = self.config.d_kv +- model_kwargs["past_key_values"] = torch.randn(num_layers, 2, batch_size, num_heads, 0, d_kv).half().npu() ++ + while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) +@@ -2711,7 +2700,6 @@ class GenerationMixin: + + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) +- # print("aaaa",input_ids) + if streamer is not None: + streamer.put(next_tokens.cpu()) + model_kwargs = self._update_model_kwargs_for_generation( -- Gitee From 91efc78b45a04cd938bef53754b96c96eec5fa50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:56:07 +0000 Subject: [PATCH 027/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/.keep?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/T5/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/.keep diff --git a/MindIE/MindIE-Torch/built-in/T5/.keep b/MindIE/MindIE-Torch/built-in/T5/.keep deleted file mode 100644 index e69de29bb2..0000000000 -- Gitee From 0442dd26898430ca897579a363ccacef0220313a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 11:38:36 +0000 Subject: [PATCH 028/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. 
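For reference, the *.patch files in this directory are applied by the helper scripts shown earlier, which shell out to patch(1) against the installed transformers package; T5_utils_patch.py, for example, patches transformers/generation/utils.py with the utils.patch above. Done by hand, the equivalent command would be roughly the following, with the site-packages location resolved at run time rather than hard-coded:

```bash
# Rough manual equivalent of T5_utils_patch.py; run from the T5 directory that holds utils.patch.
TRANSFORMERS_DIR="$(python3 -c 'import os, transformers; print(os.path.dirname(transformers.__file__))')"
patch -p0 "${TRANSFORMERS_DIR}/generation/utils.py" utils.patch
```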
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 111 ++++++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index e69de29bb2..e9bf20bf51 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -0,0 +1,111 @@ +# T5模型-推理指导 + + +- [概述](#ZH-CN_TOPIC_0000001172161501) + + - [输入输出数据](#section540883920406) + +- [推理环境准备](#ZH-CN_TOPIC_0000001126281702) + +- [快速上手](#ZH-CN_TOPIC_0000001126281700) + + - [获取源码](#section4622531142816) + - [准备数据集](#section183221994411) + - [模型推理](#section741711594517) + +- [模型推理性能](#ZH-CN_TOPIC_0000001172201573) + + +# 概述 + + T5的全称为Text to Text Transfer Transformer,是谷歌提出的预训练语言模型领域的通用模型,该模型将所有自然语言问题都转化成文本到文本的形式,并用一个统一的模型解决.T5最核心的理念是:使用前缀任务声明及文本答案生成,统一所有自然语言处理任务的输入和输出。在此之前的几乎所有预训练语言模型,在下游任务微调过程中都需要添加非线性层,将模型的输出转化为任务指定的输出格式。T5不需要对模型做任何改动,只需要提供下游任务的微调数据;不需要添加任何非线性层,唯一需要做的就是在输入数据前加上任务声明前缀.T5将自然语言处理任务都转化成几乎一致的格式,即输入是带有任务前缀声明的文本序列,输出的文本序列是相应任务的结果 + + +## 输入输出数据 + +- 输入数据 + + | 输入数据 | 大小 | 数据类型 | 数据排布格式 | + | -------- | -------- | -------- | ------------ | + | input | batchsize x input_seq_len | FLOAT16 | NHWC | + + +- 输出数据 + + | 输出数据 | 大小 | 数据类型 | 数据排布格式 | + | -------- | -------- | -------- | ------------ | + | output | batchsize x input_seq_len | INT32 | NTHWC | + + +# 推理环境准备 + +- 该模型需要以下插件与驱动 + + **表 1** 版本配套表 +- + | 配套 | 版本 | 备注 | + | ------------------------------------------------------------ |--------| ------------------------------------------------------------ | + | Python | 3.10.2 | - | + | torch | 2.1.0 | 导出pt模型所需版本 | + | torch_npu | 2.1.0 | 模型编译和推理所需版本 | + + +# 快速上手 + +## 获取源码 + +1. 安装transformers4.42.0版本。 + ```bash + pip3 install transformers==4.42.0 + ``` + +2. 安装mindie包,需要与torch_npu配合使用,请参考mindietorch配套torch_npu配置环境 + + ```bash + # 安装mindie + chmod +x ./Ascend-mindie_xxx.run + ./Ascend-mindie_xxx.run --install + source /usr/local/Ascend/mindie/set_env.sh + ``` + +3. 代码修改,在T5目录下 + + 执行命令: + + ```bash + python T5_modeling_outputs_patch.py + ``` + + ```bash + python T5_modeling_t5_patch.py + ``` + + ```bash + python T5_modeling_utils_patch.py + ``` + ```bash + python T5_utils_patch.py + ``` +4.导出mindietorch模型 + ```bash + python export_t5.py --output_dir {output_path} --model_path {model_path} --max_batchsize {max_batchsize} --max_input_seq_len {max_input_seq_len} --device_id {device_id} + ``` +参数说明: +{output_path}是输出的目录 +{model_path}模型所在目录 +{max_batchsize}推理过程中最大的batchsize +{max_input_seq_len}推理过程中最大输入长度 +{device_id} 用哪个npu device + +运行该命令后会自动生成encoder和decoder优化后的模型 + +5.运行 + ```bash +python main.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path} --decoder_aie_path {decoder_aie_path} --device_id 2 +``` + +参数说明: +{model_path}模型所在目录 +{encoder_aie_path}优化后的encoder的模型路径,要具体到.pt文件 +{decoder_aie_path}优化后的decoder的模型路径,要具体到.pt文件 +{device_id} 用哪个npu device \ No newline at end of file -- Gitee From dc2a507c211359dabdff9b8b91c91881caf236fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 11:45:23 +0000 Subject: [PATCH 029/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. 
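Condensed from the readme above, an end-to-end run comes down to the following sequence; the model directory, output directory, batch size and device id are illustrative placeholders rather than values fixed by this patch set:

```bash
# Illustrative values only; substitute your own model path, output dir and NPU device id.
pip3 install transformers==4.42.0

# Patch the installed transformers sources (run from the T5 directory).
python T5_modeling_outputs_patch.py
python T5_modeling_t5_patch.py
python T5_modeling_utils_patch.py
python T5_utils_patch.py

# Trace and compile the encoder/decoder with MindIE-Torch.
python export_t5.py --output_dir ./models --model_path ./T5-Small \
    --max_batchsize 8 --max_input_seq_len 256 --device_id 0

# Run generation through the compiled modules.
python main.py --hf_model_path ./T5-Small \
    --encoder_aie_path ./models/encoder/encoder_compiled.pt \
    --decoder_aie_path ./models/decoder/decoder_compiled.pt \
    --device_id 0
```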
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index e9bf20bf51..f518880708 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -9,11 +9,8 @@ - [快速上手](#ZH-CN_TOPIC_0000001126281700) - - [获取源码](#section4622531142816) - - [准备数据集](#section183221994411) - [模型推理](#section741711594517) -- [模型推理性能](#ZH-CN_TOPIC_0000001172201573) # 概述 @@ -52,7 +49,6 @@ # 快速上手 -## 获取源码 1. 安装transformers4.42.0版本。 ```bash -- Gitee From 47fcbb80564196028f32657e053d2974077aae03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 2 Sep 2024 10:33:32 +0000 Subject: [PATCH 030/110] update MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_utils.patch | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch index e69de29bb2..1b9fef8cd2 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch @@ -0,0 +1,41 @@ +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/modeling_utils.py 2024-08-28 20:15:38.524000000 +0800 ++++ modeling_utils.py 2024-09-02 17:29:43.700000000 +0800 +@@ -975,7 +975,7 @@ class ModuleUtilsMixin: + `torch.device`: The device on which the module is (assuming that all the module parameters are on the same + device). + """ +- return self.get_device() ++ return get_parameter_device(self) + + @property + def dtype(self) -> torch.dtype: +@@ -1004,8 +1004,7 @@ class ModuleUtilsMixin: + # encoder_extended_attention_mask = (encoder_extended_attention_mask == + # encoder_extended_attention_mask.transpose(-1, -2)) + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility +- #encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min +- encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 ++ encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min + + return encoder_extended_attention_mask + +@@ -1019,9 +1018,7 @@ class ModuleUtilsMixin: + device = attention_mask.device + batch_size, seq_length = input_shape + seq_ids = torch.arange(seq_length, device=device) +- # print("seq_ids=",seq_ids) + causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] +- # print("causal_mask=",causal_mask) + # in case past_key_values are used we need to add a prefix ones mask to the causal mask + # causal and attention masks must have same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) +@@ -1088,8 +1085,7 @@ class ModuleUtilsMixin: + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
+ extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility +- #extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min +- extended_attention_mask = (1.0 - extended_attention_mask) * -1000 ++ extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min + return extended_attention_mask + + def get_head_mask( -- Gitee From a4e9fb7f5a7ac2dfc51d8ff0a5fdb0b7defc5601 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 2 Sep 2024 10:35:18 +0000 Subject: [PATCH 031/110] update MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch index e69de29bb2..6c99414a69 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch @@ -0,0 +1,10 @@ +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/modeling_outputs.py 2024-08-28 19:20:22.112000000 +0800 ++++ modeling_outputs.py 2024-09-02 18:32:37.720000000 +0800 +@@ -282,7 +282,6 @@ class BaseModelOutputWithPastAndCrossAtt + + last_hidden_state: torch.FloatTensor = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None +- past_cross_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None -- Gitee From 139447e492458113aad3e3d96bd415af96a31c55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 4 Sep 2024 11:14:12 +0000 Subject: [PATCH 032/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch. 
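For context on the modeling_utils.patch hunks above: ModuleUtilsMixin turns the 0/1 attention mask into an additive bias that is summed with the raw attention scores before the softmax, so masked positions need a very large negative fill value; the two variants visible in the hunk differ only in that fill value (torch.finfo(dtype).min versus a flat -1000). A small self-contained sketch of the mechanism, with made-up scores:

```python
import torch

# 1 = attend, 0 = mask out; broadcast (batch, seq) -> (batch, 1, 1, seq) as an additive bias.
attention_mask = torch.tensor([[1, 1, 0]])
dtype = torch.float16

extended = attention_mask[:, None, None, :].to(dtype)
extended = (1.0 - extended) * torch.finfo(dtype).min   # 0 where kept, -65504 where masked

scores = torch.zeros(1, 1, 1, 3)                       # placeholder attention scores
probs = torch.softmax(scores + extended.float(), dim=-1)
print(probs)   # the masked position receives (near) zero attention weight
```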
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_t5.patch | 315 +++--------------- 1 file changed, 46 insertions(+), 269 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch index 4a376cf5eb..40920ac007 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch @@ -1,5 +1,5 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py 2024-08-29 11:11:23.852000000 +0800 -+++ modeling_t5.py 2024-08-29 11:19:34.572000000 +0800 +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py 2024-09-04 19:03:55.080000000 +0800 ++++ modling_t5.py 2024-09-04 19:04:47.048000000 +0800 @@ -23,8 +23,6 @@ from typing import List, Optional, Tuple import torch from torch import nn @@ -9,15 +9,6 @@ from ...activations import ACT2FN from ...modeling_outputs import ( -@@ -246,7 +244,7 @@ class T5LayerNorm(nn.Module): - - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) -- # print("self.weight.dtype=",self.weight.dtype) -+ - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) @@ -451,7 +449,6 @@ class T5Attention(nn.Module): key_value_states=None, position_bias=None, @@ -44,11 +35,11 @@ if key_value_states is None: # self-attn # (batch_size, n_heads, key_length, dim_per_head) -@@ -571,261 +566,7 @@ class T5Attention(nn.Module): +@@ -571,133 +566,16 @@ class T5Attention(nn.Module): present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- # print("output_attentions=",output_attentions) +- - if output_attentions: - outputs = outputs + (attn_weights,) - return outputs @@ -85,7 +76,6 @@ - f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" - ) - real_seq_length += past_key_value[0].shape[2] -- # print("key_value_states=",real_seq_length) - key_length = real_seq_length - - def shape(states): @@ -118,7 +108,6 @@ - value_states = project( - hidden_states, self.v, past_key_value[1] if past_key_value is not None else None - ) -- # print("key_states=",hidden_states.dtype,key_states.dtype) - # compute scores - scores = torch.matmul( - query_states, key_states.transpose(3, 2) @@ -141,143 +130,7 @@ - - if mask is not None: - position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) -- -- if self.pruned_heads: -- mask = torch.ones(position_bias.shape[1]) -- mask[list(self.pruned_heads)] = 0 -- position_bias_masked = position_bias[:, mask.bool()] -- else: -- position_bias_masked = position_bias -- -- scores += position_bias_masked -- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( -- scores -- ) # (batch_size, n_heads, seq_length, key_length) -- attn_weights = nn.functional.dropout( -- attn_weights, p=self.dropout, training=self.training -- ) # (batch_size, n_heads, seq_length, key_length) -- -- # Mask heads if we want to -- if layer_head_mask is not None: -- attn_weights = attn_weights * layer_head_mask -- -- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) -- attn_output = self.o(attn_output) -- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None -- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- # print("output_attentions=",output_attentions) -- if output_attentions: -- outputs = outputs + (attn_weights,) -- return outputs -- -- --class T5CrossAttention(T5Attention): -- def __init__(self, config: T5Config, has_relative_attention_bias=False): -- super().__init__(config, has_relative_attention_bias) -- -- def forward( -- self, -- hidden_states, -- mask=None, -- key_value_states=None, -- position_bias=None, -- past_cross_key_value=None, -- layer_head_mask=None, -- query_length=None, -- use_cache=False, -- output_attentions=False, -- ): -- """ -- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). -- """ -- # Input is (batch_size, seq_length, dim) -- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) -- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) -- batch_size, seq_length = hidden_states.shape[:2] -- -- real_seq_length = seq_length -- -- if past_key_value is not None: -- if past_key_value.shape[0] != 2: -- # if len(past_key_value) != 2: -- raise ValueError( -- f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" -- ) -- real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length -- # print("key_value_states=",key_value_states, real_seq_length) -- key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] -- -- def shape(states): -- """projection""" -- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) -- -- def unshape(states): -- """reshape""" -- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) -- -- def project(hidden_states, proj_layer, key_value_states, past_key_value): -- """projects hidden states correctly to key/query states""" -- if key_value_states is None: -- # self-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(hidden_states)) -- elif past_key_value is None: -- # cross-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(key_value_states)) -- -- if past_key_value is not None: -- if key_value_states is None: -- # self-attn -- # (batch_size, n_heads, key_length, dim_per_head) -- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) -- # print("hidden_states=",hidden_states.shape) -- elif past_key_value.shape[2] != key_value_states.shape[1]: -- # checking that the `sequence_length` of the `past_key_value` is the same as -- # the provided `key_value_states` to support prefix tuning -- # cross-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(key_value_states)) -- else: -- # cross-attn -- hidden_states = past_key_value -- return hidden_states -- -- # get query states -- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) -- -- # get key/value states -- key_states = project( -- hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None -- ) -- value_states = project( -- hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None -- ) -- -- # compute scores -- scores = torch.matmul( -- query_states, key_states.transpose(3, 2) -- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 -- -- if position_bias is None: -- if not self.has_relative_attention_bias: -- position_bias = torch.zeros( -- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype -- ) -- if self.gradient_checkpointing and self.training: -- position_bias.requires_grad = True -- else: -- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) -- -- # if key and values are already calculated -- # we want only the last query position bias -- if past_key_value is not None: -- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] -- -- if mask is not None: -- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) -- - if self.pruned_heads: - mask = torch.ones(position_bias.shape[1]) - mask[list(self.pruned_heads)] = 0 @@ -302,11 +155,13 @@ - - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- # print("output_attentions=",output_attentions) if output_attentions: outputs = outputs + (attn_weights,) return outputs -@@ -834,7 +575,7 @@ class T5CrossAttention(T5Attention): + + +- +- class T5LayerSelfAttention(nn.Module): def __init__(self, 
config, has_relative_attention_bias=False): super().__init__() @@ -315,7 +170,7 @@ self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) -@@ -921,7 +662,6 @@ class T5Block(nn.Module): +@@ -784,7 +662,6 @@ class T5Block(nn.Module): layer_head_mask=None, cross_attn_layer_head_mask=None, past_key_value=None, @@ -323,7 +178,7 @@ use_cache=False, output_attentions=False, return_dict=True, -@@ -931,17 +671,15 @@ class T5Block(nn.Module): +@@ -794,15 +671,15 @@ class T5Block(nn.Module): logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 @@ -333,56 +188,32 @@ - # f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" - # f"Got {len(past_key_value)} past key / value states" - # ) -- -- self_attn_past_key_value = past_key_value -- # print("self_attn_past_key_value=",self_attn_past_key_value.dtype) -- cross_attn_past_key_value = past_cross_key_value -- # cross_attn_past_key_value = past_key_value[2:] + if len(past_key_value) != expected_num_past_key_values: + raise ValueError( + f"There should be {expected_num_past_key_values} past states. " + f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" + f"Got {len(past_key_value)} past key / value states" + ) -+ + +- self_attn_past_key_value = past_key_value +- cross_attn_past_key_value = past_cross_key_value + self_attn_past_key_value = past_key_value[:2] + cross_attn_past_key_value = past_key_value[2:] else: self_attn_past_key_value, cross_attn_past_key_value = None, None -@@ -955,8 +693,6 @@ class T5Block(nn.Module): - output_attentions=output_attentions, - ) - hidden_states, present_key_value_state = self_attention_outputs[:2] -- # if self.is_decoder: -- # print("present_key_value_state=",present_key_value_state[0].dtype) - attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights - - # clamp inf values to enable fp16 training -@@ -967,7 +703,7 @@ class T5Block(nn.Module): - torch.finfo(hidden_states.dtype).max, - ) - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) -- present_cross_key_value_state = () -+ - do_cross_attention = self.is_decoder and encoder_hidden_states is not None - if do_cross_attention: - # the actual query length is unknown for cross attention -@@ -1000,10 +736,9 @@ class T5Block(nn.Module): +@@ -859,7 +736,9 @@ class T5Block(nn.Module): hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) # Combine self attn and cross attn key value states -- # if present_key_value_state is not None: -- # present_key_value_state = present_key_value_state + cross_attention_outputs[1] - cross_attn_past_key_values = cross_attention_outputs[1] -- # print("cross_attn_past_key_values=",cross_attn_past_key_values) + if present_key_value_state is not None: + present_key_value_state = present_key_value_state + cross_attention_outputs[1] + # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] -@@ -1022,7 +757,7 @@ class T5Block(nn.Module): +@@ -878,7 +757,7 @@ class T5Block(nn.Module): outputs = (hidden_states,) if use_cache: @@ -391,7 +222,7 @@ else: outputs = outputs + attention_outputs -@@ -1162,14 +897,11 @@ class T5PreTrainedModel(PreTrainedModel) +@@ -1018,14 +897,11 @@ class T5PreTrainedModel(PreTrainedModel) class 
T5Stack(T5PreTrainedModel): @@ -407,7 +238,7 @@ self.block = nn.ModuleList( [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -1237,21 +969,19 @@ class T5Stack(T5PreTrainedModel): +@@ -1093,14 +969,13 @@ class T5Stack(T5PreTrainedModel): def forward( self, input_ids=None, @@ -424,21 +255,10 @@ use_cache=None, output_attentions=None, output_hidden_states=None, - return_dict=None, - ): - # Model parallel -- # print("aaaaaaaaaaaaaaaaa") - if self.model_parallel: - torch.cuda.set_device(self.first_device) - self.embed_tokens = self.embed_tokens.to(self.first_device) -@@ -1291,13 +1021,9 @@ class T5Stack(T5PreTrainedModel): +@@ -1146,9 +1021,9 @@ class T5Stack(T5PreTrainedModel): raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") # initialize past_key_values with `None` if past does not exist -- #modified -- # if past_key_values is None: -- # past_key_values = [None] * len(self.block) -- #added - if not self.is_decoder: + if past_key_values is None: past_key_values = [None] * len(self.block) @@ -447,19 +267,16 @@ if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) -@@ -1328,10 +1054,7 @@ class T5Stack(T5PreTrainedModel): +@@ -1179,7 +1054,7 @@ class T5Stack(T5PreTrainedModel): # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) -- # present_key_value_states = () if use_cache else None -- # present_cross_key_value_states = () if use_cache else None - present_key_value_states = [] if use_cache else None -- # present_cross_key_value_states = [] if use_cache else None + present_key_value_states = () if use_cache else None all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1339,10 +1062,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1187,10 +1062,8 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) @@ -472,7 +289,7 @@ layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] # Model parallel -@@ -1392,7 +1113,6 @@ class T5Stack(T5PreTrainedModel): +@@ -1240,7 +1113,6 @@ class T5Stack(T5PreTrainedModel): layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, past_key_value=past_key_value, @@ -480,7 +297,7 @@ use_cache=use_cache, output_attentions=output_attentions, ) -@@ -1400,22 +1120,19 @@ class T5Stack(T5PreTrainedModel): +@@ -1248,19 +1120,19 @@ class T5Stack(T5PreTrainedModel): # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: @@ -501,14 +318,11 @@ # append next layer key value states if use_cache: - present_key_value_states.extend(present_key_value_state) -- # present_cross_key_value_states.extend(present_cross_key_value_state) -- # present_key_value_states = present_key_value_states + (present_key_value_state,) -- # present_cross_key_value_states = present_cross_key_value_states + (present_cross_key_value_state,) + present_key_value_states = present_key_value_states + (present_key_value_state,) if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) -@@ -1429,52 +1146,31 @@ class 
T5Stack(T5PreTrainedModel): +@@ -1274,7 +1146,7 @@ class T5Stack(T5PreTrainedModel): hidden_states = hidden_states.to("cuda:" + str(k + 1)) hidden_states = self.final_layer_norm(hidden_states) @@ -517,48 +331,20 @@ # Add last layer if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) -- # print("return_dict=",return_dict) -+ - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - present_key_value_states, -- # present_cross_key_value_states, - all_hidden_states, - all_attentions, - all_cross_attentions, +@@ -1292,17 +1164,13 @@ class T5Stack(T5PreTrainedModel): ] if v is not None ) - present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None -- # present_cross_key_value_states = torch.concat(present_cross_key_value_states).reshape(len(self.block), 2, -- # *present_cross_key_value_states[0].shape) if use_cache else None -- # print("dddddddddddd") -- # if use_cache: -- # print("present_key_value_states.shape=",present_key_value_states.shape,present_key_value_states.dtype) -- # return BaseModelOutputWithPastAndCrossAttentions( -- # last_hidden_state=hidden_states, -- # past_key_values=present_key_value_states, -- # past_cross_key_values=present_cross_key_value_states -- # ) - if not self.is_decoder and self.encodecrosskeyvalue: - res = self.encodecrosskeyvalue(hidden_states) - return tuple((hidden_states, res)) -- # return BaseModelOutputWithPastAndCrossAttentions( -- # last_hidden_state=hidden_states, -- # past_key_values=present_key_value_states, -- # # past_cross_key_values=past_cross_key_values, -- # hidden_states=all_hidden_states, -- # attentions=all_attentions, -- # cross_attentions=all_cross_attentions, -- # ) +- lm_logits = None - if self.is_decoder: +- #logits = None - if self.config.tie_word_embeddings: -- hidden_states_1 = hidden_states * (self.model_dim ** -0.5) -- lm_logits = self.lm_head(hidden_states_1) +- hidden_states = hidden_states * (self.model_dim ** -0.5) +- lm_logits = self.lm_head(hidden_states) - return tuple((lm_logits, present_key_value_states)) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, @@ -570,7 +356,7 @@ T5_START_DOCSTRING = r""" -@@ -1845,28 +1541,6 @@ class T5Model(T5PreTrainedModel): +@@ -1673,31 +1541,6 @@ class T5Model(T5PreTrainedModel): ) @@ -587,11 +373,14 @@ - def forward(self, hidden_states): - batch_size = hidden_states.shape[0] - encoder_hidden_states_kvs = [] +- # for i in range(len(self.cross_value)): +- # encoder_hidden_states_kvs.append( +- # torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), +- # self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) - for i in range(len(self.cross_value)): - encoder_hidden_states_kvs.append( -- torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), -- self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) -- +- torch.stack((self.cross_key[i](hidden_states), +- self.cross_value[i](hidden_states)), dim=0)) - past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0) - return past_cross_key_values - @@ -599,7 +388,7 @@ @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) class T5ForConditionalGeneration(T5PreTrainedModel): 
_keys_to_ignore_on_load_unexpected = [ -@@ -1874,47 +1548,28 @@ class T5ForConditionalGeneration(T5PreTr +@@ -1705,47 +1548,28 @@ class T5ForConditionalGeneration(T5PreTr ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] @@ -666,7 +455,7 @@ # Model parallel self.model_parallel = False -@@ -1993,7 +1648,6 @@ class T5ForConditionalGeneration(T5PreTr +@@ -1824,7 +1648,6 @@ class T5ForConditionalGeneration(T5PreTr cross_attn_head_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, @@ -674,14 +463,13 @@ inputs_embeds: Optional[torch.FloatTensor] = None, decoder_inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, -@@ -2041,25 +1695,76 @@ class T5ForConditionalGeneration(T5PreTr +@@ -1872,23 +1695,76 @@ class T5ForConditionalGeneration(T5PreTr if self.config.num_layers == self.config.num_decoder_layers: warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) decoder_head_mask = head_mask - - hidden_states = encoder_outputs["last_hidden_state"] -- # import pdb -- # pdb.set_trace() +- past_cross_key_values = encoder_outputs["past_cross_key_values"] - # if self.model_parallel: - # torch.cuda.set_device(self.decoder.first_device) @@ -713,13 +501,12 @@ # get decoder inputs from shifting lm labels to the right decoder_input_ids = self._shift_right(labels) -- import time -- start_time = time.time() - with torch.npu.stream(self.stream): # set stream -- -- decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) +- # import pdb +- # pdb.set_trace() +- decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) - self.stream.synchronize() # synchronize -- print("time is", time.time() - start_time) +- # decoder_outputs = self.decoder(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) @@ -764,7 +551,7 @@ loss = None if labels is not None: -@@ -2072,17 +1777,23 @@ class T5ForConditionalGeneration(T5PreTr +@@ -1901,17 +1777,23 @@ class T5ForConditionalGeneration(T5PreTr if not return_dict: output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs return ((loss,) + output) if loss is not None else output @@ -791,7 +578,7 @@ attention_mask=None, head_mask=None, decoder_head_mask=None, -@@ -2108,7 +1819,6 @@ class T5ForConditionalGeneration(T5PreTr +@@ -1937,7 +1819,6 @@ class T5ForConditionalGeneration(T5PreTr return { "decoder_input_ids": input_ids, "past_key_values": past_key_values, @@ -799,17 +586,7 @@ "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "head_mask": head_mask, -@@ -2168,9 +1878,6 @@ class T5EncoderModel(T5PreTrainedModel): - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False - self.encoder = T5Stack(encoder_config, self.shared) -- self.encoder_mindie = torch.jit.load(r"/opt/naie/zhenwenqi/T5/encoder/encoder_compiled.pt") -- # self.decoder_mindie = torch.jit.load(r"/opt/naie/zhenwenqi/T5/decoder/decoder_compiled.pt") -- self.stream = torch.npu.Stream("npu:2") - - # Initialize weights and apply final processing - self.post_init() -@@ -2260,6 +1967,7 @@ class T5EncoderModel(T5PreTrainedModel): +@@ -2086,6 +1967,7 @@ class 
T5EncoderModel(T5PreTrainedModel): >>> last_hidden_states = outputs.last_hidden_state ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict -- Gitee From 09425109e1d96ca2aaef3f9c2ff50799c4818da2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 4 Sep 2024 11:15:02 +0000 Subject: [PATCH 033/110] update MindIE/MindIE-Torch/built-in/T5/utils.patch. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/utils.patch | 49 ++++++++++++--------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/utils.patch b/MindIE/MindIE-Torch/built-in/T5/utils.patch index 811327bbc6..4968e30c2b 100644 --- a/MindIE/MindIE-Torch/built-in/T5/utils.patch +++ b/MindIE/MindIE-Torch/built-in/T5/utils.patch @@ -1,15 +1,18 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/generation/utils.py 2024-08-29 11:22:09.280000000 +0800 -+++ utils.py 2024-08-29 16:28:18.360000000 +0800 -@@ -507,7 +507,7 @@ class GenerationMixin: +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/generation/utils.py 2024-09-04 17:07:15.776000000 +0800 ++++ utils.py 2024-09-04 19:05:05.300000000 +0800 +@@ -507,10 +507,7 @@ class GenerationMixin: generation_config: GenerationConfig, ) -> Dict[str, Any]: # 1. get encoder -- encoder = self.encoder_mindie +- if self.encoder_mindie: +- encoder = self.encoder_mindie +- else: +- encoder = self.get_encoder() + encoder = self.get_encoder() # Compatibility with Accelerate big model inference: we need the encoder to outputs stuff on the same device # as the inputs. if hasattr(self, "hf_device_map"): -@@ -523,12 +523,12 @@ class GenerationMixin: +@@ -526,12 +523,12 @@ class GenerationMixin: for argument, value in model_kwargs.items() if not any(argument.startswith(p) for p in irrelevant_prefix) } @@ -28,23 +31,23 @@ encoder_kwargs["output_attentions"] = generation_config.output_attentions encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states -@@ -536,13 +536,8 @@ class GenerationMixin: +@@ -539,13 +536,8 @@ class GenerationMixin: model_input_name = model_input_name if model_input_name is not None else self.main_input_name encoder_kwargs["return_dict"] = True encoder_kwargs[model_input_name] = inputs_tensor -- with torch.npu.stream(self.stream): # set stream -- encoder_outputs=encoder.forward(encoder_kwargs["input_ids"]) -- self.stream.synchronize() # synchronize +- if self.encoder_mindie: +- with torch.npu.stream(self.stream): # set stream +- encoder_outputs=encoder.forward(encoder_kwargs["input_ids"]) +- self.stream.synchronize() # synchronize +- else: +- encoder_outputs = encoder(**encoder_kwargs) - model_kwargs["encoder_outputs"]: ModelOutput = {"last_hidden_state":encoder_outputs[0], "past_cross_key_values":encoder_outputs[1]} -- # import pdb -- # pdb.set_trace() -- # print("encoder_finished") + model_kwargs["encoder_outputs"]: ModelOutput = encoder(**encoder_kwargs) + return model_kwargs def _prepare_decoder_input_ids_for_generation( -@@ -667,9 +662,6 @@ class GenerationMixin: +@@ -670,9 +662,6 @@ class GenerationMixin: outputs, standardize_cache_format=standardize_cache_format ) model_kwargs[cache_name] = cache @@ -54,7 +57,7 @@ if getattr(outputs, "state", None) is not None: model_kwargs["state"] = outputs.state -@@ -1801,16 +1793,16 @@ class GenerationMixin: +@@ -1804,16 +1793,16 @@ class GenerationMixin: "`streamer` cannot be used 
with beam search (yet!). Make sure that `num_beams` is set to 1." ) @@ -81,7 +84,14 @@ # 8. prepare distribution pre_processing samplers prepared_logits_processor = self._get_logits_processor( -@@ -2650,10 +2642,7 @@ class GenerationMixin: +@@ -2647,20 +2636,15 @@ class GenerationMixin: + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) +- + + # keep track of which sequences are already finished + batch_size = input_ids.shape[0] this_peer_finished = False unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) @@ -92,12 +102,7 @@ + while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): # prepare model inputs +- model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) -@@ -2711,7 +2700,6 @@ class GenerationMixin: - # update generated ids, model inputs, and length for next step - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) -- # print("aaaa",input_ids) - if streamer is not None: - streamer.put(next_tokens.cpu()) - model_kwargs = self._update_model_kwargs_for_generation( + # forward pass to get next token -- Gitee From c84ed5a2f3a7585343d0f60c72fc973a1428617c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 4 Sep 2024 11:31:17 +0000 Subject: [PATCH 034/110] =?UTF-8?q?modleing=5Fmt5=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_mt5.patch | 568 ++++++++++++++++++ 1 file changed, 568 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch new file mode 100644 index 0000000000..38eb59c192 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch @@ -0,0 +1,568 @@ +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py 2024-09-03 19:47:27.392000000 +0800 ++++ modeling_mt5.py 2024-09-04 19:29:28.348000000 +0800 +@@ -324,6 +324,7 @@ class MT5Attention(nn.Module): + key_value_states=None, + position_bias=None, + past_key_value=None, ++ past_cross_key_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, +@@ -340,7 +341,8 @@ class MT5Attention(nn.Module): + real_seq_length = seq_length + + if past_key_value is not None: +- if len(past_key_value) != 2: ++ if past_key_value.shape[0] != 2: ++ # if len(past_key_value) != 2: + raise ValueError( + f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" + ) +@@ -368,6 +370,7 @@ class MT5Attention(nn.Module): + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: ++ past_key_value = shape(past_key_value) + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) +@@ -446,12 +449,125 @@ class MT5Attention(nn.Module): + outputs = outputs + (attn_weights,) + return outputs + ++class MT5SelfAttention(MT5Attention): ++ def __init__(self, config: T5Config, has_relative_attention_bias=False): ++ super().__init__(config, has_relative_attention_bias) ++ ++ def forward( ++ self, ++ hidden_states, ++ mask=None, ++ position_bias=None, ++ past_key_value=None, ++ layer_head_mask=None, ++ use_cache=False, ++ output_attentions=False, ++ ): ++ """ ++ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). ++ """ ++ # Input is (batch_size, seq_length, dim) ++ # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) ++ # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) ++ batch_size, seq_length = hidden_states.shape[:2] ++ ++ real_seq_length = seq_length ++ ++ if past_key_value is not None: ++ if past_key_value.shape[0] != 2: ++ # if len(past_key_value) != 2: ++ raise ValueError( ++ f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" ++ ) ++ real_seq_length += past_key_value[0].shape[2] ++ key_length = real_seq_length ++ ++ def shape(states): ++ """projection""" ++ return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) ++ ++ def unshape(states): ++ """reshape""" ++ return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) ++ ++ def project(hidden_states, proj_layer, past_key_value): ++ """projects hidden states correctly to key/query states""" ++ if past_key_value is None: ++ # cross-attn ++ # (batch_size, n_heads, seq_length, dim_per_head) ++ hidden_states = shape(proj_layer(hidden_states)) ++ ++ if past_key_value is not None: ++ hidden_states = shape(proj_layer(hidden_states)) ++ hidden_states = torch.cat([past_key_value, hidden_states], dim=2) ++ return hidden_states ++ ++ # get query states ++ query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) ++ ++ # get key/value states ++ key_states = project( ++ hidden_states, self.k, past_key_value[0] if past_key_value is not None else None ++ ) ++ value_states = project( ++ hidden_states, self.v, past_key_value[1] if past_key_value is not None else None ++ ) ++ # compute scores ++ scores = torch.matmul( ++ query_states, key_states.transpose(3, 2) ++ ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 ++ # print("scores=",scores.dtype) ++ if position_bias is None: ++ if not self.has_relative_attention_bias: ++ position_bias = torch.zeros( ++ (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype ++ ) ++ if self.gradient_checkpointing and self.training: ++ position_bias.requires_grad = True ++ else: ++ position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) ++ ++ # if key and values are already calculated ++ # we want only the last query position bias ++ if past_key_value is not None: ++ position_bias = position_bias[:, :, -hidden_states.size(1) :, :] ++ ++ if mask is not None: ++ position_bias = position_bias + mask # (batch_size, n_heads, 
seq_length, key_length) ++ ++ if self.pruned_heads: ++ mask = torch.ones(position_bias.shape[1]) ++ mask[list(self.pruned_heads)] = 0 ++ position_bias_masked = position_bias[:, mask.bool()] ++ else: ++ position_bias_masked = position_bias ++ ++ scores += position_bias_masked ++ attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( ++ scores ++ ) # (batch_size, n_heads, seq_length, key_length) ++ attn_weights = nn.functional.dropout( ++ attn_weights, p=self.dropout, training=self.training ++ ) # (batch_size, n_heads, seq_length, key_length) ++ ++ # Mask heads if we want to ++ if layer_head_mask is not None: ++ attn_weights = attn_weights * layer_head_mask ++ ++ attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) ++ attn_output = self.o(attn_output) ++ ++ present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None ++ outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) ++ if output_attentions: ++ outputs = outputs + (attn_weights,) ++ return outputs + + # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->MT5 + class MT5LayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() +- self.SelfAttention = MT5Attention(config, has_relative_attention_bias=has_relative_attention_bias) ++ self.SelfAttention = MT5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + +@@ -540,6 +656,7 @@ class MT5Block(nn.Module): + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, ++ past_cross_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, +@@ -549,15 +666,15 @@ class MT5Block(nn.Module): + logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + +- if len(past_key_value) != expected_num_past_key_values: +- raise ValueError( +- f"There should be {expected_num_past_key_values} past states. " +- f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" +- f"Got {len(past_key_value)} past key / value states" +- ) ++ # if len(past_key_value) != expected_num_past_key_values: ++ # raise ValueError( ++ # f"There should be {expected_num_past_key_values} past states. " ++ # f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" ++ # f"Got {len(past_key_value)} past key / value states" ++ # ) + +- self_attn_past_key_value = past_key_value[:2] +- cross_attn_past_key_value = past_key_value[2:] ++ self_attn_past_key_value = past_key_value ++ cross_attn_past_key_value = past_cross_key_value + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + +@@ -614,9 +731,7 @@ class MT5Block(nn.Module): + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states +- if present_key_value_state is not None: +- present_key_value_state = present_key_value_state + cross_attention_outputs[1] +- ++ cross_attn_past_key_values = cross_attention_outputs[1] + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + +@@ -635,7 +750,7 @@ class MT5Block(nn.Module): + outputs = (hidden_states,) + + if use_cache: +- outputs = outputs + (present_key_value_state,) + attention_outputs ++ outputs = outputs + (present_key_value_state,) +(cross_attn_past_key_values,)+ attention_outputs + else: + outputs = outputs + attention_outputs + +@@ -884,11 +999,14 @@ class MT5PreTrainedModel(PreTrainedModel + + # Copied from transformers.models.t5.modeling_t5.T5Stack with T5->MT5 + class MT5Stack(MT5PreTrainedModel): +- def __init__(self, config, embed_tokens=None): ++ def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskeyvalue=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder ++ self.lm_head=lm_head ++ self.encodecrosskeyvalue = encodecrosskeyvalue ++ self.model_dim = config.d_model + + self.block = nn.ModuleList( + [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] +@@ -956,13 +1074,14 @@ class MT5Stack(MT5PreTrainedModel): + def forward( + self, + input_ids=None, +- attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, ++ past_key_values=None, ++ past_cross_key_values=None, ++ attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, +- past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, +@@ -1008,9 +1127,9 @@ class MT5Stack(MT5PreTrainedModel): + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") + + # initialize past_key_values with `None` if past does not exist +- if past_key_values is None: ++ if not self.is_decoder: + past_key_values = [None] * len(self.block) +- ++ past_cross_key_values = [None] * len(self.block) + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + +@@ -1041,7 +1160,7 @@ class MT5Stack(MT5PreTrainedModel): + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) +- present_key_value_states = () if use_cache else None ++ present_key_value_states = [] if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None +@@ -1049,8 +1168,10 @@ class MT5Stack(MT5PreTrainedModel): + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) +- +- for i, (layer_module, past_key_value) in 
enumerate(zip(self.block, past_key_values)): ++ for i, layer_module in enumerate(self.block): ++ # for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): ++ past_key_value = past_key_values[i] ++ past_cross_key_value = past_cross_key_values[i] + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel +@@ -1100,6 +1221,7 @@ class MT5Stack(MT5PreTrainedModel): + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, + past_key_value=past_key_value, ++ past_cross_key_value=past_cross_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -1107,19 +1229,19 @@ class MT5Stack(MT5PreTrainedModel): + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: +- layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] ++ layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] + +- hidden_states, present_key_value_state = layer_outputs[:2] ++ hidden_states, present_key_value_state, present_cross_key_value_state = layer_outputs[:3] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) +- position_bias = layer_outputs[2] ++ position_bias = layer_outputs[3] + if self.is_decoder and encoder_hidden_states is not None: +- encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] ++ encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] + # append next layer key value states + if use_cache: +- present_key_value_states = present_key_value_states + (present_key_value_state,) ++ present_key_value_states.extend(present_key_value_state) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) +@@ -1133,7 +1255,7 @@ class MT5Stack(MT5PreTrainedModel): + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) +- hidden_states = self.dropout(hidden_states) ++ hidden_states = self.dropout(hidden_states).half() + + # Add last layer + if output_hidden_states: +@@ -1151,13 +1273,17 @@ class MT5Stack(MT5PreTrainedModel): + ] + if v is not None + ) +- return BaseModelOutputWithPastAndCrossAttentions( +- last_hidden_state=hidden_states, +- past_key_values=present_key_value_states, +- hidden_states=all_hidden_states, +- attentions=all_attentions, +- cross_attentions=all_cross_attentions, +- ) ++ present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None ++ if not self.is_decoder and self.encodecrosskeyvalue: ++ res = self.encodecrosskeyvalue(hidden_states) ++ return tuple((hidden_states, res)) ++ lm_logits = None ++ if self.is_decoder: ++ #logits = None ++ if self.config.tie_word_embeddings: ++ hidden_states = hidden_states * (self.model_dim ** -0.5) ++ lm_logits = self.lm_head(hidden_states) ++ return tuple((lm_logits, present_key_value_states)) + + + MT5_START_DOCSTRING = r""" +@@ -1549,6 +1675,29 @@ class MT5Model(MT5PreTrainedModel): + ) + + ++class EncoderToCrossKeyValue(nn.Module): ++ def __init__(self, cross_key, cross_value, num_heads, d_kv): ++ super().__init__() ++ self.cross_key = 
cross_key
++        self.cross_value = cross_value
++        self.num_heads = num_heads
++        self.d_kv = d_kv
++
++
++    def forward(self, hidden_states):
++        batch_size = hidden_states.shape[0]
++        encoder_hidden_states_kvs = []
++        # for i in range(len(self.cross_value)):
++        #     encoder_hidden_states_kvs.append(
++        #         torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2),
++        #                      self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0))
++        for i in range(len(self.cross_value)):
++            encoder_hidden_states_kvs.append(
++                torch.stack((self.cross_key[i](hidden_states),
++                             self.cross_value[i](hidden_states)), dim=0))
++        past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0)
++        return past_cross_key_values
++
+ @add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING)
+ class MT5ForConditionalGeneration(MT5PreTrainedModel):
+     r"""
+@@ -1573,28 +1722,45 @@ class MT5ForConditionalGeneration(MT5Pre
+     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
+
+     # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5
+-    def __init__(self, config: MT5Config):
++    def __init__(self, config: MT5Config, encoder_path=None, decoder_path=None, device_id=0):
+         super().__init__(config)
+-        self.model_dim = config.d_model
+-
+-        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+-
+-        encoder_config = copy.deepcopy(config)
+-        encoder_config.is_decoder = False
+-        encoder_config.use_cache = False
+-        encoder_config.is_encoder_decoder = False
+-        self.encoder = MT5Stack(encoder_config, self.shared)
+-
+-        decoder_config = copy.deepcopy(config)
+-        decoder_config.is_decoder = True
+-        decoder_config.is_encoder_decoder = False
+-        decoder_config.num_layers = config.num_decoder_layers
+-        self.decoder = MT5Stack(decoder_config, self.shared)
+-
+-        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
++
++        self.encoder_path = encoder_path
++        self.decoder_path = decoder_path
++        if not self.encoder_path or not self.decoder_path:
++            self.model_dim = config.d_model
++            self.shared = nn.Embedding(config.vocab_size, config.d_model)
++            decoder_config = copy.deepcopy(config)
++            decoder_config.is_decoder = True
++            decoder_config.is_encoder_decoder = False
++            decoder_config.num_layers = config.num_decoder_layers
++
++
++            self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
++            self.decoder = MT5Stack(decoder_config, self.shared, self.lm_head)
++            cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers))
++            cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers))
++            encodecrosskeyvalue = EncoderToCrossKeyValue(cross_key, cross_value, config.num_heads, config.d_kv)
++            encoder_config = copy.deepcopy(config)
++            encoder_config.is_decoder = False
++            encoder_config.use_cache = False
++            encoder_config.is_encoder_decoder = False
++            self.encoder = MT5Stack(encoder_config, self.shared, encodecrosskeyvalue=encodecrosskeyvalue)
++        self.encoder_mindie = None
++        self.decoder_mindie = None
++        if self.encoder_path:
++            self.encoder_mindie = torch.jit.load(self.encoder_path)
++        if self.decoder_path:
++            self.decoder_mindie = torch.jit.load(self.decoder_path)
++        self.stream = torch.npu.Stream(f"npu:{device_id}")
++        self.device_id = device_id
++
++
++    def get_device(self):
++ return f"npu:{self.device_id}" + + # Initialize weights and apply final processing +- self.post_init() ++ # self.post_init() + + # Model parallel + self.model_parallel = False +@@ -1677,6 +1843,7 @@ class MT5ForConditionalGeneration(MT5Pre + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, ++ past_cross_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, +@@ -1724,76 +1891,23 @@ class MT5ForConditionalGeneration(MT5Pre + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask ++ ++ hidden_states = encoder_outputs["last_hidden_state"] ++ past_cross_key_values = encoder_outputs["past_cross_key_values"] + +- # Encode if needed (training, first prediction pass) +- if encoder_outputs is None: +- # Convert encoder inputs in embeddings if needed +- encoder_outputs = self.encoder( +- input_ids=input_ids, +- attention_mask=attention_mask, +- inputs_embeds=inputs_embeds, +- head_mask=head_mask, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): +- encoder_outputs = BaseModelOutput( +- last_hidden_state=encoder_outputs[0], +- hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, +- attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, +- ) +- +- hidden_states = encoder_outputs[0] +- +- if self.model_parallel: +- torch.cuda.set_device(self.decoder.first_device) ++ # if self.model_parallel: ++ # torch.cuda.set_device(self.decoder.first_device) + + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + +- # Set device for model parallelism +- if self.model_parallel: +- torch.cuda.set_device(self.decoder.first_device) +- hidden_states = hidden_states.to(self.decoder.first_device) +- if decoder_input_ids is not None: +- decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) +- if attention_mask is not None: +- attention_mask = attention_mask.to(self.decoder.first_device) +- if decoder_attention_mask is not None: +- decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) +- +- # Decode +- decoder_outputs = self.decoder( +- input_ids=decoder_input_ids, +- attention_mask=decoder_attention_mask, +- inputs_embeds=decoder_inputs_embeds, +- past_key_values=past_key_values, +- encoder_hidden_states=hidden_states, +- encoder_attention_mask=attention_mask, +- head_mask=decoder_head_mask, +- cross_attn_head_mask=cross_attn_head_mask, +- use_cache=use_cache, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- +- sequence_output = decoder_outputs[0] +- +- # Set device for model parallelism +- if self.model_parallel: +- torch.cuda.set_device(self.encoder.first_device) +- self.lm_head = self.lm_head.to(self.encoder.first_device) +- sequence_output = sequence_output.to(self.lm_head.weight.device) +- +- if self.config.tie_word_embeddings: +- # Rescale output before projecting on vocab +- # See 
https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 +- sequence_output = sequence_output * (self.model_dim**-0.5) +- +- lm_logits = self.lm_head(sequence_output) ++ with torch.npu.stream(self.stream): # set stream ++ # import pdb ++ # pdb.set_trace() ++ decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) ++ self.stream.synchronize() # synchronize ++ # decoder_outputs = self.decoder(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) + + loss = None + if labels is not None: +@@ -1806,17 +1920,10 @@ class MT5ForConditionalGeneration(MT5Pre + if not return_dict: + output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs + return ((loss,) + output) if loss is not None else output +- + return Seq2SeqLMOutput( + loss=loss, +- logits=lm_logits, +- past_key_values=decoder_outputs.past_key_values, +- decoder_hidden_states=decoder_outputs.hidden_states, +- decoder_attentions=decoder_outputs.attentions, +- cross_attentions=decoder_outputs.cross_attentions, +- encoder_last_hidden_state=encoder_outputs.last_hidden_state, +- encoder_hidden_states=encoder_outputs.hidden_states, +- encoder_attentions=encoder_outputs.attentions, ++ logits=decoder_outputs[0], ++ past_key_values=decoder_outputs[1] + ) + + # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation +@@ -1824,6 +1931,7 @@ class MT5ForConditionalGeneration(MT5Pre + self, + input_ids, + past_key_values=None, ++ past_cross_key_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, +@@ -1849,6 +1957,7 @@ class MT5ForConditionalGeneration(MT5Pre + return { + "decoder_input_ids": input_ids, + "past_key_values": past_key_values, ++ "past_cross_key_values": past_cross_key_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, -- Gitee From a6a3836f988646bf890a1f7b13d4eba8a62c44ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 4 Sep 2024 12:34:29 +0000 Subject: [PATCH 035/110] update MindIE/MindIE-Torch/built-in/T5/export_t5.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/export_t5.py | 24 ++++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5.py b/MindIE/MindIE-Torch/built-in/T5/export_t5.py index cdb7631c82..5fa13d3c0a 100644 --- a/MindIE/MindIE-Torch/built-in/T5/export_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/export_t5.py @@ -85,14 +85,14 @@ def export_textencoder(args, model, save_dir, batch_size): encoder.eval() torch.jit.trace(encoder, dummy_input, strict=False).save(traced_path) if not os.path.exists(compiled_path): - model = torch.jit.load(traced_path).eval() + traced_model = torch.jit.load(traced_path).eval() inputs0 = [] # inputs1 = [] inputs0.append(mindietorch.Input(min_shape = (1,1), max_shape= (args.max_batchsize, args.max_input_seq_len), dtype=torch.int64)) print("compiling encoder") compiled_model = mindietorch.compile( - model, + traced_model, inputs=inputs0, allow_tensor_replace_int=True, require_full_compilation=False, @@ -115,16 +115,16 @@ def export_textdecoder(args, model, save_dir, batch_size): text_decoder = model.decoder dummy_input = ( torch.ones([1, 1], dtype=torch.int64).npu(), - torch.randn(1,16,512).to(torch.float16).npu(), + torch.randn(1, 16, model.config.d_model).to(torch.float16).npu(), torch.ones(1,16).npu(), - torch.randn(6,2,1,8,1,64).to(torch.float16).npu(), - torch.randn(6,2,1,8,24,64).to(torch.float16).npu() + torch.randn(model.config.num_layers, 2, 1, model.config.num_heads, 1, model.config.d_kv).to(torch.float16).npu(), + torch.randn(model.config.num_layers, 2, 1, 16, model.config.d_model).to(torch.float16).npu() ) decoder = TextDecoderExport(text_decoder).npu() decoder.eval() torch.jit.trace(decoder, dummy_input,strict=False).save(traced_path) if not os.path.exists(compiled_path): - model = torch.jit.load(traced_path).eval() + traced_model = torch.jit.load(traced_path).eval() print("compiling decoder") compiled_model = mindietorch.compile( model, @@ -132,19 +132,19 @@ def export_textdecoder(args, model, save_dir, batch_size): max_shape = (args.max_batchsize,1), dtype=mindietorch.dtype.INT64), - mindietorch.Input(min_shape =(1, 1, 512), - max_shape=(args.max_batchsize, args.max_input_seq_len, 512), + mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), dtype=mindietorch.dtype.FLOAT16), mindietorch.Input(min_shape = (1,1), max_shape =(args.max_batchsize,args.max_input_seq_len), dtype=mindietorch.dtype.INT64), - mindietorch.Input(min_shape = (6,2,1,8,0,64), - max_shape = (6,2,args.max_batchsize,8,args.max_input_seq_len,64), + mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, model.config.num_heads, 0, model.config.d_kv), + max_shape = (model.config.num_layers, 2, args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), dtype=mindietorch.dtype.FLOAT16), - mindietorch.Input(min_shape = (6,2,1,8,1,64), - max_shape = (6,2,args.max_batchsize,8,args.max_input_seq_len,64), + mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, 1, model.config.d_model), + max_shape = (model.config.num_layers, 2, args.max_batchsize, args.max_input_seq_len, model.config.d_model), dtype=mindietorch.dtype.FLOAT16)], allow_tensor_replace_int=True, require_full_compilation=False, -- Gitee From 037d7ad1b64a1073ed8293fd888cc65b790bfabe Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 4 Sep 2024 13:45:52 +0000 Subject: [PATCH 036/110] update MindIE/MindIE-Torch/built-in/T5/export_t5.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/export_t5.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5.py b/MindIE/MindIE-Torch/built-in/T5/export_t5.py index 5fa13d3c0a..b2f1b06157 100644 --- a/MindIE/MindIE-Torch/built-in/T5/export_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/export_t5.py @@ -127,7 +127,7 @@ def export_textdecoder(args, model, save_dir, batch_size): traced_model = torch.jit.load(traced_path).eval() print("compiling decoder") compiled_model = mindietorch.compile( - model, + traced_model, inputs=[mindietorch.Input(min_shape =(1, 1), max_shape = (args.max_batchsize,1), dtype=mindietorch.dtype.INT64), -- Gitee From fcb345f4f6b37639a90e20400c4e88db996787c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 5 Sep 2024 11:37:02 +0000 Subject: [PATCH 037/110] update MindIE/MindIE-Torch/built-in/T5/export_t5.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/export_t5.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5.py b/MindIE/MindIE-Torch/built-in/T5/export_t5.py index b2f1b06157..af67451d69 100644 --- a/MindIE/MindIE-Torch/built-in/T5/export_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/export_t5.py @@ -118,7 +118,7 @@ def export_textdecoder(args, model, save_dir, batch_size): torch.randn(1, 16, model.config.d_model).to(torch.float16).npu(), torch.ones(1,16).npu(), torch.randn(model.config.num_layers, 2, 1, model.config.num_heads, 1, model.config.d_kv).to(torch.float16).npu(), - torch.randn(model.config.num_layers, 2, 1, 16, model.config.d_model).to(torch.float16).npu() + torch.randn(model.config.num_layers, 2, 1, 16, model.config.d_kv*model.config.num_heads).to(torch.float16).npu() ) decoder = TextDecoderExport(text_decoder).npu() decoder.eval() @@ -143,8 +143,8 @@ def export_textdecoder(args, model, save_dir, batch_size): max_shape = (model.config.num_layers, 2, args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), dtype=mindietorch.dtype.FLOAT16), - mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, 1, model.config.d_model), - max_shape = (model.config.num_layers, 2, args.max_batchsize, args.max_input_seq_len, model.config.d_model), + mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, 1, model.config.d_kv*model.config.num_heads), + max_shape = (model.config.num_layers, 2, args.max_batchsize, args.max_input_seq_len, model.config.d_kv*model.config.num_heads), dtype=mindietorch.dtype.FLOAT16)], allow_tensor_replace_int=True, require_full_compilation=False, -- Gitee From 4c918331b1afea3b44672d60e949918c6b717608 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 5 Sep 2024 12:37:04 +0000 Subject: [PATCH 038/110] update MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_mt5.patch | 803 +++++++++--------- 1 file changed, 416 insertions(+), 387 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch index 38eb59c192..0fdb93043a 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch @@ -1,568 +1,597 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py 2024-09-03 19:47:27.392000000 +0800 -+++ modeling_mt5.py 2024-09-04 19:29:28.348000000 +0800 -@@ -324,6 +324,7 @@ class MT5Attention(nn.Module): +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py 2024-09-05 20:29:34.752000000 +0800 ++++ modeling_mt5.py 2024-09-05 20:33:39.712000000 +0800 +@@ -21,8 +21,6 @@ import warnings + from typing import List, Optional, Tuple, Union + + import torch +-import torch_npu +-import mindietorch + from torch import nn + from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +@@ -326,7 +324,6 @@ class MT5Attention(nn.Module): key_value_states=None, position_bias=None, past_key_value=None, -+ past_cross_key_value=None, +- past_cross_key_value=None, layer_head_mask=None, query_length=None, use_cache=False, -@@ -340,7 +341,8 @@ class MT5Attention(nn.Module): +@@ -343,8 +340,7 @@ class MT5Attention(nn.Module): real_seq_length = seq_length if past_key_value is not None: -- if len(past_key_value) != 2: -+ if past_key_value.shape[0] != 2: -+ # if len(past_key_value) != 2: +- if past_key_value.shape[0] != 2: +- # if len(past_key_value) != 2: ++ if len(past_key_value) != 2: raise ValueError( f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" ) -@@ -368,6 +370,7 @@ class MT5Attention(nn.Module): +@@ -369,10 +365,10 @@ class MT5Attention(nn.Module): + elif past_key_value is None: + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) ++ hidden_states = shape(proj_layer(key_value_states)) if past_key_value is not None: -+ past_key_value = shape(past_key_value) +- past_key_value = shape(past_key_value) if key_value_states is None: # self-attn # (batch_size, n_heads, key_length, dim_per_head) -@@ -446,12 +449,125 @@ class MT5Attention(nn.Module): +@@ -451,125 +447,12 @@ class MT5Attention(nn.Module): outputs = outputs + (attn_weights,) return outputs -+class MT5SelfAttention(MT5Attention): -+ def __init__(self, config: T5Config, has_relative_attention_bias=False): -+ super().__init__(config, has_relative_attention_bias) -+ -+ def forward( -+ self, -+ hidden_states, -+ mask=None, -+ position_bias=None, -+ past_key_value=None, -+ layer_head_mask=None, -+ use_cache=False, -+ output_attentions=False, -+ ): -+ """ -+ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). -+ """ -+ # Input is (batch_size, seq_length, dim) -+ # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) -+ # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) -+ batch_size, seq_length = hidden_states.shape[:2] -+ -+ real_seq_length = seq_length -+ -+ if past_key_value is not None: -+ if past_key_value.shape[0] != 2: -+ # if len(past_key_value) != 2: -+ raise ValueError( -+ f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" -+ ) -+ real_seq_length += past_key_value[0].shape[2] -+ key_length = real_seq_length -+ -+ def shape(states): -+ """projection""" -+ return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) -+ -+ def unshape(states): -+ """reshape""" -+ return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) -+ -+ def project(hidden_states, proj_layer, past_key_value): -+ """projects hidden states correctly to key/query states""" -+ if past_key_value is None: -+ # cross-attn -+ # (batch_size, n_heads, seq_length, dim_per_head) -+ hidden_states = shape(proj_layer(hidden_states)) -+ -+ if past_key_value is not None: -+ hidden_states = shape(proj_layer(hidden_states)) -+ hidden_states = torch.cat([past_key_value, hidden_states], dim=2) -+ return hidden_states -+ -+ # get query states -+ query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) -+ -+ # get key/value states -+ key_states = project( -+ hidden_states, self.k, past_key_value[0] if past_key_value is not None else None -+ ) -+ value_states = project( -+ hidden_states, self.v, past_key_value[1] if past_key_value is not None else None -+ ) -+ # compute scores -+ scores = torch.matmul( -+ query_states, key_states.transpose(3, 2) -+ ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 -+ # print("scores=",scores.dtype) -+ if position_bias is None: -+ if not self.has_relative_attention_bias: -+ position_bias = torch.zeros( -+ (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype -+ ) -+ if self.gradient_checkpointing and self.training: -+ position_bias.requires_grad = True -+ else: -+ position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) -+ -+ # if key and values are already calculated -+ # we want only the last query position bias -+ if past_key_value is not None: -+ position_bias = position_bias[:, :, -hidden_states.size(1) :, :] -+ -+ if mask is not None: -+ position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) -+ -+ if self.pruned_heads: -+ mask = torch.ones(position_bias.shape[1]) -+ mask[list(self.pruned_heads)] = 0 -+ position_bias_masked = position_bias[:, mask.bool()] -+ else: -+ position_bias_masked = position_bias -+ -+ scores += position_bias_masked -+ attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( -+ scores -+ ) # (batch_size, n_heads, seq_length, key_length) -+ attn_weights = nn.functional.dropout( -+ attn_weights, p=self.dropout, training=self.training -+ ) # (batch_size, n_heads, seq_length, key_length) -+ -+ # Mask heads if we want to -+ if layer_head_mask is not None: -+ attn_weights = attn_weights * layer_head_mask -+ -+ attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) -+ attn_output = self.o(attn_output) -+ -+ present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None -+ outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -+ if output_attentions: -+ outputs = outputs + (attn_weights,) -+ return outputs +-class MT5SelfAttention(MT5Attention): +- def __init__(self, config: MT5Config, has_relative_attention_bias=False): +- super().__init__(config, has_relative_attention_bias) +- +- def forward( +- self, +- hidden_states, +- mask=None, +- position_bias=None, +- past_key_value=None, +- layer_head_mask=None, +- use_cache=False, +- 
output_attentions=False, +- ): +- """ +- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). +- """ +- # Input is (batch_size, seq_length, dim) +- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) +- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) +- batch_size, seq_length = hidden_states.shape[:2] +- +- real_seq_length = seq_length +- +- if past_key_value is not None: +- if past_key_value.shape[0] != 2: +- # if len(past_key_value) != 2: +- raise ValueError( +- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" +- ) +- real_seq_length += past_key_value[0].shape[2] +- key_length = real_seq_length +- +- def shape(states): +- """projection""" +- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) +- +- def unshape(states): +- """reshape""" +- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) +- +- def project(hidden_states, proj_layer, past_key_value): +- """projects hidden states correctly to key/query states""" +- if past_key_value is None: +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(hidden_states)) +- +- if past_key_value is not None: +- hidden_states = shape(proj_layer(hidden_states)) +- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) +- return hidden_states +- +- # get query states +- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) +- +- # get key/value states +- key_states = project( +- hidden_states, self.k, past_key_value[0] if past_key_value is not None else None +- ) +- value_states = project( +- hidden_states, self.v, past_key_value[1] if past_key_value is not None else None +- ) +- # compute scores +- scores = torch.matmul( +- query_states, key_states.transpose(3, 2) +- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 +- # print("scores=",scores.dtype) +- if position_bias is None: +- if not self.has_relative_attention_bias: +- position_bias = torch.zeros( +- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype +- ) +- if self.gradient_checkpointing and self.training: +- position_bias.requires_grad = True +- else: +- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) +- +- # if key and values are already calculated +- # we want only the last query position bias +- if past_key_value is not None: +- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] +- +- if mask is not None: +- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) +- +- if self.pruned_heads: +- mask = torch.ones(position_bias.shape[1]) +- mask[list(self.pruned_heads)] = 0 +- position_bias_masked = position_bias[:, mask.bool()] +- else: +- position_bias_masked = position_bias +- +- scores += position_bias_masked +- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( +- scores +- ) # (batch_size, n_heads, seq_length, key_length) +- attn_weights = nn.functional.dropout( +- attn_weights, p=self.dropout, training=self.training +- ) # (batch_size, n_heads, seq_length, key_length) +- +- # Mask heads if we want to +- if layer_head_mask is not None: +- attn_weights = attn_weights * layer_head_mask +- +- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, 
seq_length, dim) +- attn_output = self.o(attn_output) +- +- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None +- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- if output_attentions: +- outputs = outputs + (attn_weights,) +- return outputs # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->MT5 class MT5LayerSelfAttention(nn.Module): def __init__(self, config, has_relative_attention_bias=False): super().__init__() -- self.SelfAttention = MT5Attention(config, has_relative_attention_bias=has_relative_attention_bias) -+ self.SelfAttention = MT5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) +- self.SelfAttention = MT5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) ++ self.SelfAttention = MT5Attention(config, has_relative_attention_bias=has_relative_attention_bias) self.layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) -@@ -540,6 +656,7 @@ class MT5Block(nn.Module): +@@ -658,7 +541,6 @@ class MT5Block(nn.Module): layer_head_mask=None, cross_attn_layer_head_mask=None, past_key_value=None, -+ past_cross_key_value=None, +- past_cross_key_value=None, use_cache=False, output_attentions=False, return_dict=True, -@@ -549,15 +666,15 @@ class MT5Block(nn.Module): +@@ -668,15 +550,15 @@ class MT5Block(nn.Module): logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 -- if len(past_key_value) != expected_num_past_key_values: -- raise ValueError( -- f"There should be {expected_num_past_key_values} past states. " -- f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" -- f"Got {len(past_key_value)} past key / value states" -- ) -+ # if len(past_key_value) != expected_num_past_key_values: -+ # raise ValueError( -+ # f"There should be {expected_num_past_key_values} past states. " -+ # f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" -+ # f"Got {len(past_key_value)} past key / value states" -+ # ) - -- self_attn_past_key_value = past_key_value[:2] -- cross_attn_past_key_value = past_key_value[2:] -+ self_attn_past_key_value = past_key_value -+ cross_attn_past_key_value = past_cross_key_value +- # if len(past_key_value) != expected_num_past_key_values: +- # raise ValueError( +- # f"There should be {expected_num_past_key_values} past states. " +- # f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" +- # f"Got {len(past_key_value)} past key / value states" +- # ) ++ if len(past_key_value) != expected_num_past_key_values: ++ raise ValueError( ++ f"There should be {expected_num_past_key_values} past states. " ++ f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" ++ f"Got {len(past_key_value)} past key / value states" ++ ) + +- self_attn_past_key_value = past_key_value +- cross_attn_past_key_value = past_cross_key_value ++ self_attn_past_key_value = past_key_value[:2] ++ cross_attn_past_key_value = past_key_value[2:] else: self_attn_past_key_value, cross_attn_past_key_value = None, None -@@ -614,9 +731,7 @@ class MT5Block(nn.Module): +@@ -709,7 +591,8 @@ class MT5Block(nn.Module): + query_length = present_key_value_state[0].shape[2] + else: + query_length = None +- ++ import pdb ++ pdb.set_trace() + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, +@@ -733,7 +616,9 @@ class MT5Block(nn.Module): hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) # Combine self attn and cross attn key value states -- if present_key_value_state is not None: -- present_key_value_state = present_key_value_state + cross_attention_outputs[1] -- -+ cross_attn_past_key_values = cross_attention_outputs[1] +- cross_attn_past_key_values = cross_attention_outputs[1] ++ if present_key_value_state is not None: ++ present_key_value_state = present_key_value_state + cross_attention_outputs[1] ++ # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] -@@ -635,7 +750,7 @@ class MT5Block(nn.Module): +@@ -752,7 +637,7 @@ class MT5Block(nn.Module): outputs = (hidden_states,) if use_cache: -- outputs = outputs + (present_key_value_state,) + attention_outputs -+ outputs = outputs + (present_key_value_state,) +(cross_attn_past_key_values,)+ attention_outputs +- outputs = outputs + (present_key_value_state,) +(cross_attn_past_key_values,)+ attention_outputs ++ outputs = outputs + (present_key_value_state,) + attention_outputs else: outputs = outputs + attention_outputs -@@ -884,11 +999,14 @@ class MT5PreTrainedModel(PreTrainedModel +@@ -1001,14 +886,11 @@ class MT5PreTrainedModel(PreTrainedModel # Copied from transformers.models.t5.modeling_t5.T5Stack with T5->MT5 class MT5Stack(MT5PreTrainedModel): -- def __init__(self, config, embed_tokens=None): -+ def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskeyvalue=None): +- def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskeyvalue=None): ++ def __init__(self, config, embed_tokens=None): super().__init__(config) self.embed_tokens = embed_tokens self.is_decoder = config.is_decoder -+ self.lm_head=lm_head -+ self.encodecrosskeyvalue = encodecrosskeyvalue -+ self.model_dim = config.d_model +- self.lm_head=lm_head +- self.encodecrosskeyvalue = encodecrosskeyvalue +- self.model_dim = config.d_model self.block = nn.ModuleList( [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -956,13 +1074,14 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1076,14 +958,13 @@ class MT5Stack(MT5PreTrainedModel): def forward( self, input_ids=None, -- attention_mask=None, ++ attention_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, -+ past_key_values=None, -+ past_cross_key_values=None, -+ attention_mask=None, +- past_key_values=None, +- past_cross_key_values=None, +- attention_mask=None, inputs_embeds=None, head_mask=None, cross_attn_head_mask=None, -- past_key_values=None, ++ past_key_values=None, use_cache=None, output_attentions=None, output_hidden_states=None, -@@ -1008,9 +1127,9 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1129,9 +1010,9 @@ class 
MT5Stack(MT5PreTrainedModel): raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") # initialize past_key_values with `None` if past does not exist -- if past_key_values is None: -+ if not self.is_decoder: +- if not self.is_decoder: ++ if past_key_values is None: past_key_values = [None] * len(self.block) -- -+ past_cross_key_values = [None] * len(self.block) +- past_cross_key_values = [None] * len(self.block) ++ if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) -@@ -1041,7 +1160,7 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1162,7 +1043,7 @@ class MT5Stack(MT5PreTrainedModel): # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) -- present_key_value_states = () if use_cache else None -+ present_key_value_states = [] if use_cache else None +- present_key_value_states = [] if use_cache else None ++ present_key_value_states = () if use_cache else None all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1049,8 +1168,10 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1170,10 +1051,8 @@ class MT5Stack(MT5PreTrainedModel): encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) -- -- for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): -+ for i, layer_module in enumerate(self.block): -+ # for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): -+ past_key_value = past_key_values[i] -+ past_cross_key_value = past_cross_key_values[i] +- for i, layer_module in enumerate(self.block): +- # for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): +- past_key_value = past_key_values[i] +- past_cross_key_value = past_cross_key_values[i] ++ ++ for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] # Model parallel -@@ -1100,6 +1221,7 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1223,7 +1102,6 @@ class MT5Stack(MT5PreTrainedModel): layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, past_key_value=past_key_value, -+ past_cross_key_value=past_cross_key_value, +- past_cross_key_value=past_cross_key_value, use_cache=use_cache, output_attentions=output_attentions, ) -@@ -1107,19 +1229,19 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1231,19 +1109,19 @@ class MT5Stack(MT5PreTrainedModel): # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: -- layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] -+ layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] +- layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] ++ layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] -- hidden_states, present_key_value_state = layer_outputs[:2] -+ hidden_states, present_key_value_state, present_cross_key_value_state = layer_outputs[:3] +- hidden_states, present_key_value_state, present_cross_key_value_state = layer_outputs[:3] ++ hidden_states, present_key_value_state = layer_outputs[:2] # We share the 
position biases between the layers - the first layer store them # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), # (cross-attention position bias), (cross-attention weights) -- position_bias = layer_outputs[2] -+ position_bias = layer_outputs[3] +- position_bias = layer_outputs[3] ++ position_bias = layer_outputs[2] if self.is_decoder and encoder_hidden_states is not None: -- encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] -+ encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] +- encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] ++ encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] # append next layer key value states if use_cache: -- present_key_value_states = present_key_value_states + (present_key_value_state,) -+ present_key_value_states.extend(present_key_value_state) +- present_key_value_states.extend(present_key_value_state) ++ present_key_value_states = present_key_value_states + (present_key_value_state,) if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) -@@ -1133,7 +1255,7 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1257,12 +1135,12 @@ class MT5Stack(MT5PreTrainedModel): hidden_states = hidden_states.to("cuda:" + str(k + 1)) hidden_states = self.final_layer_norm(hidden_states) -- hidden_states = self.dropout(hidden_states) -+ hidden_states = self.dropout(hidden_states).half() +- hidden_states = self.dropout(hidden_states).half() ++ hidden_states = self.dropout(hidden_states) # Add last layer if output_hidden_states: -@@ -1151,13 +1273,17 @@ class MT5Stack(MT5PreTrainedModel): + all_hidden_states = all_hidden_states + (hidden_states,) +- ++ print("return_dict=",return_dict) + if not return_dict: + return tuple( + v +@@ -1275,17 +1153,13 @@ class MT5Stack(MT5PreTrainedModel): ] if v is not None ) -- return BaseModelOutputWithPastAndCrossAttentions( -- last_hidden_state=hidden_states, -- past_key_values=present_key_value_states, -- hidden_states=all_hidden_states, -- attentions=all_attentions, -- cross_attentions=all_cross_attentions, -- ) -+ present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None -+ if not self.is_decoder and self.encodecrosskeyvalue: -+ res = self.encodecrosskeyvalue(hidden_states) -+ return tuple((hidden_states, res)) -+ lm_logits = None -+ if self.is_decoder: -+ #logits = None -+ if self.config.tie_word_embeddings: -+ hidden_states = hidden_states * (self.model_dim ** -0.5) -+ lm_logits = self.lm_head(hidden_states) -+ return tuple((lm_logits, present_key_value_states)) +- present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None +- if not self.is_decoder and self.encodecrosskeyvalue: +- res = self.encodecrosskeyvalue(hidden_states) +- return tuple((hidden_states, res)) +- lm_logits = None +- if self.is_decoder: +- #logits = None +- if self.config.tie_word_embeddings: +- hidden_states = hidden_states * (self.model_dim ** -0.5) +- lm_logits = self.lm_head(hidden_states) +- return tuple((lm_logits, present_key_value_states)) ++ return BaseModelOutputWithPastAndCrossAttentions( ++ last_hidden_state=hidden_states, ++ past_key_values=present_key_value_states, ++ hidden_states=all_hidden_states, ++ attentions=all_attentions, ++ cross_attentions=all_cross_attentions, ++ ) 
MT5_START_DOCSTRING = r""" -@@ -1549,6 +1675,29 @@ class MT5Model(MT5PreTrainedModel): +@@ -1677,29 +1551,6 @@ class MT5Model(MT5PreTrainedModel): ) -+class EncoderToCrossKeyValue(nn.Module): -+ def __init__(self, cross_key, cross_value, num_heads, d_kv): -+ super().__init__() -+ self.cross_key = cross_key -+ self.cross_value = cross_value -+ self.num_heads = num_heads -+ self.d_kv = d_kv -+ -+ -+ def forward(self, hidden_states): -+ batch_size = hidden_states.shape[0] -+ encoder_hidden_states_kvs = [] -+ # for i in range(len(self.cross_value)): -+ # encoder_hidden_states_kvs.append( -+ # torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), -+ # self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) -+ for i in range(len(self.cross_value)): -+ encoder_hidden_states_kvs.append( -+ torch.stack((self.cross_key[i](hidden_states), -+ self.cross_value[i](hidden_states)), dim=0)) -+ past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0) -+ return past_cross_key_values -+ +-class EncoderToCrossKeyValue(nn.Module): +- def __init__(self, cross_key, cross_value, num_heads, d_kv): +- super().__init__() +- self.cross_key = cross_key +- self.cross_value = cross_value +- self.num_heads = num_heads +- self.d_kv = d_kv +- +- +- def forward(self, hidden_states): +- batch_size = hidden_states.shape[0] +- encoder_hidden_states_kvs = [] +- # for i in range(len(self.cross_value)): +- # encoder_hidden_states_kvs.append( +- # torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), +- # self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) +- for i in range(len(self.cross_value)): +- encoder_hidden_states_kvs.append( +- torch.stack((self.cross_key[i](hidden_states), +- self.cross_value[i](hidden_states)), dim=0)) +- past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0) +- return past_cross_key_values +- @add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING) class MT5ForConditionalGeneration(MT5PreTrainedModel): r""" -@@ -1573,28 +1722,45 @@ class MT5ForConditionalGeneration(MT5Pre +@@ -1724,45 +1575,28 @@ class MT5ForConditionalGeneration(MT5Pre _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5 -- def __init__(self, config: MT5Config): -+ def __init__(self, config: MT5Config, encoder_path=None, decoder_path=None, device_id=0): +- def __init__(self, config: MT5Config, encoder_path=None, decoder_path=None, device_id=0): ++ def __init__(self, config: MT5Config): super().__init__(config) -- self.model_dim = config.d_model -- -- self.shared = nn.Embedding(config.vocab_size, config.d_model) -- -- encoder_config = copy.deepcopy(config) -- encoder_config.is_decoder = False -- encoder_config.use_cache = False -- encoder_config.is_encoder_decoder = False -- self.encoder = MT5Stack(encoder_config, self.shared) +- +- self.encoder_path = encoder_path +- self.decoder_path = decoder_path +- if not self.encoder_path or not self.decoder_path: +- self.model_dim = config.d_model +- self.shared = nn.Embedding(config.vocab_size, config.d_model) +- decoder_config = copy.deepcopy(config) +- decoder_config.is_decoder = True +- decoder_config.is_encoder_decoder = False +- decoder_config.num_layers = 
config.num_decoder_layers +- - -- decoder_config = copy.deepcopy(config) -- decoder_config.is_decoder = True -- decoder_config.is_encoder_decoder = False -- decoder_config.num_layers = config.num_decoder_layers -- self.decoder = MT5Stack(decoder_config, self.shared) +- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) +- self.decoder = MT5Stack(decoder_config, self.shared, self.lm_head) +- cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) +- cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) +- encodecrosskeyvalue = EncoderToCrossKeyValue(cross_key, cross_value, config.num_heads, config.d_kv) +- encoder_config = copy.deepcopy(config) +- encoder_config.is_decoder = False +- encoder_config.use_cache = False +- encoder_config.is_encoder_decoder = False +- self.encoder = MT5Stack(encoder_config, self.shared, encodecrosskeyvalue=encodecrosskeyvalue) +- self.encoder_mindie = None +- self.decoder_mindie = None +- if self.encoder_path: +- self.encoder_mindie = torch.jit.load(self.encoder_path) +- if self.decoder_path: +- self.decoder_mindie = torch.jit.load(self.decoder_path) +- self.stream = torch.npu.Stream(f"npu:{device_id}") +- self.device_id = device_id - -- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) -+ -+ self.encoder_path = encoder_path -+ self.decoder_path = decoder_path -+ if not self.encoder_path or not self.decoder_path: -+ self.model_dim = config.d_model -+ self.shared = nn.Embedding(config.vocab_size, config.d_model) -+ decoder_config = copy.deepcopy(config) -+ decoder_config.is_decoder = True -+ decoder_config.is_encoder_decoder = False -+ decoder_config.num_layers = config.num_decoder_layers -+ +- +- def get_device(self): +- return f"npu:{self.device_id}" ++ self.model_dim = config.d_model ++ ++ self.shared = nn.Embedding(config.vocab_size, config.d_model) ++ ++ encoder_config = copy.deepcopy(config) ++ encoder_config.is_decoder = False ++ encoder_config.use_cache = False ++ encoder_config.is_encoder_decoder = False ++ self.encoder = MT5Stack(encoder_config, self.shared) + -+ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) -+ self.decoder = MT5Stack(decoder_config, self.shared, , self.lm_head) -+ cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) -+ cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) -+ encodecrosskeyvalue = EncoderToCrossKeyValue(cross_key, cross_value, config.num_heads, config.d_kv) -+ encoder_config = copy.deepcopy(config) -+ encoder_config.is_decoder = False -+ encoder_config.use_cache = False -+ encoder_config.is_encoder_decoder = False -+ self.encoder = T5Stack(encoder_config, self.shared, encodecrosskeyvalue=encodecrosskeyvalue) -+ self.encoder_mindie = None -+ self.decoder_mindie = None -+ if self.encoder_path: -+ self.encoder_mindie = torch.jit.load(self.encoder_path) -+ if self.decoder_path: -+ self.decoder_mindie = torch.jit.load(self.decoder_path) -+ self.stream = torch.npu.Stream(f"npu:{device_id}") -+ self.device_id = device_id ++ decoder_config = copy.deepcopy(config) ++ decoder_config.is_decoder = True ++ decoder_config.is_encoder_decoder = False ++ decoder_config.num_layers = config.num_decoder_layers ++ self.decoder = MT5Stack(decoder_config, self.shared) + -+ -+ def get_device(self): -+ return 
f"npu:{self.device_id}" ++ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) # Initialize weights and apply final processing -- self.post_init() -+ # self.post_init() +- # self.post_init() ++ self.post_init() # Model parallel self.model_parallel = False -@@ -1677,6 +1843,7 @@ class MT5ForConditionalGeneration(MT5Pre +@@ -1845,7 +1679,6 @@ class MT5ForConditionalGeneration(MT5Pre cross_attn_head_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, -+ past_cross_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- past_cross_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, decoder_inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, -@@ -1724,76 +1891,23 @@ class MT5ForConditionalGeneration(MT5Pre +@@ -1893,23 +1726,76 @@ class MT5ForConditionalGeneration(MT5Pre if self.config.num_layers == self.config.num_decoder_layers: warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) decoder_head_mask = head_mask -+ -+ hidden_states = encoder_outputs["last_hidden_state"] -+ past_cross_key_values = encoder_outputs["past_cross_key_values"] - -- # Encode if needed (training, first prediction pass) -- if encoder_outputs is None: -- # Convert encoder inputs in embeddings if needed -- encoder_outputs = self.encoder( -- input_ids=input_ids, -- attention_mask=attention_mask, -- inputs_embeds=inputs_embeds, -- head_mask=head_mask, -- output_attentions=output_attentions, -- output_hidden_states=output_hidden_states, -- return_dict=return_dict, -- ) -- elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): -- encoder_outputs = BaseModelOutput( -- last_hidden_state=encoder_outputs[0], -- hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, -- attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, -- ) -- -- hidden_states = encoder_outputs[0] -- -- if self.model_parallel: -- torch.cuda.set_device(self.decoder.first_device) -+ # if self.model_parallel: -+ # torch.cuda.set_device(self.decoder.first_device) +- +- hidden_states = encoder_outputs["last_hidden_state"] +- past_cross_key_values = encoder_outputs["past_cross_key_values"] + +- # if self.model_parallel: +- # torch.cuda.set_device(self.decoder.first_device) ++ # Encode if needed (training, first prediction pass) ++ if encoder_outputs is None: ++ # Convert encoder inputs in embeddings if needed ++ encoder_outputs = self.encoder( ++ input_ids=input_ids, ++ attention_mask=attention_mask, ++ inputs_embeds=inputs_embeds, ++ head_mask=head_mask, ++ output_attentions=output_attentions, ++ output_hidden_states=output_hidden_states, ++ return_dict=return_dict, ++ ) ++ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): ++ encoder_outputs = BaseModelOutput( ++ last_hidden_state=encoder_outputs[0], ++ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, ++ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, ++ ) ++ ++ hidden_states = encoder_outputs[0] ++ ++ if self.model_parallel: ++ torch.cuda.set_device(self.decoder.first_device) if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: # get decoder inputs from shifting lm labels to the right decoder_input_ids = self._shift_right(labels) -- # Set device for model parallelism -- if self.model_parallel: -- torch.cuda.set_device(self.decoder.first_device) 
-- hidden_states = hidden_states.to(self.decoder.first_device) -- if decoder_input_ids is not None: -- decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) -- if attention_mask is not None: -- attention_mask = attention_mask.to(self.decoder.first_device) -- if decoder_attention_mask is not None: -- decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) -- -- # Decode -- decoder_outputs = self.decoder( -- input_ids=decoder_input_ids, -- attention_mask=decoder_attention_mask, -- inputs_embeds=decoder_inputs_embeds, -- past_key_values=past_key_values, -- encoder_hidden_states=hidden_states, -- encoder_attention_mask=attention_mask, -- head_mask=decoder_head_mask, -- cross_attn_head_mask=cross_attn_head_mask, -- use_cache=use_cache, -- output_attentions=output_attentions, -- output_hidden_states=output_hidden_states, -- return_dict=return_dict, -- ) -- -- sequence_output = decoder_outputs[0] -- -- # Set device for model parallelism -- if self.model_parallel: -- torch.cuda.set_device(self.encoder.first_device) -- self.lm_head = self.lm_head.to(self.encoder.first_device) -- sequence_output = sequence_output.to(self.lm_head.weight.device) -- -- if self.config.tie_word_embeddings: -- # Rescale output before projecting on vocab -- # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 -- sequence_output = sequence_output * (self.model_dim**-0.5) -- -- lm_logits = self.lm_head(sequence_output) -+ with torch.npu.stream(self.stream): # set stream -+ # import pdb -+ # pdb.set_trace() -+ decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) -+ self.stream.synchronize() # synchronize -+ # decoder_outputs = self.decoder(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) +- with torch.npu.stream(self.stream): # set stream +- # import pdb +- # pdb.set_trace() +- decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) +- self.stream.synchronize() # synchronize +- # decoder_outputs = self.decoder(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) ++ # Set device for model parallelism ++ if self.model_parallel: ++ torch.cuda.set_device(self.decoder.first_device) ++ hidden_states = hidden_states.to(self.decoder.first_device) ++ if decoder_input_ids is not None: ++ decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) ++ if attention_mask is not None: ++ attention_mask = attention_mask.to(self.decoder.first_device) ++ if decoder_attention_mask is not None: ++ decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) ++ ++ # Decode ++ decoder_outputs = self.decoder( ++ input_ids=decoder_input_ids, ++ attention_mask=decoder_attention_mask, ++ inputs_embeds=decoder_inputs_embeds, ++ past_key_values=past_key_values, ++ encoder_hidden_states=hidden_states, ++ encoder_attention_mask=attention_mask, ++ head_mask=decoder_head_mask, ++ cross_attn_head_mask=cross_attn_head_mask, ++ use_cache=use_cache, ++ output_attentions=output_attentions, ++ output_hidden_states=output_hidden_states, ++ return_dict=return_dict, ++ ) ++ ++ sequence_output = decoder_outputs[0] ++ ++ # Set device for model parallelism ++ if self.model_parallel: ++ torch.cuda.set_device(self.encoder.first_device) ++ self.lm_head = 
self.lm_head.to(self.encoder.first_device) ++ sequence_output = sequence_output.to(self.lm_head.weight.device) ++ ++ if self.config.tie_word_embeddings: ++ # Rescale output before projecting on vocab ++ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 ++ sequence_output = sequence_output * (self.model_dim**-0.5) ++ ++ lm_logits = self.lm_head(sequence_output) loss = None if labels is not None: -@@ -1806,17 +1920,10 @@ class MT5ForConditionalGeneration(MT5Pre +@@ -1922,10 +1808,17 @@ class MT5ForConditionalGeneration(MT5Pre if not return_dict: output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs return ((loss,) + output) if loss is not None else output -- ++ return Seq2SeqLMOutput( loss=loss, -- logits=lm_logits, -- past_key_values=decoder_outputs.past_key_values, -- decoder_hidden_states=decoder_outputs.hidden_states, -- decoder_attentions=decoder_outputs.attentions, -- cross_attentions=decoder_outputs.cross_attentions, -- encoder_last_hidden_state=encoder_outputs.last_hidden_state, -- encoder_hidden_states=encoder_outputs.hidden_states, -- encoder_attentions=encoder_outputs.attentions, -+ logits=decoder_outputs[0], -+ past_key_values=decoder_outputs[1] +- logits=decoder_outputs[0], +- past_key_values=decoder_outputs[1] ++ logits=lm_logits, ++ past_key_values=decoder_outputs.past_key_values, ++ decoder_hidden_states=decoder_outputs.hidden_states, ++ decoder_attentions=decoder_outputs.attentions, ++ cross_attentions=decoder_outputs.cross_attentions, ++ encoder_last_hidden_state=encoder_outputs.last_hidden_state, ++ encoder_hidden_states=encoder_outputs.hidden_states, ++ encoder_attentions=encoder_outputs.attentions, ) # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation -@@ -1824,6 +1931,7 @@ class MT5ForConditionalGeneration(MT5Pre +@@ -1933,7 +1826,6 @@ class MT5ForConditionalGeneration(MT5Pre self, input_ids, past_key_values=None, -+ past_cross_key_values=None, +- past_cross_key_values=None, attention_mask=None, head_mask=None, decoder_head_mask=None, -@@ -1849,6 +1957,7 @@ class MT5ForConditionalGeneration(MT5Pre +@@ -1959,7 +1851,6 @@ class MT5ForConditionalGeneration(MT5Pre return { "decoder_input_ids": input_ids, "past_key_values": past_key_values, -+ "past_cross_key_values": past_cross_key_values, +- "past_cross_key_values": past_cross_key_values, "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "head_mask": head_mask, -- Gitee From 64c73cd0ae3aee00ed990b1c586af60923e4a885 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 5 Sep 2024 12:37:56 +0000 Subject: [PATCH 039/110] =?UTF-8?q?MT5=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/export_mt5.py | 181 ++++++++++++++++++ MindIE/MindIE-Torch/built-in/T5/test_mt5.py | 54 ++++++ 2 files changed, 235 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/export_mt5.py create mode 100644 MindIE/MindIE-Torch/built-in/T5/test_mt5.py diff --git a/MindIE/MindIE-Torch/built-in/T5/export_mt5.py b/MindIE/MindIE-Torch/built-in/T5/export_mt5.py new file mode 100644 index 0000000000..dc8308e362 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/export_mt5.py @@ -0,0 +1,181 @@ + +import torch +import torch_npu +import argparse +import os +import mindietorch +from transformers import MT5ForConditionalGeneration + +def 
parse_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="./models",
+        help="save dir"
+    )
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default="./T5-Small",
+        help="MT5 model path"
+    )
+    parser.add_argument(
+        "--max_batchsize",
+        type=int,
+        default=1,
+        help="max batchsize when running"
+    )
+
+    parser.add_argument(
+        "--max_input_seq_len",
+        type=int,
+        default=256,
+        help="max input_sequence length when running"
+    )
+
+
+    parser.add_argument(
+        "--device_id",
+        type=int,
+        default=0,
+        help="npu device id"
+    )
+    return parser.parse_args()
+
+
+class TextEncoderExport(torch.nn.Module):
+    def __init__(self, textencoder_model):
+        super(TextEncoderExport, self).__init__()
+        self.textencoder_model = textencoder_model
+
+    def forward(self, input_ids):
+        return self.textencoder_model(input_ids=input_ids)
+
+class TextDecoderExport(torch.nn.Module):
+    def __init__(self, textdecoder_model):
+        super(TextDecoderExport, self).__init__()
+        self.textdecoder_model = textdecoder_model
+
+    def forward(self,
+                input_ids,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                past_key_values,
+                past_cross_key_values):
+        return self.textdecoder_model(input_ids=input_ids,
+                                      encoder_hidden_states=encoder_hidden_states,
+                                      encoder_attention_mask=encoder_attention_mask,
+                                      past_key_values=past_key_values,
+                                      past_cross_key_values=past_cross_key_values,
+                                      return_dict=True)
+
+def export_textencoder(args, model, save_dir, batch_size):
+    encoder_path = os.path.join(save_dir, "encoder")
+    if not os.path.exists(encoder_path):
+        os.makedirs(encoder_path, mode=0o640)
+    traced_path = os.path.join(encoder_path, "encoder.pt")
+    compiled_path = os.path.join(encoder_path, "encoder_compiled.pt")
+    if not os.path.exists(traced_path):
+        text_encoder = model.encoder
+        dummy_input = (
+            torch.ones([1, 128], dtype=torch.int64).npu()
+        )
+        encoder = TextEncoderExport(text_encoder)
+        encoder.eval()
+        torch.jit.trace(encoder, dummy_input, strict=False).save(traced_path)
+    if not os.path.exists(compiled_path):
+        traced_model = torch.jit.load(traced_path).eval()
+
+        inputs0 = []
+        # inputs1 = []
+        inputs0.append(mindietorch.Input(min_shape = (1,1), max_shape= (args.max_batchsize, args.max_input_seq_len), dtype=torch.int64))
+        print("compiling encoder")
+        compiled_model = mindietorch.compile(
+            traced_model,
+            inputs=inputs0,
+            allow_tensor_replace_int=True,
+            require_full_compilation=False,
+            truncate_long_and_double=True,
+            precision_policy=mindietorch.PrecisionPolicy.FP16,
+            soc_version="Ascend910B4",
+            optimization_level=0
+        )
+        compiled_model.save(compiled_path)
+
+def export_textdecoder(args, model, save_dir, batch_size):
+    decoder_path = os.path.join(save_dir, "decoder")
+    if not os.path.exists(decoder_path):
+        os.makedirs(decoder_path, mode=0o640)
+    traced_path = os.path.join(decoder_path, "decoder.pt")
+    compiled_path = os.path.join(decoder_path, "decoder_compiled.pt")
+    model_path = args.model_path
+    max_length = 120
+    if not os.path.exists(traced_path):
+        text_decoder = model.decoder
+        dummy_input = (
+            torch.ones([1, 1], dtype=torch.int64).npu(),
+            torch.randn(1, 16, model.config.d_model).to(torch.float16).npu(),
+            torch.ones(1,16).npu(),
+            torch.randn(model.config.num_layers, 2, 1, model.config.num_heads, 1, model.config.d_kv).to(torch.float16).npu(),
+            torch.randn(model.config.num_layers, 2, 1, 16, model.config.d_kv*model.config.num_heads).to(torch.float16).npu()
+        )
+        decoder = TextDecoderExport(text_decoder).npu()
+        decoder.eval()
+        
torch.jit.trace(decoder, dummy_input,strict=False).save(traced_path) + if not os.path.exists(compiled_path): + traced_model = torch.jit.load(traced_path).eval() + print("compiling decoder") + compiled_model = mindietorch.compile( + traced_model, + inputs=[mindietorch.Input(min_shape =(1, 1), + max_shape = (args.max_batchsize,1), + dtype=mindietorch.dtype.INT64), + + mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), + dtype=mindietorch.dtype.FLOAT16), + + mindietorch.Input(min_shape = (1,1), + max_shape =(args.max_batchsize,args.max_input_seq_len), + dtype=mindietorch.dtype.INT64), + mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, model.config.num_heads, 0, model.config.d_kv), + max_shape = (model.config.num_layers, 2, args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), + dtype=mindietorch.dtype.FLOAT16), + + mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, 1, model.config.d_kv*model.config.num_heads), + max_shape = (model.config.num_layers, 2, args.max_batchsize, args.max_input_seq_len,model.config.d_kv*model.config.num_heads), + dtype=mindietorch.dtype.FLOAT16)], + allow_tensor_replace_int=True, + require_full_compilation=False, + truncate_long_and_double=True, + precision_policy=mindietorch.PrecisionPolicy.FP16, + soc_version="Ascend910B4", + optimization_level=0 + ) + compiled_model.save(compiled_path) + +def main(): + args = parse_arguments() + device_id = args.device_id + save_dir = args.output_dir + torch.npu.set_device(device_id) + batch_size = 1 + model = MT5ForConditionalGeneration.from_pretrained(args.model_path, torch_dtype=torch.float).npu() + encoder_path = os.path.join(save_dir, "encoder") + compiled_path = os.path.join(encoder_path, "encoder_compiled.pt") + if not os.path.exists(compiled_path): + export_textencoder(args, model, save_dir, batch_size) + print("export encoder_model done!") + + decoder_path = os.path.join(save_dir, "decoder") + compiled_path = os.path.join(decoder_path, "decoder_compiled.pt") + if not os.path.exists(compiled_path): + export_textdecoder(args, model, save_dir, batch_size) + print("export decoder_model done!") + + + + +if __name__ == "__main__": + main() diff --git a/MindIE/MindIE-Torch/built-in/T5/test_mt5.py b/MindIE/MindIE-Torch/built-in/T5/test_mt5.py new file mode 100644 index 0000000000..af441392d4 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/test_mt5.py @@ -0,0 +1,54 @@ +import torch +import time +import argparse +import torch_npu +from transformers import MT5ForConditionalGeneration, AutoTokenizer, MT5Config + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--hf_model_path", type=str, required=True) + + parser.add_argument("--encoder_aie_path", type=str, required=True) + parser.add_argument("--decoder_aie_path", type=str, required=True) + + parser.add_argument("--device_id", type=int, help="NPU device id", default=0) + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + torch.npu.set_device(args.device_id) + tokenizer = AutoTokenizer.from_pretrained(args.hf_model_path) + text = [ + "translate English to German: The house is wonderful.", + "summarize: I am a high-performance inference optimizer and runtime.", + "During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world", + ] + model = MT5ForConditionalGeneration.from_pretrained(args.hf_model_path, 
torch_dtype=torch.float16).npu() + encoder = model.encoder + decoder = model.decoder + encoder_input = torch.randint(0,2000,(8,10), dtype=torch.int64).npu() + t5_config = MT5Config.from_pretrained(args.hf_model_path) + + encoder_output = encoder(encoder_input)[0] + model = MT5ForConditionalGeneration(config=t5_config, + encoder_path=args.encoder_aie_path, + decoder_path=args.decoder_aie_path, + device_id=args.device_id).half().npu() + + encoder_mindie = model.encoder_mindie + decoder_mindie = model.decoder_mindie + mindie_stream = model.stream + with torch.npu.stream(mindie_stream): # set stream + mindie_encoder_output = encoder_mindie(encoder_input)[0] + mindie_stream.synchronize() # synchronize + if (torch.cosine_similarity(encoder_output.cpu().flatten(), mindie_encoder_output.cpu().flatten(),dim=0)) < 0.99: + print("encoder precision failed") + else: + print("test OK") + + +if __name__ == "__main__": + main() + -- Gitee From f32b967a55b14959ee64fa8c2614ea94b7be4df2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:48:57 +0000 Subject: [PATCH 040/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/T5=5Fmodeling=5Foutputs=5Fpatch?= =?UTF-8?q?.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/T5/T5_modeling_outputs_patch.py | 28 ------------------- 1 file changed, 28 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py deleted file mode 100644 index 21cd251b95..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import transformers - - -def main(): - transformers_path = transformers.__path__ - transformers_version = transformers.__version__ - - assert transformers_version == '4.42.0', "expectation transformers==4.42.0" - os.system(f'patch -p0 {transformers_path[0]}/modeling_outputs.py modeling_outputs.patch') - - -if __name__ == '__main__': - main() -- Gitee From d569cb70edacafd1ca0bddbeefdce29fc3877d45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:49:16 +0000 Subject: [PATCH 041/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/modeling=5Fmt5.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/T5/modeling_mt5.patch | 597 ------------------ 1 file changed, 597 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch deleted file mode 100644 index 0fdb93043a..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch +++ /dev/null @@ -1,597 +0,0 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py 2024-09-05 20:29:34.752000000 +0800 -+++ modeling_mt5.py 2024-09-05 20:33:39.712000000 +0800 -@@ -21,8 +21,6 @@ import warnings - from typing import List, Optional, Tuple, Union - - import torch --import torch_npu --import mindietorch - from torch import nn - from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -@@ -326,7 +324,6 @@ class MT5Attention(nn.Module): - key_value_states=None, - position_bias=None, - past_key_value=None, -- past_cross_key_value=None, - layer_head_mask=None, - query_length=None, - use_cache=False, -@@ -343,8 +340,7 @@ class MT5Attention(nn.Module): - real_seq_length = seq_length - - if past_key_value is not None: -- if past_key_value.shape[0] != 2: -- # if len(past_key_value) != 2: -+ if len(past_key_value) != 2: - raise ValueError( - f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" - ) -@@ -369,10 +365,10 @@ class MT5Attention(nn.Module): - elif past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) -+ - hidden_states = shape(proj_layer(key_value_states)) - - if past_key_value is not None: -- past_key_value = shape(past_key_value) - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) -@@ -451,125 +447,12 @@ class MT5Attention(nn.Module): - outputs = outputs + (attn_weights,) - return outputs - --class MT5SelfAttention(MT5Attention): -- def __init__(self, config: MT5Config, has_relative_attention_bias=False): -- super().__init__(config, has_relative_attention_bias) -- -- def forward( -- self, -- hidden_states, -- mask=None, -- position_bias=None, -- past_key_value=None, -- layer_head_mask=None, -- use_cache=False, -- output_attentions=False, -- ): -- """ -- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). 
-- """ -- # Input is (batch_size, seq_length, dim) -- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) -- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) -- batch_size, seq_length = hidden_states.shape[:2] -- -- real_seq_length = seq_length -- -- if past_key_value is not None: -- if past_key_value.shape[0] != 2: -- # if len(past_key_value) != 2: -- raise ValueError( -- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" -- ) -- real_seq_length += past_key_value[0].shape[2] -- key_length = real_seq_length -- -- def shape(states): -- """projection""" -- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) -- -- def unshape(states): -- """reshape""" -- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) -- -- def project(hidden_states, proj_layer, past_key_value): -- """projects hidden states correctly to key/query states""" -- if past_key_value is None: -- # cross-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(hidden_states)) -- -- if past_key_value is not None: -- hidden_states = shape(proj_layer(hidden_states)) -- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) -- return hidden_states -- -- # get query states -- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) -- -- # get key/value states -- key_states = project( -- hidden_states, self.k, past_key_value[0] if past_key_value is not None else None -- ) -- value_states = project( -- hidden_states, self.v, past_key_value[1] if past_key_value is not None else None -- ) -- # compute scores -- scores = torch.matmul( -- query_states, key_states.transpose(3, 2) -- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 -- # print("scores=",scores.dtype) -- if position_bias is None: -- if not self.has_relative_attention_bias: -- position_bias = torch.zeros( -- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype -- ) -- if self.gradient_checkpointing and self.training: -- position_bias.requires_grad = True -- else: -- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) -- -- # if key and values are already calculated -- # we want only the last query position bias -- if past_key_value is not None: -- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] -- -- if mask is not None: -- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) -- -- if self.pruned_heads: -- mask = torch.ones(position_bias.shape[1]) -- mask[list(self.pruned_heads)] = 0 -- position_bias_masked = position_bias[:, mask.bool()] -- else: -- position_bias_masked = position_bias -- -- scores += position_bias_masked -- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( -- scores -- ) # (batch_size, n_heads, seq_length, key_length) -- attn_weights = nn.functional.dropout( -- attn_weights, p=self.dropout, training=self.training -- ) # (batch_size, n_heads, seq_length, key_length) -- -- # Mask heads if we want to -- if layer_head_mask is not None: -- attn_weights = attn_weights * layer_head_mask -- -- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) -- attn_output = self.o(attn_output) -- -- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None 
-- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- if output_attentions: -- outputs = outputs + (attn_weights,) -- return outputs - - # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->MT5 - class MT5LayerSelfAttention(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() -- self.SelfAttention = MT5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) -+ self.SelfAttention = MT5Attention(config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - -@@ -658,7 +541,6 @@ class MT5Block(nn.Module): - layer_head_mask=None, - cross_attn_layer_head_mask=None, - past_key_value=None, -- past_cross_key_value=None, - use_cache=False, - output_attentions=False, - return_dict=True, -@@ -668,15 +550,15 @@ class MT5Block(nn.Module): - logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - -- # if len(past_key_value) != expected_num_past_key_values: -- # raise ValueError( -- # f"There should be {expected_num_past_key_values} past states. " -- # f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" -- # f"Got {len(past_key_value)} past key / value states" -- # ) -+ if len(past_key_value) != expected_num_past_key_values: -+ raise ValueError( -+ f"There should be {expected_num_past_key_values} past states. " -+ f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" -+ f"Got {len(past_key_value)} past key / value states" -+ ) - -- self_attn_past_key_value = past_key_value -- cross_attn_past_key_value = past_cross_key_value -+ self_attn_past_key_value = past_key_value[:2] -+ cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - -@@ -709,7 +591,8 @@ class MT5Block(nn.Module): - query_length = present_key_value_state[0].shape[2] - else: - query_length = None -- -+ import pdb -+ pdb.set_trace() - cross_attention_outputs = self.layer[1]( - hidden_states, - key_value_states=encoder_hidden_states, -@@ -733,7 +616,9 @@ class MT5Block(nn.Module): - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - # Combine self attn and cross attn key value states -- cross_attn_past_key_values = cross_attention_outputs[1] -+ if present_key_value_state is not None: -+ present_key_value_state = present_key_value_state + cross_attention_outputs[1] -+ - # Keep cross-attention outputs and relative position weights - attention_outputs = attention_outputs + cross_attention_outputs[2:] - -@@ -752,7 +637,7 @@ class MT5Block(nn.Module): - outputs = (hidden_states,) - - if use_cache: -- outputs = outputs + (present_key_value_state,) +(cross_attn_past_key_values,)+ attention_outputs -+ outputs = outputs + (present_key_value_state,) + attention_outputs - else: - outputs = outputs + attention_outputs - -@@ -1001,14 +886,11 @@ class MT5PreTrainedModel(PreTrainedModel - - # Copied from transformers.models.t5.modeling_t5.T5Stack with T5->MT5 - class MT5Stack(MT5PreTrainedModel): -- def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskeyvalue=None): -+ def __init__(self, config, embed_tokens=None): - super().__init__(config) - - self.embed_tokens = embed_tokens - self.is_decoder = 
config.is_decoder -- self.lm_head=lm_head -- self.encodecrosskeyvalue = encodecrosskeyvalue -- self.model_dim = config.d_model - - self.block = nn.ModuleList( - [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -1076,14 +958,13 @@ class MT5Stack(MT5PreTrainedModel): - def forward( - self, - input_ids=None, -+ attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, -- past_key_values=None, -- past_cross_key_values=None, -- attention_mask=None, - inputs_embeds=None, - head_mask=None, - cross_attn_head_mask=None, -+ past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, -@@ -1129,9 +1010,9 @@ class MT5Stack(MT5PreTrainedModel): - raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") - - # initialize past_key_values with `None` if past does not exist -- if not self.is_decoder: -+ if past_key_values is None: - past_key_values = [None] * len(self.block) -- past_cross_key_values = [None] * len(self.block) -+ - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - -@@ -1162,7 +1043,7 @@ class MT5Stack(MT5PreTrainedModel): - # Prepare head mask if needed - head_mask = self.get_head_mask(head_mask, self.config.num_layers) - cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) -- present_key_value_states = [] if use_cache else None -+ present_key_value_states = () if use_cache else None - all_hidden_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1170,10 +1051,8 @@ class MT5Stack(MT5PreTrainedModel): - encoder_decoder_position_bias = None - - hidden_states = self.dropout(inputs_embeds) -- for i, layer_module in enumerate(self.block): -- # for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): -- past_key_value = past_key_values[i] -- past_cross_key_value = past_cross_key_values[i] -+ -+ for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): - layer_head_mask = head_mask[i] - cross_attn_layer_head_mask = cross_attn_head_mask[i] - # Model parallel -@@ -1223,7 +1102,6 @@ class MT5Stack(MT5PreTrainedModel): - layer_head_mask=layer_head_mask, - cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key_value=past_key_value, -- past_cross_key_value=past_cross_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) -@@ -1231,19 +1109,19 @@ class MT5Stack(MT5PreTrainedModel): - # layer_outputs is a tuple with: - # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - if use_cache is False: -- layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] -+ layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] - -- hidden_states, present_key_value_state, present_cross_key_value_state = layer_outputs[:3] -+ hidden_states, present_key_value_state = layer_outputs[:2] - - # We share the position biases between the layers - the first layer store them - # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), - # (cross-attention position bias), (cross-attention weights) -- position_bias = layer_outputs[3] -+ position_bias = layer_outputs[2] - if self.is_decoder and 
encoder_hidden_states is not None: -- encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] -+ encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] - # append next layer key value states - if use_cache: -- present_key_value_states.extend(present_key_value_state) -+ present_key_value_states = present_key_value_states + (present_key_value_state,) - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[3],) -@@ -1257,12 +1135,12 @@ class MT5Stack(MT5PreTrainedModel): - hidden_states = hidden_states.to("cuda:" + str(k + 1)) - - hidden_states = self.final_layer_norm(hidden_states) -- hidden_states = self.dropout(hidden_states).half() -+ hidden_states = self.dropout(hidden_states) - - # Add last layer - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) -- -+ print("return_dict=",return_dict) - if not return_dict: - return tuple( - v -@@ -1275,17 +1153,13 @@ class MT5Stack(MT5PreTrainedModel): - ] - if v is not None - ) -- present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None -- if not self.is_decoder and self.encodecrosskeyvalue: -- res = self.encodecrosskeyvalue(hidden_states) -- return tuple((hidden_states, res)) -- lm_logits = None -- if self.is_decoder: -- #logits = None -- if self.config.tie_word_embeddings: -- hidden_states = hidden_states * (self.model_dim ** -0.5) -- lm_logits = self.lm_head(hidden_states) -- return tuple((lm_logits, present_key_value_states)) -+ return BaseModelOutputWithPastAndCrossAttentions( -+ last_hidden_state=hidden_states, -+ past_key_values=present_key_value_states, -+ hidden_states=all_hidden_states, -+ attentions=all_attentions, -+ cross_attentions=all_cross_attentions, -+ ) - - - MT5_START_DOCSTRING = r""" -@@ -1677,29 +1551,6 @@ class MT5Model(MT5PreTrainedModel): - ) - - --class EncoderToCrossKeyValue(nn.Module): -- def __init__(self, cross_key, cross_value, num_heads, d_kv): -- super().__init__() -- self.cross_key = cross_key -- self.cross_value = cross_value -- self.num_heads = num_heads -- self.d_kv = d_kv -- -- -- def forward(self, hidden_states): -- batch_size = hidden_states.shape[0] -- encoder_hidden_states_kvs = [] -- # for i in range(len(self.cross_value)): -- # encoder_hidden_states_kvs.append( -- # torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), -- # self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) -- for i in range(len(self.cross_value)): -- encoder_hidden_states_kvs.append( -- torch.stack((self.cross_key[i](hidden_states), -- self.cross_value[i](hidden_states)), dim=0)) -- past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0) -- return past_cross_key_values -- - @add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING) - class MT5ForConditionalGeneration(MT5PreTrainedModel): - r""" -@@ -1724,45 +1575,28 @@ class MT5ForConditionalGeneration(MT5Pre - _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] - - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5 -- def __init__(self, config: MT5Config, encoder_path=None, decoder_path=None, device_id=0): -+ def __init__(self, config: MT5Config): - super().__init__(config) -- -- self.encoder_path = encoder_path -- 
self.decoder_path = decoder_path -- if not self.encoder_path or not self.decoder_path: -- self.model_dim = config.d_model -- self.shared = nn.Embedding(config.vocab_size, config.d_model) -- decoder_config = copy.deepcopy(config) -- decoder_config.is_decoder = True -- decoder_config.is_encoder_decoder = False -- decoder_config.num_layers = config.num_decoder_layers -- -- -- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) -- self.decoder = MT5Stack(decoder_config, self.shared, self.lm_head) -- cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) -- cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) -- encodecrosskeyvalue = EncoderToCrossKeyValue(cross_key, cross_value, config.num_heads, config.d_kv) -- encoder_config = copy.deepcopy(config) -- encoder_config.is_decoder = False -- encoder_config.use_cache = False -- encoder_config.is_encoder_decoder = False -- self.encoder = MT5Stack(encoder_config, self.shared, encodecrosskeyvalue=encodecrosskeyvalue) -- self.encoder_mindie = None -- self.decoder_mindie = None -- if self.encoder_path: -- self.encoder_mindie = torch.jit.load(self.encoder_path) -- if self.decoder_path: -- self.decoder_mindie = torch.jit.load(self.decoder_path) -- self.stream = torch.npu.Stream(f"npu:{device_id}") -- self.device_id = device_id -- -- -- def get_device(self): -- return f"npu:{self.device_id}" -+ self.model_dim = config.d_model -+ -+ self.shared = nn.Embedding(config.vocab_size, config.d_model) -+ -+ encoder_config = copy.deepcopy(config) -+ encoder_config.is_decoder = False -+ encoder_config.use_cache = False -+ encoder_config.is_encoder_decoder = False -+ self.encoder = MT5Stack(encoder_config, self.shared) -+ -+ decoder_config = copy.deepcopy(config) -+ decoder_config.is_decoder = True -+ decoder_config.is_encoder_decoder = False -+ decoder_config.num_layers = config.num_decoder_layers -+ self.decoder = MT5Stack(decoder_config, self.shared) -+ -+ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) - - # Initialize weights and apply final processing -- # self.post_init() -+ self.post_init() - - # Model parallel - self.model_parallel = False -@@ -1845,7 +1679,6 @@ class MT5ForConditionalGeneration(MT5Pre - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, -- past_cross_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, -@@ -1893,23 +1726,76 @@ class MT5ForConditionalGeneration(MT5Pre - if self.config.num_layers == self.config.num_decoder_layers: - warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) - decoder_head_mask = head_mask -- -- hidden_states = encoder_outputs["last_hidden_state"] -- past_cross_key_values = encoder_outputs["past_cross_key_values"] - -- # if self.model_parallel: -- # torch.cuda.set_device(self.decoder.first_device) -+ # Encode if needed (training, first prediction pass) -+ if encoder_outputs is None: -+ # Convert encoder inputs in embeddings if needed -+ encoder_outputs = self.encoder( -+ input_ids=input_ids, -+ attention_mask=attention_mask, -+ inputs_embeds=inputs_embeds, -+ head_mask=head_mask, -+ output_attentions=output_attentions, -+ 
output_hidden_states=output_hidden_states, -+ return_dict=return_dict, -+ ) -+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): -+ encoder_outputs = BaseModelOutput( -+ last_hidden_state=encoder_outputs[0], -+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, -+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, -+ ) -+ -+ hidden_states = encoder_outputs[0] -+ -+ if self.model_parallel: -+ torch.cuda.set_device(self.decoder.first_device) - - if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: - # get decoder inputs from shifting lm labels to the right - decoder_input_ids = self._shift_right(labels) - -- with torch.npu.stream(self.stream): # set stream -- # import pdb -- # pdb.set_trace() -- decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) -- self.stream.synchronize() # synchronize -- # decoder_outputs = self.decoder(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) -+ # Set device for model parallelism -+ if self.model_parallel: -+ torch.cuda.set_device(self.decoder.first_device) -+ hidden_states = hidden_states.to(self.decoder.first_device) -+ if decoder_input_ids is not None: -+ decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) -+ if attention_mask is not None: -+ attention_mask = attention_mask.to(self.decoder.first_device) -+ if decoder_attention_mask is not None: -+ decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) -+ -+ # Decode -+ decoder_outputs = self.decoder( -+ input_ids=decoder_input_ids, -+ attention_mask=decoder_attention_mask, -+ inputs_embeds=decoder_inputs_embeds, -+ past_key_values=past_key_values, -+ encoder_hidden_states=hidden_states, -+ encoder_attention_mask=attention_mask, -+ head_mask=decoder_head_mask, -+ cross_attn_head_mask=cross_attn_head_mask, -+ use_cache=use_cache, -+ output_attentions=output_attentions, -+ output_hidden_states=output_hidden_states, -+ return_dict=return_dict, -+ ) -+ -+ sequence_output = decoder_outputs[0] -+ -+ # Set device for model parallelism -+ if self.model_parallel: -+ torch.cuda.set_device(self.encoder.first_device) -+ self.lm_head = self.lm_head.to(self.encoder.first_device) -+ sequence_output = sequence_output.to(self.lm_head.weight.device) -+ -+ if self.config.tie_word_embeddings: -+ # Rescale output before projecting on vocab -+ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 -+ sequence_output = sequence_output * (self.model_dim**-0.5) -+ -+ lm_logits = self.lm_head(sequence_output) - - loss = None - if labels is not None: -@@ -1922,10 +1808,17 @@ class MT5ForConditionalGeneration(MT5Pre - if not return_dict: - output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs - return ((loss,) + output) if loss is not None else output -+ - return Seq2SeqLMOutput( - loss=loss, -- logits=decoder_outputs[0], -- past_key_values=decoder_outputs[1] -+ logits=lm_logits, -+ past_key_values=decoder_outputs.past_key_values, -+ decoder_hidden_states=decoder_outputs.hidden_states, -+ decoder_attentions=decoder_outputs.attentions, -+ cross_attentions=decoder_outputs.cross_attentions, -+ encoder_last_hidden_state=encoder_outputs.last_hidden_state, -+ encoder_hidden_states=encoder_outputs.hidden_states, -+ encoder_attentions=encoder_outputs.attentions, - ) - - # Copied from 
transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation -@@ -1933,7 +1826,6 @@ class MT5ForConditionalGeneration(MT5Pre - self, - input_ids, - past_key_values=None, -- past_cross_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, -@@ -1959,7 +1851,6 @@ class MT5ForConditionalGeneration(MT5Pre - return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, -- "past_cross_key_values": past_cross_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, -- Gitee From 1c98df50261ea7eba496c23e0fb978199abd95da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:53:17 +0000 Subject: [PATCH 042/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/T5=5Fmodeling=5Ft5=5Fpatch.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/T5/T5_modeling_t5_patch.py | 28 ------------------- 1 file changed, 28 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py deleted file mode 100644 index e304f4f9f2..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import transformers - - -def main(): - transformers_path = transformers.__path__ - transformers_version = transformers.__version__ - - assert transformers_version =='4.42.0', "expectation transformers==4.42.0" - os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') - - -if __name__ == '__main__': - main() -- Gitee From f53e742428dce93119e97d52e96b2bf1f6b31b69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:53:29 +0000 Subject: [PATCH 043/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/T5=5Fmodeling=5Futils=5Fpatch.p?= =?UTF-8?q?y?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/T5/T5_modeling_utils_patch.py | 28 ------------------- 1 file changed, 28 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py deleted file mode 100644 index b3ad7bc20b..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import transformers - - -def main(): - transformers_path = transformers.__path__ - transformers_version = transformers.__version__ - - assert transformers_version == '4.42.0', "expectation transformers==4.42.0" - os.system(f'patch -p0 {transformers_path[0]}/modeling_utils.py modeling_utils.patch') - - -if __name__ == '__main__': - main() -- Gitee From d44f0ade1f0f81d6be81e609a3fd6e3c386d24a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:53:41 +0000 Subject: [PATCH 044/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/T5=5Futils=5Fpatch.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/T5/T5_utils_patch.py | 28 ------------------- 1 file changed, 28 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py deleted file mode 100644 index 046b6e6b85..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import transformers - - -def main(): - transformers_path = transformers.__path__ - transformers_version = transformers.__version__ - - assert transformers_version == '4.42.0', "expectation transformers==4.42.0" - os.system(f'patch -p0 {transformers_path[0]}/generation/utils.py utils.patch') - - -if __name__ == '__main__': - main() -- Gitee From eceacc242939e507a2c1a74ebea6fc463f99bebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:53:59 +0000 Subject: [PATCH 045/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/modeling=5Foutputs.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch deleted file mode 100644 index 6c99414a69..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch +++ /dev/null @@ -1,10 +0,0 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/modeling_outputs.py 2024-08-28 19:20:22.112000000 +0800 -+++ modeling_outputs.py 2024-09-02 18:32:37.720000000 +0800 -@@ -282,7 +282,6 @@ class BaseModelOutputWithPastAndCrossAtt - - last_hidden_state: torch.FloatTensor = None - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None -- past_cross_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None - attentions: Optional[Tuple[torch.FloatTensor, ...]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None -- Gitee From 3e2708551fdd041724b830c29a8c4474a783f884 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:54:14 +0000 Subject: [PATCH 046/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/modeling=5Ft5.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/T5/modeling_t5.patch | 596 ------------------ 1 file changed, 596 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch deleted file mode 100644 index 40920ac007..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch +++ /dev/null @@ -1,596 +0,0 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py 2024-09-04 19:03:55.080000000 +0800 -+++ modling_t5.py 2024-09-04 19:04:47.048000000 +0800 -@@ -23,8 +23,6 @@ from typing import List, Optional, Tuple - import torch - from torch import nn - from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss --import torch_npu --import mindietorch - - from ...activations import ACT2FN - from ...modeling_outputs import ( -@@ -451,7 +449,6 @@ class T5Attention(nn.Module): - key_value_states=None, - position_bias=None, - past_key_value=None, -- past_cross_key_value=None, - layer_head_mask=None, - query_length=None, - use_cache=False, -@@ -468,8 +465,7 @@ class T5Attention(nn.Module): - real_seq_length = seq_length - - if past_key_value is not None: -- if past_key_value.shape[0] != 2: -- # if len(past_key_value) != 2: -+ if len(past_key_value) != 2: - 
raise ValueError( - f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" - ) -@@ -497,7 +493,6 @@ class T5Attention(nn.Module): - hidden_states = shape(proj_layer(key_value_states)) - - if past_key_value is not None: -- past_key_value = shape(past_key_value) - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) -@@ -571,133 +566,16 @@ class T5Attention(nn.Module): - - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- -- if output_attentions: -- outputs = outputs + (attn_weights,) -- return outputs -- -- --class T5SelfAttention(T5Attention): -- def __init__(self, config: T5Config, has_relative_attention_bias=False): -- super().__init__(config, has_relative_attention_bias) -- -- def forward( -- self, -- hidden_states, -- mask=None, -- position_bias=None, -- past_key_value=None, -- layer_head_mask=None, -- use_cache=False, -- output_attentions=False, -- ): -- """ -- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). -- """ -- # Input is (batch_size, seq_length, dim) -- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) -- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) -- batch_size, seq_length = hidden_states.shape[:2] -- -- real_seq_length = seq_length -- -- if past_key_value is not None: -- if past_key_value.shape[0] != 2: -- # if len(past_key_value) != 2: -- raise ValueError( -- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" -- ) -- real_seq_length += past_key_value[0].shape[2] -- key_length = real_seq_length -- -- def shape(states): -- """projection""" -- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) -- -- def unshape(states): -- """reshape""" -- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) -- -- def project(hidden_states, proj_layer, past_key_value): -- """projects hidden states correctly to key/query states""" -- if past_key_value is None: -- # cross-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(hidden_states)) -- -- if past_key_value is not None: -- hidden_states = shape(proj_layer(hidden_states)) -- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) -- return hidden_states -- -- # get query states -- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) -- -- # get key/value states -- key_states = project( -- hidden_states, self.k, past_key_value[0] if past_key_value is not None else None -- ) -- value_states = project( -- hidden_states, self.v, past_key_value[1] if past_key_value is not None else None -- ) -- # compute scores -- scores = torch.matmul( -- query_states, key_states.transpose(3, 2) -- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 -- # print("scores=",scores.dtype) -- if position_bias is None: -- if not self.has_relative_attention_bias: -- position_bias = torch.zeros( -- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype -- ) -- if self.gradient_checkpointing and self.training: -- position_bias.requires_grad = True -- else: -- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) -- -- 
# if key and values are already calculated -- # we want only the last query position bias -- if past_key_value is not None: -- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] -- -- if mask is not None: -- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) - -- if self.pruned_heads: -- mask = torch.ones(position_bias.shape[1]) -- mask[list(self.pruned_heads)] = 0 -- position_bias_masked = position_bias[:, mask.bool()] -- else: -- position_bias_masked = position_bias -- -- scores += position_bias_masked -- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( -- scores -- ) # (batch_size, n_heads, seq_length, key_length) -- attn_weights = nn.functional.dropout( -- attn_weights, p=self.dropout, training=self.training -- ) # (batch_size, n_heads, seq_length, key_length) -- -- # Mask heads if we want to -- if layer_head_mask is not None: -- attn_weights = attn_weights * layer_head_mask -- -- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) -- attn_output = self.o(attn_output) -- -- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None -- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) - if output_attentions: - outputs = outputs + (attn_weights,) - return outputs - - -- -- - class T5LayerSelfAttention(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() -- self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) -+ self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - -@@ -784,7 +662,6 @@ class T5Block(nn.Module): - layer_head_mask=None, - cross_attn_layer_head_mask=None, - past_key_value=None, -- past_cross_key_value=None, - use_cache=False, - output_attentions=False, - return_dict=True, -@@ -794,15 +671,15 @@ class T5Block(nn.Module): - logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - -- # if len(past_key_value) != expected_num_past_key_values: -- # raise ValueError( -- # f"There should be {expected_num_past_key_values} past states. " -- # f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" -- # f"Got {len(past_key_value)} past key / value states" -- # ) -+ if len(past_key_value) != expected_num_past_key_values: -+ raise ValueError( -+ f"There should be {expected_num_past_key_values} past states. " -+ f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" -+ f"Got {len(past_key_value)} past key / value states" -+ ) - -- self_attn_past_key_value = past_key_value -- cross_attn_past_key_value = past_cross_key_value -+ self_attn_past_key_value = past_key_value[:2] -+ cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - -@@ -859,7 +736,9 @@ class T5Block(nn.Module): - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - # Combine self attn and cross attn key value states -- cross_attn_past_key_values = cross_attention_outputs[1] -+ if present_key_value_state is not None: -+ present_key_value_state = present_key_value_state + cross_attention_outputs[1] -+ - # Keep cross-attention outputs and relative position weights - attention_outputs = attention_outputs + cross_attention_outputs[2:] - -@@ -878,7 +757,7 @@ class T5Block(nn.Module): - outputs = (hidden_states,) - - if use_cache: -- outputs = outputs + (present_key_value_state,) +(cross_attn_past_key_values,)+ attention_outputs -+ outputs = outputs + (present_key_value_state,) + attention_outputs - else: - outputs = outputs + attention_outputs - -@@ -1018,14 +897,11 @@ class T5PreTrainedModel(PreTrainedModel) - - - class T5Stack(T5PreTrainedModel): -- def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskeyvalue=None): -+ def __init__(self, config, embed_tokens=None): - super().__init__(config) - - self.embed_tokens = embed_tokens - self.is_decoder = config.is_decoder -- self.lm_head=lm_head -- self.encodecrosskeyvalue = encodecrosskeyvalue -- self.model_dim = config.d_model - - self.block = nn.ModuleList( - [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -1093,14 +969,13 @@ class T5Stack(T5PreTrainedModel): - def forward( - self, - input_ids=None, -+ attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, -- past_key_values=None, -- past_cross_key_values=None, -- attention_mask=None, - inputs_embeds=None, - head_mask=None, - cross_attn_head_mask=None, -+ past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, -@@ -1146,9 +1021,9 @@ class T5Stack(T5PreTrainedModel): - raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") - - # initialize past_key_values with `None` if past does not exist -- if not self.is_decoder: -+ if past_key_values is None: - past_key_values = [None] * len(self.block) -- past_cross_key_values = [None] * len(self.block) -+ - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - -@@ -1179,7 +1054,7 @@ class T5Stack(T5PreTrainedModel): - # Prepare head mask if needed - head_mask = self.get_head_mask(head_mask, self.config.num_layers) - cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) -- present_key_value_states = [] if use_cache else None -+ present_key_value_states = () if use_cache else None - all_hidden_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1187,10 +1062,8 @@ class T5Stack(T5PreTrainedModel): - encoder_decoder_position_bias = None - - hidden_states = self.dropout(inputs_embeds) -- for i, layer_module in enumerate(self.block): -- # for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): -- 
past_key_value = past_key_values[i] -- past_cross_key_value = past_cross_key_values[i] -+ -+ for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): - layer_head_mask = head_mask[i] - cross_attn_layer_head_mask = cross_attn_head_mask[i] - # Model parallel -@@ -1240,7 +1113,6 @@ class T5Stack(T5PreTrainedModel): - layer_head_mask=layer_head_mask, - cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key_value=past_key_value, -- past_cross_key_value=past_cross_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) -@@ -1248,19 +1120,19 @@ class T5Stack(T5PreTrainedModel): - # layer_outputs is a tuple with: - # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - if use_cache is False: -- layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] -+ layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] - -- hidden_states, present_key_value_state, present_cross_key_value_state = layer_outputs[:3] -+ hidden_states, present_key_value_state = layer_outputs[:2] - - # We share the position biases between the layers - the first layer store them - # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), - # (cross-attention position bias), (cross-attention weights) -- position_bias = layer_outputs[3] -+ position_bias = layer_outputs[2] - if self.is_decoder and encoder_hidden_states is not None: -- encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] -+ encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] - # append next layer key value states - if use_cache: -- present_key_value_states.extend(present_key_value_state) -+ present_key_value_states = present_key_value_states + (present_key_value_state,) - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[3],) -@@ -1274,7 +1146,7 @@ class T5Stack(T5PreTrainedModel): - hidden_states = hidden_states.to("cuda:" + str(k + 1)) - - hidden_states = self.final_layer_norm(hidden_states) -- hidden_states = self.dropout(hidden_states).half() -+ hidden_states = self.dropout(hidden_states) - - # Add last layer - if output_hidden_states: -@@ -1292,17 +1164,13 @@ class T5Stack(T5PreTrainedModel): - ] - if v is not None - ) -- present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None -- if not self.is_decoder and self.encodecrosskeyvalue: -- res = self.encodecrosskeyvalue(hidden_states) -- return tuple((hidden_states, res)) -- lm_logits = None -- if self.is_decoder: -- #logits = None -- if self.config.tie_word_embeddings: -- hidden_states = hidden_states * (self.model_dim ** -0.5) -- lm_logits = self.lm_head(hidden_states) -- return tuple((lm_logits, present_key_value_states)) -+ return BaseModelOutputWithPastAndCrossAttentions( -+ last_hidden_state=hidden_states, -+ past_key_values=present_key_value_states, -+ hidden_states=all_hidden_states, -+ attentions=all_attentions, -+ cross_attentions=all_cross_attentions, -+ ) - - - T5_START_DOCSTRING = r""" -@@ -1673,31 +1541,6 @@ class T5Model(T5PreTrainedModel): - ) - - -- --class EncoderToCrossKeyValue(nn.Module): -- def __init__(self, cross_key, cross_value, num_heads, d_kv): -- super().__init__() -- self.cross_key = cross_key -- self.cross_value = cross_value -- self.num_heads = num_heads -- self.d_kv = d_kv -- -- -- def 
forward(self, hidden_states): -- batch_size = hidden_states.shape[0] -- encoder_hidden_states_kvs = [] -- # for i in range(len(self.cross_value)): -- # encoder_hidden_states_kvs.append( -- # torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), -- # self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) -- for i in range(len(self.cross_value)): -- encoder_hidden_states_kvs.append( -- torch.stack((self.cross_key[i](hidden_states), -- self.cross_value[i](hidden_states)), dim=0)) -- past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0) -- return past_cross_key_values -- -- - @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) - class T5ForConditionalGeneration(T5PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [ -@@ -1705,47 +1548,28 @@ class T5ForConditionalGeneration(T5PreTr - ] - _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] - -- def __init__(self, config: T5Config, encoder_path=None, decoder_path=None, device_id=0): -+ def __init__(self, config: T5Config): - super().__init__(config) -- self.encoder_path = encoder_path -- self.decoder_path = decoder_path -- if not self.encoder_path or not self.decoder_path: -- self.model_dim = config.d_model -- -- self.shared = nn.Embedding(config.vocab_size, config.d_model) -- -- decoder_config = copy.deepcopy(config) -- decoder_config.is_decoder = True -- decoder_config.is_encoder_decoder = False -- decoder_config.num_layers = config.num_decoder_layers -- -- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) -- self.decoder = T5Stack(decoder_config, self.shared, self.lm_head) -- -- cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) -- cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) -- encodecrosskeyvalue = EncoderToCrossKeyValue(cross_key, cross_value, config.num_heads, config.d_kv) -- -- encoder_config = copy.deepcopy(config) -- encoder_config.is_decoder = False -- encoder_config.use_cache = False -- encoder_config.is_encoder_decoder = False -- self.encoder = T5Stack(encoder_config, self.shared, encodecrosskeyvalue=encodecrosskeyvalue) -- self.encoder_mindie = None -- self.decoder_mindie = None -- if self.encoder_path: -- self.encoder_mindie = torch.jit.load(self.encoder_path) -- if self.decoder_path: -- self.decoder_mindie = torch.jit.load(self.decoder_path) -- self.stream = torch.npu.Stream(f"npu:{device_id}") -- self.device_id = device_id -- -- -- def get_device(self): -- return f"npu:{self.device_id}" -+ self.model_dim = config.d_model -+ -+ self.shared = nn.Embedding(config.vocab_size, config.d_model) -+ -+ encoder_config = copy.deepcopy(config) -+ encoder_config.is_decoder = False -+ encoder_config.use_cache = False -+ encoder_config.is_encoder_decoder = False -+ self.encoder = T5Stack(encoder_config, self.shared) -+ -+ decoder_config = copy.deepcopy(config) -+ decoder_config.is_decoder = True -+ decoder_config.is_encoder_decoder = False -+ decoder_config.num_layers = config.num_decoder_layers -+ self.decoder = T5Stack(decoder_config, self.shared) -+ -+ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) - - # Initialize weights and apply final processing -- # self.post_init() -+ self.post_init() - - # Model parallel - self.model_parallel 
= False -@@ -1824,7 +1648,6 @@ class T5ForConditionalGeneration(T5PreTr - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, -- past_cross_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, -@@ -1872,23 +1695,76 @@ class T5ForConditionalGeneration(T5PreTr - if self.config.num_layers == self.config.num_decoder_layers: - warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) - decoder_head_mask = head_mask -- -- hidden_states = encoder_outputs["last_hidden_state"] -- past_cross_key_values = encoder_outputs["past_cross_key_values"] - -- # if self.model_parallel: -- # torch.cuda.set_device(self.decoder.first_device) -+ # Encode if needed (training, first prediction pass) -+ if encoder_outputs is None: -+ # Convert encoder inputs in embeddings if needed -+ encoder_outputs = self.encoder( -+ input_ids=input_ids, -+ attention_mask=attention_mask, -+ inputs_embeds=inputs_embeds, -+ head_mask=head_mask, -+ output_attentions=output_attentions, -+ output_hidden_states=output_hidden_states, -+ return_dict=return_dict, -+ ) -+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): -+ encoder_outputs = BaseModelOutput( -+ last_hidden_state=encoder_outputs[0], -+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, -+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, -+ ) -+ -+ hidden_states = encoder_outputs[0] -+ -+ if self.model_parallel: -+ torch.cuda.set_device(self.decoder.first_device) - - if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: - # get decoder inputs from shifting lm labels to the right - decoder_input_ids = self._shift_right(labels) - -- with torch.npu.stream(self.stream): # set stream -- # import pdb -- # pdb.set_trace() -- decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) -- self.stream.synchronize() # synchronize -- # decoder_outputs = self.decoder(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) -+ # Set device for model parallelism -+ if self.model_parallel: -+ torch.cuda.set_device(self.decoder.first_device) -+ hidden_states = hidden_states.to(self.decoder.first_device) -+ if decoder_input_ids is not None: -+ decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) -+ if attention_mask is not None: -+ attention_mask = attention_mask.to(self.decoder.first_device) -+ if decoder_attention_mask is not None: -+ decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) -+ -+ # Decode -+ decoder_outputs = self.decoder( -+ input_ids=decoder_input_ids, -+ attention_mask=decoder_attention_mask, -+ inputs_embeds=decoder_inputs_embeds, -+ past_key_values=past_key_values, -+ encoder_hidden_states=hidden_states, -+ encoder_attention_mask=attention_mask, -+ head_mask=decoder_head_mask, -+ cross_attn_head_mask=cross_attn_head_mask, -+ use_cache=use_cache, -+ output_attentions=output_attentions, -+ output_hidden_states=output_hidden_states, -+ return_dict=return_dict, -+ ) -+ -+ sequence_output = decoder_outputs[0] -+ -+ # Set device for model parallelism -+ if self.model_parallel: -+ torch.cuda.set_device(self.encoder.first_device) -+ self.lm_head = 
self.lm_head.to(self.encoder.first_device) -+ sequence_output = sequence_output.to(self.lm_head.weight.device) -+ -+ if self.config.tie_word_embeddings: -+ # Rescale output before projecting on vocab -+ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 -+ sequence_output = sequence_output * (self.model_dim**-0.5) -+ -+ lm_logits = self.lm_head(sequence_output) - - loss = None - if labels is not None: -@@ -1901,17 +1777,23 @@ class T5ForConditionalGeneration(T5PreTr - if not return_dict: - output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs - return ((loss,) + output) if loss is not None else output -+ - return Seq2SeqLMOutput( - loss=loss, -- logits=decoder_outputs[0], -- past_key_values=decoder_outputs[1] -+ logits=lm_logits, -+ past_key_values=decoder_outputs.past_key_values, -+ decoder_hidden_states=decoder_outputs.hidden_states, -+ decoder_attentions=decoder_outputs.attentions, -+ cross_attentions=decoder_outputs.cross_attentions, -+ encoder_last_hidden_state=encoder_outputs.last_hidden_state, -+ encoder_hidden_states=encoder_outputs.hidden_states, -+ encoder_attentions=encoder_outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, -- past_cross_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, -@@ -1937,7 +1819,6 @@ class T5ForConditionalGeneration(T5PreTr - return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, -- "past_cross_key_values": past_cross_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, -@@ -2086,6 +1967,7 @@ class T5EncoderModel(T5PreTrainedModel): - >>> last_hidden_states = outputs.last_hidden_state - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict -+ - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, -- Gitee From 6f87011e6e1d32ec35dc05fe082fe6193c29832d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:54:26 +0000 Subject: [PATCH 047/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/modeling=5Futils.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/T5/modeling_utils.patch | 41 ------------------- 1 file changed, 41 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch deleted file mode 100644 index 1b9fef8cd2..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch +++ /dev/null @@ -1,41 +0,0 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/modeling_utils.py 2024-08-28 20:15:38.524000000 +0800 -+++ modeling_utils.py 2024-09-02 17:29:43.700000000 +0800 -@@ -975,7 +975,7 @@ class ModuleUtilsMixin: - `torch.device`: The device on which the module is (assuming that all the module parameters are on the same - device). 
- """ -- return self.get_device() -+ return get_parameter_device(self) - - @property - def dtype(self) -> torch.dtype: -@@ -1004,8 +1004,7 @@ class ModuleUtilsMixin: - # encoder_extended_attention_mask = (encoder_extended_attention_mask == - # encoder_extended_attention_mask.transpose(-1, -2)) - encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility -- #encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min -- encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 -+ encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min - - return encoder_extended_attention_mask - -@@ -1019,9 +1018,7 @@ class ModuleUtilsMixin: - device = attention_mask.device - batch_size, seq_length = input_shape - seq_ids = torch.arange(seq_length, device=device) -- # print("seq_ids=",seq_ids) - causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] -- # print("causal_mask=",causal_mask) - # in case past_key_values are used we need to add a prefix ones mask to the causal mask - # causal and attention masks must have same type with pytorch version < 1.3 - causal_mask = causal_mask.to(attention_mask.dtype) -@@ -1088,8 +1085,7 @@ class ModuleUtilsMixin: - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility -- #extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min -- extended_attention_mask = (1.0 - extended_attention_mask) * -1000 -+ extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min - return extended_attention_mask - - def get_head_mask( -- Gitee From a87eaa722ec0f636c0c9aa7bf1c5b43e27366f4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:57:09 +0000 Subject: [PATCH 048/110] add MindIE/MindIE-Torch/built-in/T5. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 -- Gitee From 83fcd45b764f7aafad49e04a9c9582beb7161c08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:57:33 +0000 Subject: [PATCH 049/110] add MindIE/MindIE-Torch/built-in/T5/modeling_t5.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/modeling_t5.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_t5.py diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 80d5d1980a3499cc6f9f50c7fdcedf1c4781f747 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:58:09 +0000 Subject: [PATCH 050/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../MindIE-Torch/built-in/T5/modeling_t5.py | 1045 +++++++++++++++++ 1 file changed, 1045 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py index e69de29bb2..c764d99e7b 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py @@ -0,0 +1,1045 @@ +# coding=utf-8 +# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch T5 model.""" + +import copy +import math +import os +import warnings +from typing import List, Optional, Tuple, Union +from dataclasses import dataclass +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +# import torch_npu +import mindietorch + + + + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + DUMMY_INPUTS, + DUMMY_MASK, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_torch_fx_proxy, + logging, + replace_return_docstrings, +) +from ...utils.model_parallel_utils import assert_device_map, get_device_map +from .configuration_t5 import T5Config +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList +from transformers.generation.configuration_utils import GenerationMode +from transformers.utils.generic import ModelOutput + + +@dataclass +class Seq2SeqLMOutput(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
+ + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + past_keys: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "T5Config" +_CHECKPOINT_FOR_DOC = "google-t5/t5-small" + +#################################################### +# This dict contains ids and associated url +# for the pretrained weights provided with the models +#################################################### + + +#################################################### +# This is a conversion method from TF 1.0 to PyTorch +# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28 +#################################################### +def load_tf_weights_in_t5(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + tf_weights = {} + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + tf_weights[name] = array + + for txt_name in names: + name = txt_name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + tf_weights.pop(txt_name, None) + continue + if "_slot_" in name[-1]: + logger.info(f"Skipping {'/'.join(name)}") + tf_weights.pop(txt_name, None) + continue + pointer = model + array = tf_weights[txt_name] + + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] in ["kernel", "scale", "embedding"]: + pointer = getattr(pointer, "weight") + elif scope_names[0] == "self_attention": + pointer = getattr(pointer, "layer") + pointer = pointer[0] + elif scope_names[0] == "enc_dec_attention": + pointer = getattr(pointer, "layer") + pointer = pointer[1] + elif scope_names[0] == "dense_relu_dense": + pointer = getattr(pointer, "layer") + pointer = pointer[2] + elif scope_names[0] == "rms_norm": + if hasattr(pointer, "layer_norm"): + pointer = getattr(pointer, "layer_norm") + elif hasattr(pointer, "final_layer_norm"): + pointer = getattr(pointer, "final_layer_norm") + 
elif scope_names[0] == "scale": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + elif scope_names[0] == "decoder" and name[1] == "logits": + continue + elif scope_names[0] == "logits": + pointer = getattr(pointer, "lm_head") + elif scope_names[0] == "wi" and len(scope_names) > 1 and scope_names[1].isdigit(): + pointer = getattr(pointer, f"wi_{scope_names[1]}") + continue + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if scope_names[0] not in ["kernel", "scale", "embedding"]: + pointer = getattr(pointer, "weight") + if scope_names[0] != "embedding": + logger.info(f"Transposing numpy weight of shape {array.shape} for {name}") + array = np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array.astype(np.float32)) + tf_weights.pop(txt_name, None) + + logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.") + return model + + +#################################################### +# PyTorch Models are constructed by sub-classing +# - torch.nn.Module for the layers and +# - PreTrainedModel for the models (it-self a sub-class of nn.Module) +#################################################### +PARALLELIZE_DOCSTRING = r""" + This is an experimental feature and is a subject to change at a moment's notice. + + Uses a device map to distribute attention modules of the model across several devices. If no device map is given, + it will evenly distribute blocks across all devices. + + Args: + device_map (`Dict[int, list]`, optional, defaults to None): + A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always + automatically mapped to the first device (for esoteric reasons). That means that the first device should + have fewer attention modules mapped to it than other devices. For reference, the t5 models have the + following number of attention modules: + + - google-t5/t5-small: 6 + - google-t5/t5-base: 12 + - google-t5/t5-large: 24 + - google-t5/t5-3b: 24 + - google-t5/t5-11b: 24 + + Example: + + ```python + # Here is an example of a device map on a machine with 4 GPUs using google-t5/t5-3b, which has a total of 24 attention modules: + model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-3b") + device_map = { + 0: [0, 1, 2], + 1: [3, 4, 5, 6, 7, 8, 9], + 2: [10, 11, 12, 13, 14, 15, 16], + 3: [17, 18, 19, 20, 21, 22, 23], + } + model.parallelize(device_map) + ``` +""" +DEPARALLELIZE_DOCSTRING = r""" + Moves the model to cpu from a model parallel state. 
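+    It reverses a previous parallelize() call, moving every block back to the CPU and freeing accelerator memory
+    via torch.cuda.empty_cache().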
+ + Example: + + ```python + # On a 4 GPU machine with google-t5/t5-3b: + model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-3b") + device_map = { + 0: [0, 1, 2], + 1: [3, 4, 5, 6, 7, 8, 9], + 2: [10, 11, 12, 13, 14, 15, 16], + 3: [17, 18, 19, 20, 21, 22, 23], + } + model.parallelize(device_map) # Splits the model across several devices + model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache() + ``` +""" + + +class T5LayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Construct a layernorm module in the T5 style. No bias and no subtraction of mean. + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated + # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for + # half-precision inputs is done in fp32 + + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states + + +try: + from apex.normalization import FusedRMSNorm + + T5LayerNorm = FusedRMSNorm # noqa + + logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm") +except ImportError: + # using the normal T5LayerNorm + pass +except Exception: + logger.warning("discovered apex but it failed to load, falling back to T5LayerNorm") + pass + +ALL_LAYERNORM_LAYERS.append(T5LayerNorm) + + +class T5DenseActDense(nn.Module): + def __init__(self, config: T5Config): + super().__init__() + self.wi = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.act = ACT2FN[config.dense_act_fn] + + def forward(self, hidden_states): + hidden_states = self.wi(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.dropout(hidden_states) + if ( + isinstance(self.wo.weight, torch.Tensor) + and hidden_states.dtype != self.wo.weight.dtype + and self.wo.weight.dtype != torch.int8 + ): + hidden_states = hidden_states.to(self.wo.weight.dtype) + hidden_states = self.wo(hidden_states) + return hidden_states + + +class T5DenseGatedActDense(nn.Module): + def __init__(self, config: T5Config): + super().__init__() + self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.act = ACT2FN[config.dense_act_fn] + + def forward(self, hidden_states): + hidden_gelu = self.act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + + # To make 8bit quantization work for google/flan-t5-xxl, self.wo is kept in float32. 
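+        # In that case the activations are cast to wo's dtype just before the projection below; int8-quantized
+        # weights are deliberately excluded from this cast.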
+ # See https://github.com/huggingface/transformers/issues/20287 + # we also make sure the weights are not in `int8` in case users will force `_keep_in_fp32_modules` to be `None`` + if ( + isinstance(self.wo.weight, torch.Tensor) + and hidden_states.dtype != self.wo.weight.dtype + and self.wo.weight.dtype != torch.int8 + ): + hidden_states = hidden_states.to(self.wo.weight.dtype) + + hidden_states = self.wo(hidden_states) + return hidden_states + + +class T5LayerFF(nn.Module): + def __init__(self, config: T5Config): + super().__init__() + if config.is_gated_act: + self.DenseReluDense = T5DenseGatedActDense(config) + else: + self.DenseReluDense = T5DenseActDense(config) + + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + forwarded_states = self.DenseReluDense(forwarded_states) + hidden_states = hidden_states + self.dropout(forwarded_states) + return hidden_states + + +class T5Attention(nn.Module): + def __init__(self, config: T5Config, has_relative_attention_bias=False): + super().__init__() + self.is_decoder = config.is_decoder + self.has_relative_attention_bias = has_relative_attention_bias + self.relative_attention_num_buckets = config.relative_attention_num_buckets + self.relative_attention_max_distance = config.relative_attention_max_distance + self.d_model = config.d_model + self.key_value_proj_dim = config.d_kv + self.n_heads = config.num_heads + self.dropout = config.dropout_rate + self.inner_dim = self.n_heads * self.key_value_proj_dim + + # Mesh TensorFlow initialization to avoid scaling before softmax + self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.k = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.v = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.o = nn.Linear(self.inner_dim, self.d_model, bias=False) + + if self.has_relative_attention_bias: + self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) + self.pruned_heads = set() + self.gradient_checkpointing = False + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads + ) + # Prune linear layers + self.q = prune_linear_layer(self.q, index) + self.k = prune_linear_layer(self.k, index) + self.v = prune_linear_layer(self.v, index) + self.o = prune_linear_layer(self.o, index, dim=1) + # Update hyper params + self.n_heads = self.n_heads - len(heads) + self.inner_dim = self.key_value_proj_dim * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @staticmethod + def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. 
All relative positions <=-max_distance map to the same bucket. + This should allow for more graceful generalization to longer sequences than the model has been trained on + + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + + Returns: + a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) + """ + relative_buckets = 0 + if bidirectional: + num_buckets //= 2 + relative_buckets += (relative_position > 0).to(torch.long) * num_buckets + relative_position = torch.abs(relative_position) + else: + relative_position = -torch.min(relative_position, torch.zeros_like(relative_position)) + # now relative_position is in the range [0, inf) + + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + relative_position_if_large = max_exact + ( + torch.log(relative_position.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).to(torch.long) + relative_position_if_large = torch.min( + relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1) + ) + + relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) + return relative_buckets + + def compute_bias(self, query_length, key_length, device=None): + """Compute binned relative position bias""" + if device is None: + device = self.relative_attention_bias.weight.device + context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] + relative_position = memory_position - context_position # shape (query_length, key_length) + relative_position_bucket = self._relative_position_bucket( + relative_position, # shape (query_length, key_length) + bidirectional=(not self.is_decoder), + num_buckets=self.relative_attention_num_buckets, + max_distance=self.relative_attention_max_distance, + ) + values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads) + values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length) + return values + + def forward( + self, + hidden_states, + mask=None, + key_value_states=None, + position_bias=None, + past_key=None, + past_value=None, + past_cross_key=None, + past_cross_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, + output_attentions=False, + ): + """ + Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). 
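+        When used in the decoder with use_cache=True, past_key/past_value supply previously computed key/value
+        projections, and the updated key/value states (cast to fp16) are returned together with the attention
+        output and the position bias.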
+ """ + # Input is (batch_size, seq_length, dim) + # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) + # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + batch_size, seq_length = hidden_states.shape[:2] + + real_seq_length = seq_length + + if past_key is not None: + real_seq_length += past_key.shape[2] if query_length is None else query_length + + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] + + def shape(states): + """projection""" + return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + def unshape(states): + """reshape""" + return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + + def project(hidden_states, proj_layer, key_value_states, past_key_value): + """projects hidden states correctly to key/query states""" + if key_value_states is None: + # self-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(hidden_states)) + elif past_key_value is None: + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: + past_key_value = shape(past_key_value) + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) + hidden_states = torch.cat([past_key_value, hidden_states], dim=2) + # elif past_key_value.shape[2] != key_value_states.shape[1]: + # # checking that the `sequence_length` of the `past_key_value` is the same as + # # the provided `key_value_states` to support prefix tuning + # # cross-attn + # # (batch_size, n_heads, seq_length, dim_per_head) + # hidden_states = shape(proj_layer(key_value_states)) + else: + # cross-attn + hidden_states = past_key_value + return hidden_states + + # get query states + query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) + + # get key/value states + key_states = project( + hidden_states, self.k, key_value_states, past_key if past_key is not None else None + ) + value_states = project( + hidden_states, self.v, key_value_states, past_value if past_value is not None else None + ) + # torch.ops.mindie.flash_attention_plugin(query_states, key_states, value_states,) + # compute scores + scores = torch.matmul( + query_states, key_states.transpose(3, 2) + ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype + ) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: + position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) + + # if key and values are already calculated + # we want only the last query position bias + if past_key is not None: + position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + + if mask is not None: + position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + + if self.pruned_heads: + mask = torch.ones(position_bias.shape[1]) + mask[list(self.pruned_heads)] = 0 + position_bias_masked = position_bias[:, mask.bool()] + else: + position_bias_masked = position_bias + scores += position_bias_masked + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( + scores + ) # (batch_size, n_heads, seq_length, 
key_length) + attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) # (batch_size, n_heads, seq_length, key_length) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = attn_weights * layer_head_mask + + attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) + + # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None + present_value_state = (value_states.half(),) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) + + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + +class T5SelfAttention(T5Attention): + def __init__(self, config: T5Config, has_relative_attention_bias=False): + super().__init__(config, has_relative_attention_bias) + + def forward( + self, + hidden_states, + mask=None, + position_bias=None, + past_key=None, + past_value=None, + layer_head_mask=None, + use_cache=False, + output_attentions=False, + ): + """ + Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). + """ + # Input is (batch_size, seq_length, dim) + # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) + # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + batch_size, seq_length = hidden_states.shape[:2] + + real_seq_length = seq_length + + if past_key is not None: + real_seq_length += past_key.shape[2] + key_length = real_seq_length + def shape(states): + """projection""" + return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + def unshape(states): + """reshape""" + return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + + def project(hidden_states, proj_layer, past_key_value): + """projects hidden states correctly to key/query states""" + if past_key_value is None: + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(hidden_states)) + + if past_key_value is not None: + hidden_states = shape(proj_layer(hidden_states)) + hidden_states = torch.cat([past_key_value, hidden_states], dim=2) + return hidden_states + + # get query states + query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) + + # get key/value states + key_states = project( + hidden_states, self.k, past_key if past_key is not None else None + ) + value_states = project( + hidden_states, self.v, past_value if past_value is not None else None + ) + # compute scores + scores = torch.matmul( + query_states, key_states.transpose(3, 2) + ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype + ) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: + position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) + + # if key and values are already calculated + # we want only the last query position bias + if past_key is not None: + position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + if mask 
is not None: + position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + + if self.pruned_heads: + mask = torch.ones(position_bias.shape[1]) + mask[list(self.pruned_heads)] = 0 + position_bias_masked = position_bias[:, mask.bool()] + else: + position_bias_masked = position_bias + scores += position_bias_masked + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( + scores + ) # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) # (batch_size, n_heads, seq_length, key_length) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = attn_weights * layer_head_mask + + attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) + + # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None + present_value_state = (value_states.half(), ) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + + + +class T5LayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key=None, + past_value=None, + use_cache=False, + output_attentions=False, + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.SelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key=past_key, + past_value=past_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them + return outputs + + +class T5LayerCrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + key_value_states, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key=None, + past_value=None, + use_cache=False, + query_length=None, + output_attentions=False, + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.EncDecAttention( + normed_hidden_states, + mask=attention_mask, + key_value_states=key_value_states, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key=past_key, + past_value=past_value, + use_cache=use_cache, + query_length=query_length, + output_attentions=output_attentions, + ) + layer_output = hidden_states + self.dropout(attention_output[0]) + outputs = (layer_output,) + attention_output[1:] # add attentions if we output them + return outputs + + +class T5Block(nn.Module): + def __init__(self, config, 
has_relative_attention_bias=False): + super().__init__() + self.is_decoder = config.is_decoder + self.layer = nn.ModuleList() + self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) + if self.is_decoder: + self.layer.append(T5LayerCrossAttention(config)) + + self.layer.append(T5LayerFF(config)) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key=None, + past_value=None, + past_cross_key=None, + past_cross_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): + if past_key is not None: + self_attn_past_key = past_key + self_attn_past_value = past_value + cross_attn_past_key = past_cross_key + cross_attn_past_value = past_cross_value + else: + self_attn_past_key, self_attn_past_value, cross_attn_past_key, cross_attn_past_value = None, None, None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key=self_attn_past_key, + past_value=self_attn_past_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_state, present_value_state = self_attention_outputs[:3] + attention_outputs = self_attention_outputs[3:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: + clamp_value = torch.where( + torch.isinf(hidden_states).any(), + torch.finfo(hidden_states.dtype).max - 1000, + torch.finfo(hidden_states.dtype).max, + ) + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: + + # the actual query length is unknown for cross attention + # if using past key value states. 
Need to inject it here + if present_key_state is not None: + query_length = present_key_state[0].shape[2] + else: + query_length = None + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, + past_key=cross_attn_past_key, + past_value=cross_attn_past_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: + clamp_value = torch.where( + torch.isinf(hidden_states).any(), + torch.finfo(hidden_states.dtype).max - 1000, + torch.finfo(hidden_states.dtype).max, + ) + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states + # cross_attn_past_key_values = cross_attention_outputs[1] + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[3:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: + clamp_value = torch.where( + torch.isinf(hidden_states).any(), + torch.finfo(hidden_states.dtype).max - 1000, + torch.finfo(hidden_states.dtype).max, + ) + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if use_cache: + outputs = outputs + (present_key_state,) +(present_value_state,)+ attention_outputs + else: + outputs = outputs + attention_outputs + + return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + + +class T5ClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config: T5Config): + super().__init__() + self.dense = nn.Linear(config.d_model, config.d_model) + self.dropout = nn.Dropout(p=config.classifier_dropout) + self.out_proj = nn.Linear(config.d_model, config.num_labels) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class T5PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = T5Config + load_tf_weights = load_tf_weights_in_t5 + base_model_prefix = "transformer" + is_parallelizable = True + supports_gradient_checkpointing = True + _no_split_modules = ["T5Block"] + _keep_in_fp32_modules = ["wo"] + + @property + def dummy_inputs(self): + input_ids = torch.tensor(DUMMY_INPUTS) + input_mask = torch.tensor(DUMMY_MASK) + dummy_inputs = { + "decoder_input_ids": input_ids, + "input_ids": input_ids, + "decoder_attention_mask": input_mask, + } + return dummy_inputs + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor # Used for testing weights initialization + if isinstance(module, T5LayerNorm): + module.weight.data.fill_(factor * 1.0) + elif isinstance( + module, + (T5Model, T5ForConditionalGeneration, T5EncoderModel, T5ForQuestionAnswering), + ): + # Mesh TensorFlow embeddings initialization + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 + module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) + if hasattr(module, "lm_head") and not self.config.tie_word_embeddings: + module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0) + if hasattr(module, "qa_outputs"): + module.qa_outputs.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + module.qa_outputs.bias.data.zero_() + elif isinstance(module, T5ForTokenClassification): + if hasattr(module, "classifier"): + module.classifier.weight.data.normal_(mean=0.0, std=factor * 1.0) + module.classifier.bias.data.zero_() + elif isinstance(module, T5ClassificationHead): + module.dense.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.dense, "bias") and module.dense.bias is not None: + module.dense.bias.data.zero_() + module.out_proj.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.out_proj, "bias") and module.out_proj.bias is not None: + module.out_proj.bias.data.zero_() + elif isinstance(module, T5DenseActDense): + # Mesh TensorFlow FF initialization + # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 + # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 + module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi, "bias") and module.wi.bias is not None: + module.wi.bias.data.zero_() + module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) + if hasattr(module.wo, "bias") and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, T5DenseGatedActDense): + module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None: + module.wi_0.bias.data.zero_() + module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None: + module.wi_1.bias.data.zero_() + module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) + if hasattr(module.wo, "bias") and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, T5Attention): + # Mesh TensorFlow attention initialization to avoid scaling before softmax + # See 
https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 + d_model = self.config.d_model + key_value_proj_dim = self.config.d_kv + n_heads = self.config.num_heads + module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5)) + module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) + module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) + module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5)) + if module.has_relative_attention_bias: + module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5)) + + def _shift_right(self, input_ids): + decoder_start_token_id = self.config.decoder_start_token_id + pad_token_id = self.config.pad_token_id + + if decoder_start_token_id is None: + raise ValueError( + "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. " + "See T5 docs for more information." + ) + + # shift inputs to the right + if is_torch_fx_proxy(input_ids): + # Item assignment is not supported natively for proxies. + shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id) + shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) + else: + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids \ No newline at end of file -- Gitee From 05699199272770fb3c8e7dd1bb38a9c1b4e7a826 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 03:00:15 +0000 Subject: [PATCH 051/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../MindIE-Torch/built-in/T5/modeling_t5.py | 470 +++++++++++++++++- 1 file changed, 469 insertions(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py index c764d99e7b..99fd48535a 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py @@ -1042,4 +1042,472 @@ class T5PreTrainedModel(PreTrainedModel): # replace possible -100 values in labels by `pad_token_id` shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) - return shifted_input_ids \ No newline at end of file + return shifted_input_ids + +class T5Stack(T5PreTrainedModel): + def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskey=None, encodecrossvalue=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder + self.lm_head=lm_head + self.encodecrosskey = encodecrosskey + self.encodecrossvalue = encodecrossvalue + self.model_dim = config.d_model + + self.block = nn.ModuleList( + [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + ) + self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + # Initialize weights and apply final processing + self.post_init() + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + warnings.warn( + "`T5Stack.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model" + " with `device_map='balanced'` in the call to `from_pretrained`. 
You can also provide your own" + " `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0," + " 'block.1': 1, ...}", + FutureWarning, + ) + # Check validity of device_map + self.device_map = ( + get_device_map(len(self.block), range(torch.cuda.device_count())) if device_map is None else device_map + ) + assert_device_map(self.device_map, len(self.block)) + self.model_parallel = True + self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys())) + self.last_device = "cuda:" + str(max(self.device_map.keys())) + # Load onto devices + for k, v in self.device_map.items(): + for layer in v: + cuda_device = "cuda:" + str(k) + self.block[layer] = self.block[layer].to(cuda_device) + + # Set embed_tokens to first layer + self.embed_tokens = self.embed_tokens.to(self.first_device) + # Set final layer norm to last device + self.final_layer_norm = self.final_layer_norm.to(self.last_device) + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) + self.model_parallel = False + self.device_map = None + self.first_device = "cpu" + self.last_device = "cpu" + for i in range(len(self.block)): + self.block[i] = self.block[i].to("cpu") + self.embed_tokens = self.embed_tokens.to("cpu") + self.final_layer_norm = self.final_layer_norm.to("cpu") + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + + def forward( + self, + input_ids=None, + encoder_hidden_states=None, + past_keys=None, + past_values=None, + past_cross_keys=None, + past_cross_values=None, + encoder_attention_mask=None, + attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **model_kwargs + ): + # Model parallel + if self.model_parallel: + torch.cuda.set_device(self.first_device) + self.embed_tokens = self.embed_tokens.to(self.first_device) + use_cache = use_cache if use_cache is not None else self.config.use_cache + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError( + f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" + ) + elif input_ids is not None: + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") + + if inputs_embeds is None: + if self.embed_tokens is None: + raise ValueError("You have to initialize the model with valid token embeddings") + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape + # required mask seq length can be calculated via length 
of past + mask_seq_length = past_keys[0].shape[2] + seq_length if past_keys is not None else seq_length + + if use_cache is True: + if not self.is_decoder: + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") + + # initialize past_key_values with `None` if past does not exist + if not self.is_decoder: + past_keys = [None] * len(self.block) + past_values = [None] * len(self.block) + past_cross_keys = [None] * len(self.block) + past_cross_values = [None] * len(self.block) + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long + ) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) + present_key_states = () if use_cache else None + present_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None + position_bias = None + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) + # for i, layer_module in enumerate(self.block): + for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel + if self.model_parallel: + torch.cuda.set_device(hidden_states.device) + # Ensure that attention_mask is always on the same device as hidden_states + if attention_mask is not None: + attention_mask = attention_mask.to(hidden_states.device) + if position_bias is not None: + position_bias = position_bias.to(hidden_states.device) + if encoder_hidden_states is not None: + encoder_hidden_states = encoder_hidden_states.to(hidden_states.device) + if encoder_extended_attention_mask is not None: + encoder_extended_attention_mask = encoder_extended_attention_mask.to(hidden_states.device) + if encoder_decoder_position_bias is not None: + encoder_decoder_position_bias = encoder_decoder_position_bias.to(hidden_states.device) + if layer_head_mask is not None: + layer_head_mask = layer_head_mask.to(hidden_states.device) + if cross_attn_layer_head_mask is not None: 
+ cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(hidden_states.device) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.forward, + hidden_states, + extended_attention_mask, + position_bias, + encoder_hidden_states, + encoder_extended_attention_mask, + encoder_decoder_position_bias, + layer_head_mask, + cross_attn_layer_head_mask, + None, # past_key_value is always None with gradient checkpointing + use_cache, + output_attentions, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, + past_key=past_key, + past_value=past_value, + past_cross_key=past_cross_key, + past_cross_value=past_cross_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: + layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] + + hidden_states, present_key_state, present_value_state = layer_outputs[:3] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) + position_bias = layer_outputs[3] + if self.is_decoder and encoder_hidden_states is not None: + encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] + # append next layer key value states + if use_cache: + present_key_states = present_key_states + present_key_state + present_value_states = present_value_states + present_value_state + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) + if self.is_decoder: + all_cross_attentions = all_cross_attentions + (layer_outputs[5],) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and "cuda:" + str(k) != self.last_device: + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states).half() + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + present_key_value_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] + if v is not None + ) + if not self.is_decoder: + cross_keys = None + cross_values = None + if self.encodecrosskey: + cross_keys = self.encodecrosskey(hidden_states) + if self.encodecrossvalue: + cross_values = self.encodecrossvalue(hidden_states) + return tuple((hidden_states, cross_keys, cross_values)) + lm_logits = None + if self.is_decoder: + if self.config.tie_word_embeddings: + hidden_states = hidden_states * (self.model_dim ** -0.5) + lm_logits = self.lm_head(hidden_states) + return tuple((lm_logits, present_key_states, present_value_states)) + 
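For orientation, the decoder-style stack above returns `(lm_logits, present_key_states, present_value_states)` each step, while the encoder-style path returns `(hidden_states, cross_keys, cross_values)`. The sketch below shows how a greedy decoding loop might drive such a pair. It is a minimal illustration only: `encoder_fn`, `decoder_fn`, the start/EOS token ids and the cache layout are hypothetical placeholders, not the exported modules' actual calling convention.

```python
# Minimal greedy-decoding sketch (illustrative only): `encoder_fn` and `decoder_fn`
# stand in for the encoder/decoder stacks; their signatures and the cache layout
# are assumptions, not the actual exported interface.
import torch

def greedy_decode(encoder_fn, decoder_fn, input_ids,
                  start_id=0, eos_id=1, max_new_tokens=20):
    # Encoder pass: hidden states plus precomputed cross-attention key/value caches.
    enc_hidden, cross_keys, cross_values = encoder_fn(input_ids)
    enc_mask = torch.ones_like(input_ids)

    batch_size = input_ids.shape[0]
    dec_input = torch.full((batch_size, 1), start_id, dtype=torch.long)
    past_keys = past_values = None  # self-attention caches grow step by step
    generated = []

    for _ in range(max_new_tokens):
        logits, past_keys, past_values = decoder_fn(
            dec_input, enc_hidden, enc_mask,
            past_keys, past_values, cross_keys, cross_values,
        )
        next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)  # greedy pick
        generated.append(next_token)
        if (next_token == eos_id).all():
            break
        dec_input = next_token  # only the newest token is fed back; the cache holds the rest

    return torch.cat(generated, dim=-1)
```

Only the newest token is fed back each step because the self-attention caches already cover earlier positions, which mirrors the `past_key`/`past_value` handling in the attention modules above.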
+ +class T5Stack_Encoder(T5PreTrainedModel): + def __init__(self, config, embed_tokens=None, encodecrosskey=None, encodecrossvalue=None): + super().__init__(config) + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder + self.encodecrosskey = encodecrosskey + self.encodecrossvalue = encodecrossvalue + self.model_dim = config.d_model + + self.block = nn.ModuleList( + [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + ) + self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + # Initialize weights and apply final processing + self.post_init() + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **model_kwargs + ): + # Model parallel + use_cache = use_cache if use_cache is not None else self.config.use_cache + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape + # required mask seq length can be calculated via length of past + mask_seq_length = seq_length + + # initialize past_key_values with `None` if past does not exist + past_keys = [None] * len(self.block) + past_values = [None] * len(self.block) + past_cross_keys = [None] * len(self.block) + past_cross_values = [None] * len(self.block) + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
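+        # Note (added): `get_extended_attention_mask`, inherited from `PreTrainedModel`, is expected to
+        # broadcast the [batch_size, seq_length] mask to [batch_size, 1, 1, seq_length] and map 1/0
+        # entries to 0.0 / large negative values, so the result can be added directly to attention scores.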
+ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + + encoder_extended_attention_mask = None + + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) + present_key_states = () if use_cache else None + present_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None + position_bias = None + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) + for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=None, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, + past_key=past_key, + past_value=past_value, + past_cross_key=past_cross_key, + past_cross_value=past_cross_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: + layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] + + hidden_states, present_key_state, present_value_state = layer_outputs[:3] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) + position_bias = layer_outputs[3] + if self.is_decoder and encoder_hidden_states is not None: + encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] + # append next layer key value states + if use_cache: + present_key_states = present_key_states + present_key_state + present_value_states = present_value_states + present_value_state + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) + if self.is_decoder: + all_cross_attentions = all_cross_attentions + (layer_outputs[5],) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and "cuda:" + str(k) != self.last_device: + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states).half() + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + present_key_value_states, + all_hidden_states, + 
all_attentions, + all_cross_attentions, + ] + if v is not None + ) + if not self.is_decoder: + cross_keys = None + cross_values = None + if self.encodecrosskey: + cross_keys = self.encodecrosskey(hidden_states) + if self.encodecrossvalue: + cross_values = self.encodecrossvalue(hidden_states) + return tuple((hidden_states, cross_keys, cross_values)) + + -- Gitee From fdb814b194ac34636783c0e39dd8936b0959479d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 03:01:13 +0000 Subject: [PATCH 052/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../MindIE-Torch/built-in/T5/modeling_t5.py | 151 ++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py index 99fd48535a..c6e5d57c8f 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py @@ -1510,4 +1510,155 @@ class T5Stack_Encoder(T5PreTrainedModel): cross_values = self.encodecrossvalue(hidden_states) return tuple((hidden_states, cross_keys, cross_values)) +T5_START_DOCSTRING = r""" + + The T5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text + Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan + Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a + text-to-text denoising generative setting. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`T5Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +T5_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you + should be able to pad the inputs on both the right and the left. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for detail. + + [What are input IDs?](../glossary#input-ids) + + To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training). + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. 
+ + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` + is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). + + To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5 + Training](./t5#training). + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the encoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in + `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, `optional`: *attentions*) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at + the output of the last layer of the encoder. Used in the cross-attention of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be + input (see `past_key_values`). This is useful if you want more control over how to convert + `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. 
+ + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. + + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +T5_ENCODER_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you + should be able to pad the inputs on both the right and the left. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for detail. + + To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training). + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + -- Gitee From 40152886b808171031d0cf5ad934672fe4291198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 03:04:42 +0000 Subject: [PATCH 053/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/modeling_t5.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py index c6e5d57c8f..039b3da657 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py @@ -1661,4 +1661,10 @@ T5_ENCODER_INPUTS_DOCSTRING = r""" Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ - +# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +__HEAD_MASK_WARNING_MSG = """ +The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently, +`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions. +If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers, +num_heads)`. +""" -- Gitee From 1a20d26c2e392e5be33f855a70e88a0607ea65c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 03:05:37 +0000 Subject: [PATCH 054/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/modeling_t5.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py index 039b3da657..d422aef611 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py @@ -1668,3 +1668,8 @@ The input argument `head_mask` was split into two arguments `head_mask` and `dec If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers, num_heads)`. """ + +@add_start_docstrings( + "The bare T5 Model transformer outputting raw hidden-states without any specific head on top.", + T5_START_DOCSTRING, +) \ No newline at end of file -- Gitee From ed7e43928f0deaeb37b67c9b2bff0983b2b05d1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 09:14:56 +0000 Subject: [PATCH 055/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/modeling=5Ft5.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../MindIE-Torch/built-in/T5/modeling_t5.py | 1675 ----------------- 1 file changed, 1675 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_t5.py diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py deleted file mode 100644 index d422aef611..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py +++ /dev/null @@ -1,1675 +0,0 @@ -# coding=utf-8 -# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch T5 model.""" - -import copy -import math -import os -import warnings -from typing import List, Optional, Tuple, Union -from dataclasses import dataclass -import torch -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -# import torch_npu -import mindietorch - - - - -from ...activations import ACT2FN -from ...modeling_outputs import ( - BaseModelOutput, - BaseModelOutputWithPastAndCrossAttentions, - Seq2SeqModelOutput, - Seq2SeqQuestionAnsweringModelOutput, - Seq2SeqSequenceClassifierOutput, - TokenClassifierOutput, -) -from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import ( - DUMMY_INPUTS, - DUMMY_MASK, - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_torch_fx_proxy, - logging, - replace_return_docstrings, -) -from ...utils.model_parallel_utils import assert_device_map, get_device_map -from .configuration_t5 import T5Config -from transformers.generation.logits_process import LogitsProcessorList -from transformers.generation.stopping_criteria import StoppingCriteriaList -from transformers.generation.configuration_utils import GenerationMode -from transformers.utils.generic import ModelOutput - - -@dataclass -class Seq2SeqLMOutput(ModelOutput): - """ - Base class for model's outputs, with potential hidden states and attentions. - - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ - loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - past_keys: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - past_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None - decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None - encoder_last_hidden_state: Optional[torch.FloatTensor] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "T5Config" -_CHECKPOINT_FOR_DOC = "google-t5/t5-small" - -#################################################### -# This dict contains ids and associated url -# for the pretrained weights provided with the models -#################################################### - - -#################################################### -# This is a conversion method from TF 1.0 to PyTorch -# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28 -#################################################### -def load_tf_weights_in_t5(model, config, tf_checkpoint_path): - """Load tf checkpoints in a pytorch model.""" - try: - import re - - import numpy as np - import tensorflow as tf - except ImportError: - logger.error( - "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions." - ) - raise - tf_path = os.path.abspath(tf_checkpoint_path) - logger.info(f"Converting TensorFlow checkpoint from {tf_path}") - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - tf_weights = {} - for name, shape in init_vars: - logger.info(f"Loading TF weight {name} with shape {shape}") - array = tf.train.load_variable(tf_path, name) - names.append(name) - tf_weights[name] = array - - for txt_name in names: - name = txt_name.split("/") - # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v - # which are not required for using pretrained model - if any( - n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] - for n in name - ): - logger.info(f"Skipping {'/'.join(name)}") - tf_weights.pop(txt_name, None) - continue - if "_slot_" in name[-1]: - logger.info(f"Skipping {'/'.join(name)}") - tf_weights.pop(txt_name, None) - continue - pointer = model - array = tf_weights[txt_name] - - for m_name in name: - if re.fullmatch(r"[A-Za-z]+_\d+", m_name): - scope_names = re.split(r"_(\d+)", m_name) - else: - scope_names = [m_name] - if scope_names[0] in ["kernel", "scale", "embedding"]: - pointer = getattr(pointer, "weight") - elif scope_names[0] == "self_attention": - pointer = getattr(pointer, "layer") - pointer = pointer[0] - elif scope_names[0] == "enc_dec_attention": - pointer = getattr(pointer, "layer") - pointer = pointer[1] - elif scope_names[0] == "dense_relu_dense": - pointer = getattr(pointer, "layer") - pointer = pointer[2] - elif scope_names[0] == "rms_norm": - if hasattr(pointer, "layer_norm"): - pointer = getattr(pointer, "layer_norm") - elif hasattr(pointer, "final_layer_norm"): - pointer = getattr(pointer, "final_layer_norm") - elif scope_names[0] == "scale": - pointer = getattr(pointer, "weight") - elif scope_names[0] == "output_bias" or 
scope_names[0] == "beta": - pointer = getattr(pointer, "bias") - elif scope_names[0] == "squad": - pointer = getattr(pointer, "classifier") - elif scope_names[0] == "decoder" and name[1] == "logits": - continue - elif scope_names[0] == "logits": - pointer = getattr(pointer, "lm_head") - elif scope_names[0] == "wi" and len(scope_names) > 1 and scope_names[1].isdigit(): - pointer = getattr(pointer, f"wi_{scope_names[1]}") - continue - else: - try: - pointer = getattr(pointer, scope_names[0]) - except AttributeError: - logger.info(f"Skipping {'/'.join(name)}") - continue - if len(scope_names) >= 2: - num = int(scope_names[1]) - pointer = pointer[num] - if scope_names[0] not in ["kernel", "scale", "embedding"]: - pointer = getattr(pointer, "weight") - if scope_names[0] != "embedding": - logger.info(f"Transposing numpy weight of shape {array.shape} for {name}") - array = np.transpose(array) - try: - if pointer.shape != array.shape: - raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") - except AssertionError as e: - e.args += (pointer.shape, array.shape) - raise - logger.info(f"Initialize PyTorch weight {name}") - pointer.data = torch.from_numpy(array.astype(np.float32)) - tf_weights.pop(txt_name, None) - - logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.") - return model - - -#################################################### -# PyTorch Models are constructed by sub-classing -# - torch.nn.Module for the layers and -# - PreTrainedModel for the models (it-self a sub-class of nn.Module) -#################################################### -PARALLELIZE_DOCSTRING = r""" - This is an experimental feature and is a subject to change at a moment's notice. - - Uses a device map to distribute attention modules of the model across several devices. If no device map is given, - it will evenly distribute blocks across all devices. - - Args: - device_map (`Dict[int, list]`, optional, defaults to None): - A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always - automatically mapped to the first device (for esoteric reasons). That means that the first device should - have fewer attention modules mapped to it than other devices. For reference, the t5 models have the - following number of attention modules: - - - google-t5/t5-small: 6 - - google-t5/t5-base: 12 - - google-t5/t5-large: 24 - - google-t5/t5-3b: 24 - - google-t5/t5-11b: 24 - - Example: - - ```python - # Here is an example of a device map on a machine with 4 GPUs using google-t5/t5-3b, which has a total of 24 attention modules: - model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-3b") - device_map = { - 0: [0, 1, 2], - 1: [3, 4, 5, 6, 7, 8, 9], - 2: [10, 11, 12, 13, 14, 15, 16], - 3: [17, 18, 19, 20, 21, 22, 23], - } - model.parallelize(device_map) - ``` -""" -DEPARALLELIZE_DOCSTRING = r""" - Moves the model to cpu from a model parallel state. 
- - Example: - - ```python - # On a 4 GPU machine with google-t5/t5-3b: - model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-3b") - device_map = { - 0: [0, 1, 2], - 1: [3, 4, 5, 6, 7, 8, 9], - 2: [10, 11, 12, 13, 14, 15, 16], - 3: [17, 18, 19, 20, 21, 22, 23], - } - model.parallelize(device_map) # Splits the model across several devices - model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache() - ``` -""" - - -class T5LayerNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - Construct a layernorm module in the T5 style. No bias and no subtraction of mean. - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean - # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated - # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for - # half-precision inputs is done in fp32 - - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states - - -try: - from apex.normalization import FusedRMSNorm - - T5LayerNorm = FusedRMSNorm # noqa - - logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm") -except ImportError: - # using the normal T5LayerNorm - pass -except Exception: - logger.warning("discovered apex but it failed to load, falling back to T5LayerNorm") - pass - -ALL_LAYERNORM_LAYERS.append(T5LayerNorm) - - -class T5DenseActDense(nn.Module): - def __init__(self, config: T5Config): - super().__init__() - self.wi = nn.Linear(config.d_model, config.d_ff, bias=False) - self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) - self.dropout = nn.Dropout(config.dropout_rate) - self.act = ACT2FN[config.dense_act_fn] - - def forward(self, hidden_states): - hidden_states = self.wi(hidden_states) - hidden_states = self.act(hidden_states) - hidden_states = self.dropout(hidden_states) - if ( - isinstance(self.wo.weight, torch.Tensor) - and hidden_states.dtype != self.wo.weight.dtype - and self.wo.weight.dtype != torch.int8 - ): - hidden_states = hidden_states.to(self.wo.weight.dtype) - hidden_states = self.wo(hidden_states) - return hidden_states - - -class T5DenseGatedActDense(nn.Module): - def __init__(self, config: T5Config): - super().__init__() - self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) - self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) - self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) - self.dropout = nn.Dropout(config.dropout_rate) - self.act = ACT2FN[config.dense_act_fn] - - def forward(self, hidden_states): - hidden_gelu = self.act(self.wi_0(hidden_states)) - hidden_linear = self.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear - hidden_states = self.dropout(hidden_states) - - # To make 8bit quantization work for google/flan-t5-xxl, self.wo is kept in float32. 
- # See https://github.com/huggingface/transformers/issues/20287 - # we also make sure the weights are not in `int8` in case users will force `_keep_in_fp32_modules` to be `None`` - if ( - isinstance(self.wo.weight, torch.Tensor) - and hidden_states.dtype != self.wo.weight.dtype - and self.wo.weight.dtype != torch.int8 - ): - hidden_states = hidden_states.to(self.wo.weight.dtype) - - hidden_states = self.wo(hidden_states) - return hidden_states - - -class T5LayerFF(nn.Module): - def __init__(self, config: T5Config): - super().__init__() - if config.is_gated_act: - self.DenseReluDense = T5DenseGatedActDense(config) - else: - self.DenseReluDense = T5DenseActDense(config) - - self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward(self, hidden_states): - forwarded_states = self.layer_norm(hidden_states) - forwarded_states = self.DenseReluDense(forwarded_states) - hidden_states = hidden_states + self.dropout(forwarded_states) - return hidden_states - - -class T5Attention(nn.Module): - def __init__(self, config: T5Config, has_relative_attention_bias=False): - super().__init__() - self.is_decoder = config.is_decoder - self.has_relative_attention_bias = has_relative_attention_bias - self.relative_attention_num_buckets = config.relative_attention_num_buckets - self.relative_attention_max_distance = config.relative_attention_max_distance - self.d_model = config.d_model - self.key_value_proj_dim = config.d_kv - self.n_heads = config.num_heads - self.dropout = config.dropout_rate - self.inner_dim = self.n_heads * self.key_value_proj_dim - - # Mesh TensorFlow initialization to avoid scaling before softmax - self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) - self.k = nn.Linear(self.d_model, self.inner_dim, bias=False) - self.v = nn.Linear(self.d_model, self.inner_dim, bias=False) - self.o = nn.Linear(self.inner_dim, self.d_model, bias=False) - - if self.has_relative_attention_bias: - self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) - self.pruned_heads = set() - self.gradient_checkpointing = False - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads - ) - # Prune linear layers - self.q = prune_linear_layer(self.q, index) - self.k = prune_linear_layer(self.k, index) - self.v = prune_linear_layer(self.v, index) - self.o = prune_linear_layer(self.o, index, dim=1) - # Update hyper params - self.n_heads = self.n_heads - len(heads) - self.inner_dim = self.key_value_proj_dim * self.n_heads - self.pruned_heads = self.pruned_heads.union(heads) - - @staticmethod - def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): - """ - Adapted from Mesh Tensorflow: - https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 - - Translate relative position to a bucket number for relative attention. The relative position is defined as - memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to - position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for - small absolute relative_position and larger buckets for larger absolute relative_positions. All relative - positions >=max_distance map to the same bucket. 
All relative positions <=-max_distance map to the same bucket. - This should allow for more graceful generalization to longer sequences than the model has been trained on - - Args: - relative_position: an int32 Tensor - bidirectional: a boolean - whether the attention is bidirectional - num_buckets: an integer - max_distance: an integer - - Returns: - a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) - """ - relative_buckets = 0 - if bidirectional: - num_buckets //= 2 - relative_buckets += (relative_position > 0).to(torch.long) * num_buckets - relative_position = torch.abs(relative_position) - else: - relative_position = -torch.min(relative_position, torch.zeros_like(relative_position)) - # now relative_position is in the range [0, inf) - - # half of the buckets are for exact increments in positions - max_exact = num_buckets // 2 - is_small = relative_position < max_exact - - # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance - relative_position_if_large = max_exact + ( - torch.log(relative_position.float() / max_exact) - / math.log(max_distance / max_exact) - * (num_buckets - max_exact) - ).to(torch.long) - relative_position_if_large = torch.min( - relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1) - ) - - relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) - return relative_buckets - - def compute_bias(self, query_length, key_length, device=None): - """Compute binned relative position bias""" - if device is None: - device = self.relative_attention_bias.weight.device - context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] - memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] - relative_position = memory_position - context_position # shape (query_length, key_length) - relative_position_bucket = self._relative_position_bucket( - relative_position, # shape (query_length, key_length) - bidirectional=(not self.is_decoder), - num_buckets=self.relative_attention_num_buckets, - max_distance=self.relative_attention_max_distance, - ) - values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads) - values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length) - return values - - def forward( - self, - hidden_states, - mask=None, - key_value_states=None, - position_bias=None, - past_key=None, - past_value=None, - past_cross_key=None, - past_cross_value=None, - layer_head_mask=None, - query_length=None, - use_cache=False, - output_attentions=False, - ): - """ - Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). 
- """ - # Input is (batch_size, seq_length, dim) - # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) - # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) - batch_size, seq_length = hidden_states.shape[:2] - - real_seq_length = seq_length - - if past_key is not None: - real_seq_length += past_key.shape[2] if query_length is None else query_length - - key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] - - def shape(states): - """projection""" - return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - - def unshape(states): - """reshape""" - return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) - - def project(hidden_states, proj_layer, key_value_states, past_key_value): - """projects hidden states correctly to key/query states""" - if key_value_states is None: - # self-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(hidden_states)) - elif past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) - - if past_key_value is not None: - past_key_value = shape(past_key_value) - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) - hidden_states = torch.cat([past_key_value, hidden_states], dim=2) - # elif past_key_value.shape[2] != key_value_states.shape[1]: - # # checking that the `sequence_length` of the `past_key_value` is the same as - # # the provided `key_value_states` to support prefix tuning - # # cross-attn - # # (batch_size, n_heads, seq_length, dim_per_head) - # hidden_states = shape(proj_layer(key_value_states)) - else: - # cross-attn - hidden_states = past_key_value - return hidden_states - - # get query states - query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) - - # get key/value states - key_states = project( - hidden_states, self.k, key_value_states, past_key if past_key is not None else None - ) - value_states = project( - hidden_states, self.v, key_value_states, past_value if past_value is not None else None - ) - # torch.ops.mindie.flash_attention_plugin(query_states, key_states, value_states,) - # compute scores - scores = torch.matmul( - query_states, key_states.transpose(3, 2) - ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 - if position_bias is None: - if not self.has_relative_attention_bias: - position_bias = torch.zeros( - (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype - ) - if self.gradient_checkpointing and self.training: - position_bias.requires_grad = True - else: - position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) - - # if key and values are already calculated - # we want only the last query position bias - if past_key is not None: - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] - - if mask is not None: - position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) - - if self.pruned_heads: - mask = torch.ones(position_bias.shape[1]) - mask[list(self.pruned_heads)] = 0 - position_bias_masked = position_bias[:, mask.bool()] - else: - position_bias_masked = position_bias - scores += position_bias_masked - attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( - scores - ) # (batch_size, n_heads, seq_length, 
key_length) - attn_weights = nn.functional.dropout( - attn_weights, p=self.dropout, training=self.training - ) # (batch_size, n_heads, seq_length, key_length) - - # Mask heads if we want to - if layer_head_mask is not None: - attn_weights = attn_weights * layer_head_mask - - attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) - attn_output = self.o(attn_output) - - # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None - present_value_state = (value_states.half(),) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) - - if output_attentions: - outputs = outputs + (attn_weights,) - return outputs - - -class T5SelfAttention(T5Attention): - def __init__(self, config: T5Config, has_relative_attention_bias=False): - super().__init__(config, has_relative_attention_bias) - - def forward( - self, - hidden_states, - mask=None, - position_bias=None, - past_key=None, - past_value=None, - layer_head_mask=None, - use_cache=False, - output_attentions=False, - ): - """ - Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). - """ - # Input is (batch_size, seq_length, dim) - # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) - # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) - batch_size, seq_length = hidden_states.shape[:2] - - real_seq_length = seq_length - - if past_key is not None: - real_seq_length += past_key.shape[2] - key_length = real_seq_length - def shape(states): - """projection""" - return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - - def unshape(states): - """reshape""" - return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) - - def project(hidden_states, proj_layer, past_key_value): - """projects hidden states correctly to key/query states""" - if past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(hidden_states)) - - if past_key_value is not None: - hidden_states = shape(proj_layer(hidden_states)) - hidden_states = torch.cat([past_key_value, hidden_states], dim=2) - return hidden_states - - # get query states - query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) - - # get key/value states - key_states = project( - hidden_states, self.k, past_key if past_key is not None else None - ) - value_states = project( - hidden_states, self.v, past_value if past_value is not None else None - ) - # compute scores - scores = torch.matmul( - query_states, key_states.transpose(3, 2) - ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 - if position_bias is None: - if not self.has_relative_attention_bias: - position_bias = torch.zeros( - (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype - ) - if self.gradient_checkpointing and self.training: - position_bias.requires_grad = True - else: - position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) - - # if key and values are already calculated - # we want only the last query position bias - if past_key is not None: - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] - if mask 
is not None: - position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) - - if self.pruned_heads: - mask = torch.ones(position_bias.shape[1]) - mask[list(self.pruned_heads)] = 0 - position_bias_masked = position_bias[:, mask.bool()] - else: - position_bias_masked = position_bias - scores += position_bias_masked - attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( - scores - ) # (batch_size, n_heads, seq_length, key_length) - attn_weights = nn.functional.dropout( - attn_weights, p=self.dropout, training=self.training - ) # (batch_size, n_heads, seq_length, key_length) - - # Mask heads if we want to - if layer_head_mask is not None: - attn_weights = attn_weights * layer_head_mask - - attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) - attn_output = self.o(attn_output) - - # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None - present_value_state = (value_states.half(), ) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) - if output_attentions: - outputs = outputs + (attn_weights,) - return outputs - - - - -class T5LayerSelfAttention(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() - self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward( - self, - hidden_states, - attention_mask=None, - position_bias=None, - layer_head_mask=None, - past_key=None, - past_value=None, - use_cache=False, - output_attentions=False, - ): - normed_hidden_states = self.layer_norm(hidden_states) - attention_output = self.SelfAttention( - normed_hidden_states, - mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key=past_key, - past_value=past_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = hidden_states + self.dropout(attention_output[0]) - outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them - return outputs - - -class T5LayerCrossAttention(nn.Module): - def __init__(self, config): - super().__init__() - self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False) - self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward( - self, - hidden_states, - key_value_states, - attention_mask=None, - position_bias=None, - layer_head_mask=None, - past_key=None, - past_value=None, - use_cache=False, - query_length=None, - output_attentions=False, - ): - normed_hidden_states = self.layer_norm(hidden_states) - attention_output = self.EncDecAttention( - normed_hidden_states, - mask=attention_mask, - key_value_states=key_value_states, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key=past_key, - past_value=past_value, - use_cache=use_cache, - query_length=query_length, - output_attentions=output_attentions, - ) - layer_output = hidden_states + self.dropout(attention_output[0]) - outputs = (layer_output,) + attention_output[1:] # add attentions if we output them - return outputs - - -class T5Block(nn.Module): - def __init__(self, config, 
has_relative_attention_bias=False): - super().__init__() - self.is_decoder = config.is_decoder - self.layer = nn.ModuleList() - self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) - if self.is_decoder: - self.layer.append(T5LayerCrossAttention(config)) - - self.layer.append(T5LayerFF(config)) - - def forward( - self, - hidden_states, - attention_mask=None, - position_bias=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - encoder_decoder_position_bias=None, - layer_head_mask=None, - cross_attn_layer_head_mask=None, - past_key=None, - past_value=None, - past_cross_key=None, - past_cross_value=None, - use_cache=False, - output_attentions=False, - return_dict=True, - ): - if past_key is not None: - self_attn_past_key = past_key - self_attn_past_value = past_value - cross_attn_past_key = past_cross_key - cross_attn_past_value = past_cross_value - else: - self_attn_past_key, self_attn_past_value, cross_attn_past_key, cross_attn_past_value = None, None, None, None - - self_attention_outputs = self.layer[0]( - hidden_states, - attention_mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key=self_attn_past_key, - past_value=self_attn_past_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states, present_key_state, present_value_state = self_attention_outputs[:3] - attention_outputs = self_attention_outputs[3:] # Keep self-attention outputs and relative position weights - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16: - clamp_value = torch.where( - torch.isinf(hidden_states).any(), - torch.finfo(hidden_states.dtype).max - 1000, - torch.finfo(hidden_states.dtype).max, - ) - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - do_cross_attention = self.is_decoder and encoder_hidden_states is not None - if do_cross_attention: - - # the actual query length is unknown for cross attention - # if using past key value states. 
Need to inject it here - if present_key_state is not None: - query_length = present_key_state[0].shape[2] - else: - query_length = None - cross_attention_outputs = self.layer[1]( - hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - position_bias=encoder_decoder_position_bias, - layer_head_mask=cross_attn_layer_head_mask, - past_key=cross_attn_past_key, - past_value=cross_attn_past_value, - query_length=query_length, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = cross_attention_outputs[0] - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16: - clamp_value = torch.where( - torch.isinf(hidden_states).any(), - torch.finfo(hidden_states.dtype).max - 1000, - torch.finfo(hidden_states.dtype).max, - ) - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - # Combine self attn and cross attn key value states - # cross_attn_past_key_values = cross_attention_outputs[1] - # Keep cross-attention outputs and relative position weights - attention_outputs = attention_outputs + cross_attention_outputs[3:] - - # Apply Feed Forward layer - hidden_states = self.layer[-1](hidden_states) - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16: - clamp_value = torch.where( - torch.isinf(hidden_states).any(), - torch.finfo(hidden_states.dtype).max - 1000, - torch.finfo(hidden_states.dtype).max, - ) - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - outputs = (hidden_states,) - - if use_cache: - outputs = outputs + (present_key_state,) +(present_value_state,)+ attention_outputs - else: - outputs = outputs + attention_outputs - - return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - - -class T5ClassificationHead(nn.Module): - """Head for sentence-level classification tasks.""" - - def __init__(self, config: T5Config): - super().__init__() - self.dense = nn.Linear(config.d_model, config.d_model) - self.dropout = nn.Dropout(p=config.classifier_dropout) - self.out_proj = nn.Linear(config.d_model, config.num_labels) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dropout(hidden_states) - hidden_states = self.dense(hidden_states) - hidden_states = torch.tanh(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.out_proj(hidden_states) - return hidden_states - - -class T5PreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - - config_class = T5Config - load_tf_weights = load_tf_weights_in_t5 - base_model_prefix = "transformer" - is_parallelizable = True - supports_gradient_checkpointing = True - _no_split_modules = ["T5Block"] - _keep_in_fp32_modules = ["wo"] - - @property - def dummy_inputs(self): - input_ids = torch.tensor(DUMMY_INPUTS) - input_mask = torch.tensor(DUMMY_MASK) - dummy_inputs = { - "decoder_input_ids": input_ids, - "input_ids": input_ids, - "decoder_attention_mask": input_mask, - } - return dummy_inputs - - def _init_weights(self, module): - """Initialize the weights""" - factor = self.config.initializer_factor # Used for testing weights initialization - if isinstance(module, T5LayerNorm): - module.weight.data.fill_(factor * 1.0) - elif isinstance( - module, - (T5Model, T5ForConditionalGeneration, T5EncoderModel, T5ForQuestionAnswering), - ): - # Mesh TensorFlow embeddings initialization - # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 - module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) - if hasattr(module, "lm_head") and not self.config.tie_word_embeddings: - module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0) - if hasattr(module, "qa_outputs"): - module.qa_outputs.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) - module.qa_outputs.bias.data.zero_() - elif isinstance(module, T5ForTokenClassification): - if hasattr(module, "classifier"): - module.classifier.weight.data.normal_(mean=0.0, std=factor * 1.0) - module.classifier.bias.data.zero_() - elif isinstance(module, T5ClassificationHead): - module.dense.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) - if hasattr(module.dense, "bias") and module.dense.bias is not None: - module.dense.bias.data.zero_() - module.out_proj.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) - if hasattr(module.out_proj, "bias") and module.out_proj.bias is not None: - module.out_proj.bias.data.zero_() - elif isinstance(module, T5DenseActDense): - # Mesh TensorFlow FF initialization - # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 - # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 - module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) - if hasattr(module.wi, "bias") and module.wi.bias is not None: - module.wi.bias.data.zero_() - module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) - if hasattr(module.wo, "bias") and module.wo.bias is not None: - module.wo.bias.data.zero_() - elif isinstance(module, T5DenseGatedActDense): - module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) - if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None: - module.wi_0.bias.data.zero_() - module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) - if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None: - module.wi_1.bias.data.zero_() - module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) - if hasattr(module.wo, "bias") and module.wo.bias is not None: - module.wo.bias.data.zero_() - elif isinstance(module, T5Attention): - # Mesh TensorFlow attention initialization to avoid scaling before softmax - # See 
https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 - d_model = self.config.d_model - key_value_proj_dim = self.config.d_kv - n_heads = self.config.num_heads - module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5)) - module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) - module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) - module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5)) - if module.has_relative_attention_bias: - module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5)) - - def _shift_right(self, input_ids): - decoder_start_token_id = self.config.decoder_start_token_id - pad_token_id = self.config.pad_token_id - - if decoder_start_token_id is None: - raise ValueError( - "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. " - "See T5 docs for more information." - ) - - # shift inputs to the right - if is_torch_fx_proxy(input_ids): - # Item assignment is not supported natively for proxies. - shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id) - shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) - else: - shifted_input_ids = input_ids.new_zeros(input_ids.shape) - shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() - shifted_input_ids[..., 0] = decoder_start_token_id - - if pad_token_id is None: - raise ValueError("self.model.config.pad_token_id has to be defined.") - # replace possible -100 values in labels by `pad_token_id` - shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) - - return shifted_input_ids - -class T5Stack(T5PreTrainedModel): - def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskey=None, encodecrossvalue=None): - super().__init__(config) - - self.embed_tokens = embed_tokens - self.is_decoder = config.is_decoder - self.lm_head=lm_head - self.encodecrosskey = encodecrosskey - self.encodecrossvalue = encodecrossvalue - self.model_dim = config.d_model - - self.block = nn.ModuleList( - [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] - ) - self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - - # Initialize weights and apply final processing - self.post_init() - # Model parallel - self.model_parallel = False - self.device_map = None - self.gradient_checkpointing = False - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - warnings.warn( - "`T5Stack.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model" - " with `device_map='balanced'` in the call to `from_pretrained`. 
You can also provide your own" - " `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0," - " 'block.1': 1, ...}", - FutureWarning, - ) - # Check validity of device_map - self.device_map = ( - get_device_map(len(self.block), range(torch.cuda.device_count())) if device_map is None else device_map - ) - assert_device_map(self.device_map, len(self.block)) - self.model_parallel = True - self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys())) - self.last_device = "cuda:" + str(max(self.device_map.keys())) - # Load onto devices - for k, v in self.device_map.items(): - for layer in v: - cuda_device = "cuda:" + str(k) - self.block[layer] = self.block[layer].to(cuda_device) - - # Set embed_tokens to first layer - self.embed_tokens = self.embed_tokens.to(self.first_device) - # Set final layer norm to last device - self.final_layer_norm = self.final_layer_norm.to(self.last_device) - - @add_start_docstrings(DEPARALLELIZE_DOCSTRING) - def deparallelize(self): - warnings.warn( - "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", - FutureWarning, - ) - self.model_parallel = False - self.device_map = None - self.first_device = "cpu" - self.last_device = "cpu" - for i in range(len(self.block)): - self.block[i] = self.block[i].to("cpu") - self.embed_tokens = self.embed_tokens.to("cpu") - self.final_layer_norm = self.final_layer_norm.to("cpu") - torch.cuda.empty_cache() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, new_embeddings): - self.embed_tokens = new_embeddings - - def forward( - self, - input_ids=None, - encoder_hidden_states=None, - past_keys=None, - past_values=None, - past_cross_keys=None, - past_cross_values=None, - encoder_attention_mask=None, - attention_mask=None, - inputs_embeds=None, - head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **model_kwargs - ): - # Model parallel - if self.model_parallel: - torch.cuda.set_device(self.first_device) - self.embed_tokens = self.embed_tokens.to(self.first_device) - use_cache = use_cache if use_cache is not None else self.config.use_cache - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - err_msg_prefix = "decoder_" if self.is_decoder else "" - raise ValueError( - f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" - ) - elif input_ids is not None: - - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - err_msg_prefix = "decoder_" if self.is_decoder else "" - raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") - - if inputs_embeds is None: - if self.embed_tokens is None: - raise ValueError("You have to initialize the model with valid token embeddings") - inputs_embeds = self.embed_tokens(input_ids) - - batch_size, seq_length = input_shape - # required mask seq length can be calculated via length 
of past - mask_seq_length = past_keys[0].shape[2] + seq_length if past_keys is not None else seq_length - - if use_cache is True: - if not self.is_decoder: - raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") - - # initialize past_key_values with `None` if past does not exist - if not self.is_decoder: - past_keys = [None] * len(self.block) - past_values = [None] * len(self.block) - past_cross_keys = [None] * len(self.block) - past_cross_values = [None] * len(self.block) - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones( - encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long - ) - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - # Prepare head mask if needed - head_mask = self.get_head_mask(head_mask, self.config.num_layers) - cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) - present_key_states = () if use_cache else None - present_value_states = () if use_cache else None - all_hidden_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - all_cross_attentions = () if (output_attentions and self.is_decoder) else None - position_bias = None - encoder_decoder_position_bias = None - - hidden_states = self.dropout(inputs_embeds) - # for i, layer_module in enumerate(self.block): - for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): - layer_head_mask = head_mask[i] - cross_attn_layer_head_mask = cross_attn_head_mask[i] - # Model parallel - if self.model_parallel: - torch.cuda.set_device(hidden_states.device) - # Ensure that attention_mask is always on the same device as hidden_states - if attention_mask is not None: - attention_mask = attention_mask.to(hidden_states.device) - if position_bias is not None: - position_bias = position_bias.to(hidden_states.device) - if encoder_hidden_states is not None: - encoder_hidden_states = encoder_hidden_states.to(hidden_states.device) - if encoder_extended_attention_mask is not None: - encoder_extended_attention_mask = encoder_extended_attention_mask.to(hidden_states.device) - if encoder_decoder_position_bias is not None: - encoder_decoder_position_bias = encoder_decoder_position_bias.to(hidden_states.device) - if layer_head_mask is not None: - layer_head_mask = layer_head_mask.to(hidden_states.device) - if cross_attn_layer_head_mask is not None: 
- cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(hidden_states.device) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - layer_module.forward, - hidden_states, - extended_attention_mask, - position_bias, - encoder_hidden_states, - encoder_extended_attention_mask, - encoder_decoder_position_bias, - layer_head_mask, - cross_attn_layer_head_mask, - None, # past_key_value is always None with gradient checkpointing - use_cache, - output_attentions, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask=extended_attention_mask, - position_bias=position_bias, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - encoder_decoder_position_bias=encoder_decoder_position_bias, - layer_head_mask=layer_head_mask, - cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key=past_key, - past_value=past_value, - past_cross_key=past_cross_key, - past_cross_value=past_cross_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - - # layer_outputs is a tuple with: - # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - if use_cache is False: - layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] - - hidden_states, present_key_state, present_value_state = layer_outputs[:3] - - # We share the position biases between the layers - the first layer store them - # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), - # (cross-attention position bias), (cross-attention weights) - position_bias = layer_outputs[3] - if self.is_decoder and encoder_hidden_states is not None: - encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] - # append next layer key value states - if use_cache: - present_key_states = present_key_states + present_key_state - present_value_states = present_value_states + present_value_state - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[3],) - if self.is_decoder: - all_cross_attentions = all_cross_attentions + (layer_outputs[5],) - - # Model Parallel: If it's the last layer for that device, put things on the next device - if self.model_parallel: - for k, v in self.device_map.items(): - if i == v[-1] and "cuda:" + str(k) != self.last_device: - hidden_states = hidden_states.to("cuda:" + str(k + 1)) - - hidden_states = self.final_layer_norm(hidden_states) - hidden_states = self.dropout(hidden_states).half() - - # Add last layer - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - present_key_value_states, - all_hidden_states, - all_attentions, - all_cross_attentions, - ] - if v is not None - ) - if not self.is_decoder: - cross_keys = None - cross_values = None - if self.encodecrosskey: - cross_keys = self.encodecrosskey(hidden_states) - if self.encodecrossvalue: - cross_values = self.encodecrossvalue(hidden_states) - return tuple((hidden_states, cross_keys, cross_values)) - lm_logits = None - if self.is_decoder: - if self.config.tie_word_embeddings: - hidden_states = hidden_states * (self.model_dim ** -0.5) - lm_logits = self.lm_head(hidden_states) - return tuple((lm_logits, present_key_states, present_value_states)) - 
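The modified attention above drops the stock paired `past_key_value` tuples in favour of separate `past_key`/`past_value` tensors, which `project()` brings into a common `(batch, n_heads, seq, d_kv)` layout via `shape()` and then extends along the sequence axis. The following is a minimal, torch-only sketch of that cache-growth convention for the self-attention path; the batch size, head count, and head dimension are illustrative assumptions, not values taken from this patch.

```python
import torch

# Illustrative sizes only (assumptions, not read from the patch).
batch, n_heads, d_kv = 1, 8, 64
past_len, new_len = 5, 1

# Self-attention keys cached from earlier decoding steps, shown in the
# (batch, n_heads, seq, d_kv) layout that shape() produces.
past_key = torch.randn(batch, n_heads, past_len, d_kv)

# Key projected for the current step (one new token), same layout.
new_key = torch.randn(batch, n_heads, new_len, d_kv)

# Inside project(), once both tensors are in this layout the cache is
# extended along dim=2, so the effective key length grows by one
# position per generated token.
key_states = torch.cat([past_key, new_key], dim=2)
assert key_states.shape == (batch, n_heads, past_len + new_len, d_kv)
```

The value cache follows the same convention, which is consistent with the split `past_keys`/`past_values` arguments of `T5Stack.forward` above.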
- -class T5Stack_Encoder(T5PreTrainedModel): - def __init__(self, config, embed_tokens=None, encodecrosskey=None, encodecrossvalue=None): - super().__init__(config) - self.embed_tokens = embed_tokens - self.is_decoder = config.is_decoder - self.encodecrosskey = encodecrosskey - self.encodecrossvalue = encodecrossvalue - self.model_dim = config.d_model - - self.block = nn.ModuleList( - [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] - ) - self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - - # Initialize weights and apply final processing - self.post_init() - # Model parallel - self.model_parallel = False - self.device_map = None - self.gradient_checkpointing = False - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, new_embeddings): - self.embed_tokens = new_embeddings - - def forward( - self, - input_ids=None, - attention_mask=None, - head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **model_kwargs - ): - # Model parallel - use_cache = use_cache if use_cache is not None else self.config.use_cache - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - - inputs_embeds = self.embed_tokens(input_ids) - - batch_size, seq_length = input_shape - # required mask seq length can be calculated via length of past - mask_seq_length = seq_length - - # initialize past_key_values with `None` if past does not exist - past_keys = [None] * len(self.block) - past_values = [None] * len(self.block) - past_cross_keys = [None] * len(self.block) - past_cross_values = [None] * len(self.block) - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - - encoder_extended_attention_mask = None - - # Prepare head mask if needed - head_mask = self.get_head_mask(head_mask, self.config.num_layers) - cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) - present_key_states = () if use_cache else None - present_value_states = () if use_cache else None - all_hidden_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - all_cross_attentions = () if (output_attentions and self.is_decoder) else None - position_bias = None - encoder_decoder_position_bias = None - - hidden_states = self.dropout(inputs_embeds) - for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): - layer_head_mask = head_mask[i] - cross_attn_layer_head_mask = cross_attn_head_mask[i] - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer_outputs = layer_module( - hidden_states, - attention_mask=extended_attention_mask, - position_bias=position_bias, - encoder_hidden_states=None, - encoder_attention_mask=encoder_extended_attention_mask, - encoder_decoder_position_bias=encoder_decoder_position_bias, - layer_head_mask=layer_head_mask, - cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key=past_key, - past_value=past_value, - past_cross_key=past_cross_key, - past_cross_value=past_cross_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - - # layer_outputs is a tuple with: - # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - if use_cache is False: - layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] - - hidden_states, present_key_state, present_value_state = layer_outputs[:3] - - # We share the position biases between the layers - the first layer store them - # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), - # (cross-attention position bias), (cross-attention weights) - position_bias = layer_outputs[3] - if self.is_decoder and encoder_hidden_states is not None: - encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] - # append next layer key value states - if use_cache: - present_key_states = present_key_states + present_key_state - present_value_states = present_value_states + present_value_state - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[3],) - if self.is_decoder: - all_cross_attentions = all_cross_attentions + (layer_outputs[5],) - - # Model Parallel: If it's the last layer for that device, put things on the next device - if self.model_parallel: - for k, v in self.device_map.items(): - if i == v[-1] and "cuda:" + str(k) != self.last_device: - hidden_states = hidden_states.to("cuda:" + str(k + 1)) - - hidden_states = self.final_layer_norm(hidden_states) - hidden_states = self.dropout(hidden_states).half() - - # Add last layer - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - present_key_value_states, - all_hidden_states, - 
all_attentions, - all_cross_attentions, - ] - if v is not None - ) - if not self.is_decoder: - cross_keys = None - cross_values = None - if self.encodecrosskey: - cross_keys = self.encodecrosskey(hidden_states) - if self.encodecrossvalue: - cross_values = self.encodecrossvalue(hidden_states) - return tuple((hidden_states, cross_keys, cross_values)) - -T5_START_DOCSTRING = r""" - - The T5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text - Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan - Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a - text-to-text denoising generative setting. - - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`T5Config`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -T5_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you - should be able to pad the inputs on both the right and the left. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for detail. - - [What are input IDs?](../glossary#input-ids) - - To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training). - attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): - Indices of decoder input sequence tokens in the vocabulary. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are decoder input IDs?](../glossary#decoder-input-ids) - - T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` - is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). - - To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5 - Training](./t5#training). - decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): - Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also - be used by default. 
- head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules in the encoder. Mask values selected in `[0, - 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0, - 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in - `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): - Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, `optional`: *attentions*) - `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at - the output of the last layer of the encoder. Used in the cross-attention of the decoder. - past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded - representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be - input (see `past_key_values`). This is useful if you want more control over how to convert - `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. - - If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value - of `inputs_embeds`. - - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
-""" - -T5_ENCODER_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you - should be able to pad the inputs on both the right and the left. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for detail. - - To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training). - attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - -# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask -__HEAD_MASK_WARNING_MSG = """ -The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently, -`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions. -If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers, -num_heads)`. 
-""" - -@add_start_docstrings( - "The bare T5 Model transformer outputting raw hidden-states without any specific head on top.", - T5_START_DOCSTRING, -) \ No newline at end of file -- Gitee From 308cf8e9198cb11b24a3dc166a0dc706fcb4f3f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 09:15:11 +0000 Subject: [PATCH 056/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/utils.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/T5/utils.patch | 108 -------------------- 1 file changed, 108 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/utils.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/utils.patch b/MindIE/MindIE-Torch/built-in/T5/utils.patch deleted file mode 100644 index 4968e30c2b..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/utils.patch +++ /dev/null @@ -1,108 +0,0 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/generation/utils.py 2024-09-04 17:07:15.776000000 +0800 -+++ utils.py 2024-09-04 19:05:05.300000000 +0800 -@@ -507,10 +507,7 @@ class GenerationMixin: - generation_config: GenerationConfig, - ) -> Dict[str, Any]: - # 1. get encoder -- if self.encoder_mindie: -- encoder = self.encoder_mindie -- else: -- encoder = self.get_encoder() -+ encoder = self.get_encoder() - # Compatibility with Accelerate big model inference: we need the encoder to outputs stuff on the same device - # as the inputs. - if hasattr(self, "hf_device_map"): -@@ -526,12 +523,12 @@ class GenerationMixin: - for argument, value in model_kwargs.items() - if not any(argument.startswith(p) for p in irrelevant_prefix) - } -- # encoder_signature = set(inspect.signature(encoder.forward).parameters) -- # encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature -- # if not encoder_accepts_wildcard: -- # encoder_kwargs = { -- # argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature -- # } -+ encoder_signature = set(inspect.signature(encoder.forward).parameters) -+ encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature -+ if not encoder_accepts_wildcard: -+ encoder_kwargs = { -+ argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature -+ } - encoder_kwargs["output_attentions"] = generation_config.output_attentions - encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states - -@@ -539,13 +536,8 @@ class GenerationMixin: - model_input_name = model_input_name if model_input_name is not None else self.main_input_name - encoder_kwargs["return_dict"] = True - encoder_kwargs[model_input_name] = inputs_tensor -- if self.encoder_mindie: -- with torch.npu.stream(self.stream): # set stream -- encoder_outputs=encoder.forward(encoder_kwargs["input_ids"]) -- self.stream.synchronize() # synchronize -- else: -- encoder_outputs = encoder(**encoder_kwargs) -- model_kwargs["encoder_outputs"]: ModelOutput = {"last_hidden_state":encoder_outputs[0], "past_cross_key_values":encoder_outputs[1]} -+ model_kwargs["encoder_outputs"]: ModelOutput = encoder(**encoder_kwargs) -+ - return model_kwargs - - def _prepare_decoder_input_ids_for_generation( -@@ -670,9 +662,6 @@ class GenerationMixin: - outputs, standardize_cache_format=standardize_cache_format - ) - model_kwargs[cache_name] = cache -- if "past_cross_key_values" in outputs: -- past_cross_key_values = 
outputs.past_cross_key_values -- model_kwargs["past_cross_key_values"] = past_cross_key_values - if getattr(outputs, "state", None) is not None: - model_kwargs["state"] = outputs.state - -@@ -1804,16 +1793,16 @@ class GenerationMixin: - "`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1." - ) - -- # if self.device.type != input_ids.device.type: -- # warnings.warn( -- # "You are calling .generate() with the `input_ids` being on a device type different" -- # f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model" -- # f" is on {self.device.type}. You may experience unexpected behaviors or slower generation." -- # " Please make sure that you have put `input_ids` to the" -- # f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before" -- # " running `.generate()`.", -- # UserWarning, -- # ) -+ if self.device.type != input_ids.device.type: -+ warnings.warn( -+ "You are calling .generate() with the `input_ids` being on a device type different" -+ f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model" -+ f" is on {self.device.type}. You may experience unexpected behaviors or slower generation." -+ " Please make sure that you have put `input_ids` to the" -+ f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before" -+ " running `.generate()`.", -+ UserWarning, -+ ) - - # 8. prepare distribution pre_processing samplers - prepared_logits_processor = self._get_logits_processor( -@@ -2647,20 +2636,15 @@ class GenerationMixin: - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) -- - - # keep track of which sequences are already finished - batch_size = input_ids.shape[0] - this_peer_finished = False - unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) - model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) -- num_layers = self.config.num_layers -- num_heads = self.config.num_heads -- d_kv = self.config.d_kv -- model_kwargs["past_key_values"] = torch.randn(num_layers, 2, batch_size, num_heads, 0, d_kv).half().npu() -+ - while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): - # prepare model inputs -- - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - # forward pass to get next token -- Gitee From b4c1077337169f9e517a20431a2410f2822124c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 09:18:27 +0000 Subject: [PATCH 057/110] add MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_t5.patch | 1641 +++++++++++++++++ 1 file changed, 1641 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch new file mode 100644 index 0000000000..95d0455bf5 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch @@ -0,0 +1,1641 @@ +diff --git a/modeling_t5.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py +index 224769fdf..6af548437 100644 +--- a/modeling_t5.py ++++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py +@@ -19,22 +19,26 @@ import math + import os + import warnings + from typing import List, Optional, Tuple, Union +- ++from dataclasses import dataclass + import torch + from torch import nn + from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss ++# import torch_npu ++import mindietorch ++ ++ ++ + + from ...activations import ACT2FN + from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, +- Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, + TokenClassifierOutput, + ) +-from ...modeling_utils import PreTrainedModel ++from ...modeling_utils import PreTrainedModel,ModuleUtilsMixin + from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer + from ...utils import ( + DUMMY_INPUTS, +@@ -47,8 +51,44 @@ from ...utils import ( + ) + from ...utils.model_parallel_utils import assert_device_map, get_device_map + from .configuration_t5 import T5Config ++from transformers.generation.logits_process import LogitsProcessorList ++from transformers.generation.stopping_criteria import StoppingCriteriaList ++from transformers.generation.configuration_utils import GenerationMode ++from transformers.utils.generic import ModelOutput + + ++@dataclass ++class Seq2SeqLMOutput(ModelOutput): ++ """ ++ Base class for model's outputs, with potential hidden states and attentions. ++ ++ Args: ++ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): ++ Sequence of hidden-states at the output of the last layer of the model. ++ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): ++ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + ++ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. ++ ++ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. ++ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): ++ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, ++ sequence_length)`. ++ ++ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention ++ heads. 
++ """ ++ loss: Optional[torch.FloatTensor] = None ++ logits: torch.FloatTensor = None ++ past_keys: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ past_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None ++ decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None ++ cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None ++ encoder_last_hidden_state: Optional[torch.FloatTensor] = None ++ encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None ++ encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None ++ + logger = logging.get_logger(__name__) + + _CONFIG_FOR_DOC = "T5Config" +@@ -448,7 +488,10 @@ class T5Attention(nn.Module): + mask=None, + key_value_states=None, + position_bias=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, ++ past_cross_key=None, ++ past_cross_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, +@@ -464,12 +507,8 @@ class T5Attention(nn.Module): + + real_seq_length = seq_length + +- if past_key_value is not None: +- if len(past_key_value) != 2: +- raise ValueError( +- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" +- ) +- real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length ++ if past_key is not None: ++ real_seq_length += past_key.shape[2] if query_length is None else query_length + + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] + +@@ -493,16 +532,17 @@ class T5Attention(nn.Module): + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: ++ past_key_value = shape(past_key_value) + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) + hidden_states = torch.cat([past_key_value, hidden_states], dim=2) +- elif past_key_value.shape[2] != key_value_states.shape[1]: +- # checking that the `sequence_length` of the `past_key_value` is the same as +- # the provided `key_value_states` to support prefix tuning +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(key_value_states)) ++ # elif past_key_value.shape[2] != key_value_states.shape[1]: ++ # # checking that the `sequence_length` of the `past_key_value` is the same as ++ # # the provided `key_value_states` to support prefix tuning ++ # # cross-attn ++ # # (batch_size, n_heads, seq_length, dim_per_head) ++ # hidden_states = shape(proj_layer(key_value_states)) + else: + # cross-attn + hidden_states = past_key_value +@@ -513,17 +553,16 @@ class T5Attention(nn.Module): + + # get key/value states + key_states = project( +- hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None ++ hidden_states, self.k, key_value_states, past_key if past_key is not None else None + ) + value_states = project( +- hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None ++ hidden_states, self.v, key_value_states, past_value if past_value is not None else None + ) +- ++ # torch.ops.mindie.flash_attention_plugin(query_states, key_states, value_states,) + # compute scores + scores = torch.matmul( + query_states, key_states.transpose(3, 2) + ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 +- + if position_bias is None: + if 
not self.has_relative_attention_bias: + position_bias = torch.zeros( +@@ -536,7 +575,7 @@ class T5Attention(nn.Module): + + # if key and values are already calculated + # we want only the last query position bias +- if past_key_value is not None: ++ if past_key is not None: + position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + + if mask is not None: +@@ -548,7 +587,6 @@ class T5Attention(nn.Module): + position_bias_masked = position_bias[:, mask.bool()] + else: + position_bias_masked = position_bias +- + scores += position_bias_masked + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( + scores +@@ -564,18 +602,131 @@ class T5Attention(nn.Module): + attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) + +- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None +- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) ++ # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None ++ present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None ++ present_value_state = (value_states.half(),) if (self.is_decoder and use_cache) else None ++ outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) ++ ++ if output_attentions: ++ outputs = outputs + (attn_weights,) ++ return outputs ++ ++ ++class T5SelfAttention(T5Attention): ++ def __init__(self, config: T5Config, has_relative_attention_bias=False): ++ super().__init__(config, has_relative_attention_bias) ++ ++ def forward( ++ self, ++ hidden_states, ++ mask=None, ++ position_bias=None, ++ past_key=None, ++ past_value=None, ++ layer_head_mask=None, ++ use_cache=False, ++ output_attentions=False, ++ ): ++ """ ++ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). 
++ """ ++ # Input is (batch_size, seq_length, dim) ++ # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) ++ # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) ++ batch_size, seq_length = hidden_states.shape[:2] ++ ++ real_seq_length = seq_length ++ ++ if past_key is not None: ++ real_seq_length += past_key.shape[2] ++ key_length = real_seq_length ++ def shape(states): ++ """projection""" ++ return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) ++ ++ def unshape(states): ++ """reshape""" ++ return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) ++ ++ def project(hidden_states, proj_layer, past_key_value): ++ """projects hidden states correctly to key/query states""" ++ if past_key_value is None: ++ # cross-attn ++ # (batch_size, n_heads, seq_length, dim_per_head) ++ hidden_states = shape(proj_layer(hidden_states)) ++ ++ if past_key_value is not None: ++ hidden_states = shape(proj_layer(hidden_states)) ++ hidden_states = torch.cat([past_key_value, hidden_states], dim=2) ++ return hidden_states ++ ++ # get query states ++ query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) ++ ++ # get key/value states ++ key_states = project( ++ hidden_states, self.k, past_key if past_key is not None else None ++ ) ++ value_states = project( ++ hidden_states, self.v, past_value if past_value is not None else None ++ ) ++ # compute scores ++ scores = torch.matmul( ++ query_states, key_states.transpose(3, 2) ++ ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 ++ if position_bias is None: ++ if not self.has_relative_attention_bias: ++ position_bias = torch.zeros( ++ (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype ++ ) ++ if self.gradient_checkpointing and self.training: ++ position_bias.requires_grad = True ++ else: ++ position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) ++ ++ # if key and values are already calculated ++ # we want only the last query position bias ++ if past_key is not None: ++ position_bias = position_bias[:, :, -hidden_states.size(1) :, :] ++ if mask is not None: ++ position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) ++ ++ if self.pruned_heads: ++ mask = torch.ones(position_bias.shape[1]) ++ mask[list(self.pruned_heads)] = 0 ++ position_bias_masked = position_bias[:, mask.bool()] ++ else: ++ position_bias_masked = position_bias ++ scores += position_bias_masked ++ attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( ++ scores ++ ) # (batch_size, n_heads, seq_length, key_length) ++ attn_weights = nn.functional.dropout( ++ attn_weights, p=self.dropout, training=self.training ++ ) # (batch_size, n_heads, seq_length, key_length) ++ ++ # Mask heads if we want to ++ if layer_head_mask is not None: ++ attn_weights = attn_weights * layer_head_mask ++ ++ attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) ++ attn_output = self.o(attn_output) + ++ # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None ++ present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None ++ present_value_state = (value_states.half(), ) if (self.is_decoder and use_cache) else None ++ outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) + if 
output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + ++ ++ + class T5LayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() +- self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) ++ self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + +@@ -585,7 +736,8 @@ class T5LayerSelfAttention(nn.Module): + attention_mask=None, + position_bias=None, + layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, + use_cache=False, + output_attentions=False, + ): +@@ -595,7 +747,8 @@ class T5LayerSelfAttention(nn.Module): + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -618,7 +771,8 @@ class T5LayerCrossAttention(nn.Module): + attention_mask=None, + position_bias=None, + layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, + use_cache=False, + query_length=None, + output_attentions=False, +@@ -630,7 +784,8 @@ class T5LayerCrossAttention(nn.Module): + key_value_states=key_value_states, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, + use_cache=use_cache, + query_length=query_length, + output_attentions=output_attentions, +@@ -661,39 +816,34 @@ class T5Block(nn.Module): + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, ++ past_cross_key=None, ++ past_cross_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): +- if past_key_value is not None: +- if not self.is_decoder: +- logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") +- expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 +- +- if len(past_key_value) != expected_num_past_key_values: +- raise ValueError( +- f"There should be {expected_num_past_key_values} past states. " +- f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" +- f"Got {len(past_key_value)} past key / value states" +- ) +- +- self_attn_past_key_value = past_key_value[:2] +- cross_attn_past_key_value = past_key_value[2:] ++ if past_key is not None: ++ self_attn_past_key = past_key ++ self_attn_past_value = past_value ++ cross_attn_past_key = past_cross_key ++ cross_attn_past_value = past_cross_value + else: +- self_attn_past_key_value, cross_attn_past_key_value = None, None ++ self_attn_past_key, self_attn_past_value, cross_attn_past_key, cross_attn_past_value = None, None, None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=self_attn_past_key_value, ++ past_key=self_attn_past_key, ++ past_value=self_attn_past_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +- hidden_states, present_key_value_state = self_attention_outputs[:2] +- attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights ++ hidden_states, present_key_state, present_value_state = self_attention_outputs[:3] ++ attention_outputs = self_attention_outputs[3:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: +@@ -706,22 +856,23 @@ class T5Block(nn.Module): + + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: ++ + # the actual query length is unknown for cross attention + # if using past key value states. Need to inject it here +- if present_key_value_state is not None: +- query_length = present_key_value_state[0].shape[2] ++ if present_key_state is not None: ++ query_length = present_key_state[0].shape[2] + else: + query_length = None +- + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, +- past_key_value=cross_attn_past_key_value, ++ past_key=cross_attn_past_key, ++ past_value=cross_attn_past_value, + query_length=query_length, +- use_cache=use_cache, ++ use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] +@@ -736,11 +887,9 @@ class T5Block(nn.Module): + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states +- if present_key_value_state is not None: +- present_key_value_state = present_key_value_state + cross_attention_outputs[1] +- ++ # cross_attn_past_key_values = cross_attention_outputs[1] + # Keep cross-attention outputs and relative position weights +- attention_outputs = attention_outputs + cross_attention_outputs[2:] ++ attention_outputs = attention_outputs + cross_attention_outputs[3:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) +@@ -757,7 +906,7 @@ class T5Block(nn.Module): + outputs = (hidden_states,) + + if use_cache: +- outputs = outputs + (present_key_value_state,) + attention_outputs ++ outputs = outputs + (present_key_state,) +(present_value_state,)+ attention_outputs + else: + outputs = outputs + attention_outputs + +@@ -897,11 +1046,15 @@ class T5PreTrainedModel(PreTrainedModel): + + + class T5Stack(T5PreTrainedModel): +- def __init__(self, config, embed_tokens=None): ++ def __init__(self, config, 
embed_tokens=None,lm_head=None, encodecrosskey=None, encodecrossvalue=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder ++ self.lm_head=lm_head ++ self.encodecrosskey = encodecrosskey ++ self.encodecrossvalue = encodecrossvalue ++ self.model_dim = config.d_model + + self.block = nn.ModuleList( + [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] +@@ -966,20 +1119,64 @@ class T5Stack(T5PreTrainedModel): + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + ++ def invert_attention_mask(self, encoder_attention_mask): ++ if encoder_attention_mask.dim() == 3: ++ encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] ++ if encoder_attention_mask.dim() == 2: ++ encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] ++ encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility ++ ++ encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 ++ print("encoder_extended_attention_mask=",encoder_extended_attention_mask) ++ ++ return encoder_extended_attention_mask ++ ++ def get_extended_attention_mask( ++ self, attention_mask, input_shape, device=None, dtype=None ++ ): ++ if dtype is None: ++ dtype = self.dtype ++ ++ if not (attention_mask.dim() == 2 and self.config.is_decoder): ++ if device is not None: ++ warnings.warn( ++ "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning ++ ) ++ if attention_mask.dim() == 3: ++ extended_attention_mask = attention_mask[:, None, :, :] ++ elif attention_mask.dim() == 2: ++ if self.config.is_decoder: ++ extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( ++ input_shape, attention_mask, device ++ ) ++ else: ++ extended_attention_mask = attention_mask[:, None, None, :] ++ else: ++ raise ValueError( ++ f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ++ ) ++ extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility ++ extended_attention_mask = (1.0 - extended_attention_mask) * -1000 ++ return extended_attention_mask ++ + def forward( + self, + input_ids=None, +- attention_mask=None, + encoder_hidden_states=None, ++ past_keys=None, ++ past_values=None, ++ past_cross_keys=None, ++ past_cross_values=None, + encoder_attention_mask=None, ++ attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, +- past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, ++ **model_kwargs + ): + # Model parallel + if self.model_parallel: +@@ -998,8 +1195,10 @@ class T5Stack(T5PreTrainedModel): + f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" + ) + elif input_ids is not None: ++ + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) ++ input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: +@@ -1012,18 +1211,19 @@ class T5Stack(T5PreTrainedModel): + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape +- + # required mask seq length can be calculated via length of past +- mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length ++ mask_seq_length = past_keys[0].shape[2] + 
seq_length if past_keys is not None else seq_length + + if use_cache is True: + if not self.is_decoder: + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") + + # initialize past_key_values with `None` if past does not exist +- if past_key_values is None: +- past_key_values = [None] * len(self.block) +- ++ if not self.is_decoder: ++ past_keys = [None] * len(self.block) ++ past_values = [None] * len(self.block) ++ past_cross_keys = [None] * len(self.block) ++ past_cross_values = [None] * len(self.block) + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + +@@ -1054,7 +1254,8 @@ class T5Stack(T5PreTrainedModel): + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) +- present_key_value_states = () if use_cache else None ++ present_key_states = () if use_cache else None ++ present_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None +@@ -1062,8 +1263,8 @@ class T5Stack(T5PreTrainedModel): + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) +- +- for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): ++ # for i, layer_module in enumerate(self.block): ++ for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel +@@ -1112,7 +1313,10 @@ class T5Stack(T5PreTrainedModel): + encoder_decoder_position_bias=encoder_decoder_position_bias, + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, ++ past_cross_key=past_cross_key, ++ past_cross_value=past_cross_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -1120,19 +1324,20 @@ class T5Stack(T5PreTrainedModel): + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: +- layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] ++ layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] + +- hidden_states, present_key_value_state = layer_outputs[:2] ++ hidden_states, present_key_state, present_value_state = layer_outputs[:3] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) +- position_bias = layer_outputs[2] ++ position_bias = layer_outputs[3] + if self.is_decoder and encoder_hidden_states is not None: +- encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] ++ encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] + # append next layer key value states + if use_cache: +- present_key_value_states = present_key_value_states + (present_key_value_state,) ++ present_key_states = present_key_states + 
present_key_state ++ present_value_states = present_value_states + present_value_state + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) +@@ -1146,7 +1351,7 @@ class T5Stack(T5PreTrainedModel): + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) +- hidden_states = self.dropout(hidden_states) ++ hidden_states = self.dropout(hidden_states).half() + + # Add last layer + if output_hidden_states: +@@ -1164,13 +1369,216 @@ class T5Stack(T5PreTrainedModel): + ] + if v is not None + ) +- return BaseModelOutputWithPastAndCrossAttentions( +- last_hidden_state=hidden_states, +- past_key_values=present_key_value_states, +- hidden_states=all_hidden_states, +- attentions=all_attentions, +- cross_attentions=all_cross_attentions, ++ if not self.is_decoder: ++ cross_keys = None ++ cross_values = None ++ if self.encodecrosskey: ++ cross_keys = self.encodecrosskey(hidden_states) ++ if self.encodecrossvalue: ++ cross_values = self.encodecrossvalue(hidden_states) ++ return tuple((hidden_states, cross_keys, cross_values)) ++ lm_logits = None ++ if self.is_decoder: ++ if self.config.tie_word_embeddings: ++ hidden_states = hidden_states * (self.model_dim ** -0.5) ++ lm_logits = self.lm_head(hidden_states) ++ return tuple((lm_logits, present_key_states, present_value_states)) ++ ++ ++class T5Stack_Encoder(T5PreTrainedModel): ++ def __init__(self, config, embed_tokens=None, encodecrosskey=None, encodecrossvalue=None): ++ super().__init__(config) ++ self.embed_tokens = embed_tokens ++ self.is_decoder = config.is_decoder ++ self.encodecrosskey = encodecrosskey ++ self.encodecrossvalue = encodecrossvalue ++ self.model_dim = config.d_model ++ ++ self.block = nn.ModuleList( ++ [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + ) ++ self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) ++ self.dropout = nn.Dropout(config.dropout_rate) ++ ++ # Initialize weights and apply final processing ++ self.post_init() ++ # Model parallel ++ self.model_parallel = False ++ self.device_map = None ++ self.gradient_checkpointing = False ++ ++ def get_input_embeddings(self): ++ return self.embed_tokens ++ ++ def set_input_embeddings(self, new_embeddings): ++ self.embed_tokens = new_embeddings ++ ++ def get_extended_attention_mask( ++ self, attention_mask, input_shape, device=None, dtype=None ++ ): ++ if dtype is None: ++ dtype = self.dtype ++ ++ if not (attention_mask.dim() == 2 and self.config.is_decoder): ++ if device is not None: ++ warnings.warn( ++ "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning ++ ) ++ if attention_mask.dim() == 3: ++ extended_attention_mask = attention_mask[:, None, :, :] ++ elif attention_mask.dim() == 2: ++ if self.config.is_decoder: ++ extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( ++ input_shape, attention_mask, device ++ ) ++ else: ++ extended_attention_mask = attention_mask[:, None, None, :] ++ else: ++ raise ValueError( ++ f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ++ ) ++ extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility ++ extended_attention_mask = (1.0 - extended_attention_mask) * -1000 ++ return extended_attention_mask ++ ++ def forward( ++ self, ++ input_ids=None, ++ attention_mask=None, ++ head_mask=None, ++ cross_attn_head_mask=None, ++ use_cache=None, ++ 
output_attentions=None, ++ output_hidden_states=None, ++ return_dict=None, ++ **model_kwargs ++ ): ++ # Model parallel ++ use_cache = use_cache if use_cache is not None else self.config.use_cache ++ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions ++ output_hidden_states = ( ++ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ++ ) ++ return_dict = return_dict if return_dict is not None else self.config.use_return_dict ++ ++ input_shape = input_ids.size() ++ input_ids = input_ids.view(-1, input_shape[-1]) ++ ++ inputs_embeds = self.embed_tokens(input_ids) ++ ++ batch_size, seq_length = input_shape ++ # required mask seq length can be calculated via length of past ++ mask_seq_length = seq_length ++ ++ # initialize past_key_values with `None` if past does not exist ++ past_keys = [None] * len(self.block) ++ past_values = [None] * len(self.block) ++ past_cross_keys = [None] * len(self.block) ++ past_cross_values = [None] * len(self.block) ++ if attention_mask is None: ++ attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) ++ ++ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] ++ # ourselves in which case we just need to make it broadcastable to all heads. ++ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) ++ ++ # If a 2D or 3D attention mask is provided for the cross-attention ++ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] ++ ++ encoder_extended_attention_mask = None ++ ++ # Prepare head mask if needed ++ head_mask = self.get_head_mask(head_mask, self.config.num_layers) ++ cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) ++ present_key_states = () if use_cache else None ++ present_value_states = () if use_cache else None ++ all_hidden_states = () if output_hidden_states else None ++ all_attentions = () if output_attentions else None ++ all_cross_attentions = () if (output_attentions and self.is_decoder) else None ++ position_bias = None ++ encoder_decoder_position_bias = None ++ ++ hidden_states = self.dropout(inputs_embeds) ++ for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): ++ layer_head_mask = head_mask[i] ++ cross_attn_layer_head_mask = cross_attn_head_mask[i] ++ if output_hidden_states: ++ all_hidden_states = all_hidden_states + (hidden_states,) ++ ++ layer_outputs = layer_module( ++ hidden_states, ++ attention_mask=extended_attention_mask, ++ position_bias=position_bias, ++ encoder_hidden_states=None, ++ encoder_attention_mask=encoder_extended_attention_mask, ++ encoder_decoder_position_bias=encoder_decoder_position_bias, ++ layer_head_mask=layer_head_mask, ++ cross_attn_layer_head_mask=cross_attn_layer_head_mask, ++ past_key=past_key, ++ past_value=past_value, ++ past_cross_key=past_cross_key, ++ past_cross_value=past_cross_value, ++ use_cache=use_cache, ++ output_attentions=output_attentions, ++ ) ++ ++ # layer_outputs is a tuple with: ++ # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) ++ if use_cache is False: ++ layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] ++ ++ hidden_states, present_key_state, present_value_state = layer_outputs[:3] ++ 
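# [Editor's note -- illustrative annotation, not part of the patch being added]
# In this rewrite every T5Block hands back its key cache and value cache as two separate
# one-element tuples instead of the stock fused present_key_value_state, which is what
# lets the traced graph expose a flat, fixed-arity signature for mindietorch. A minimal
# sketch of the layout, assuming num_layers=N, batch_size=B, num_heads=H, current
# sequence length=L and head dimension=d_kv:
#
#   present_key_state    == (key_states.half(),)     # tensor of shape (B, H, L, d_kv)
#   present_value_state  == (value_states.half(),)   # tensor of shape (B, H, L, d_kv)
#   present_key_states   == (k_layer_0, ..., k_layer_N-1)   # accumulated a few lines below
#   present_value_states == (v_layer_0, ..., v_layer_N-1)
#
# For this encoder-only stack both states are None (is_decoder and use_cache are False),
# but the decoder T5Stack above follows the same flattened layout, and the compiled
# decoder consumes it as 4*N positional past_* tensors instead of the nested
# past_key_values tuple.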
++ # We share the position biases between the layers - the first layer store them ++ # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), ++ # (cross-attention position bias), (cross-attention weights) ++ position_bias = layer_outputs[3] ++ if self.is_decoder and encoder_hidden_states is not None: ++ encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] ++ # append next layer key value states ++ if use_cache: ++ present_key_states = present_key_states + present_key_state ++ present_value_states = present_value_states + present_value_state ++ ++ if output_attentions: ++ all_attentions = all_attentions + (layer_outputs[3],) ++ if self.is_decoder: ++ all_cross_attentions = all_cross_attentions + (layer_outputs[5],) ++ ++ # Model Parallel: If it's the last layer for that device, put things on the next device ++ if self.model_parallel: ++ for k, v in self.device_map.items(): ++ if i == v[-1] and "cuda:" + str(k) != self.last_device: ++ hidden_states = hidden_states.to("cuda:" + str(k + 1)) ++ ++ hidden_states = self.final_layer_norm(hidden_states) ++ hidden_states = self.dropout(hidden_states).half() ++ ++ # Add last layer ++ if output_hidden_states: ++ all_hidden_states = all_hidden_states + (hidden_states,) ++ ++ if not return_dict: ++ return tuple( ++ v ++ for v in [ ++ hidden_states, ++ present_key_value_states, ++ all_hidden_states, ++ all_attentions, ++ all_cross_attentions, ++ ] ++ if v is not None ++ ) ++ # present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None ++ if not self.is_decoder: ++ cross_keys = None ++ cross_values = None ++ if self.encodecrosskey: ++ cross_keys = self.encodecrosskey(hidden_states) ++ if self.encodecrossvalue: ++ cross_values = self.encodecrossvalue(hidden_states) ++ return tuple((hidden_states, cross_keys, cross_values)) + + + T5_START_DOCSTRING = r""" +@@ -1541,6 +1949,38 @@ class T5Model(T5PreTrainedModel): + ) + + ++class EncoderToCrossKey(nn.Module): ++ def __init__(self, cross_key, num_heads, d_kv): ++ super().__init__() ++ self.cross_key = cross_key ++ self.num_heads = num_heads ++ self.d_kv = d_kv ++ ++ ++ def forward(self, hidden_states): ++ batch_size = hidden_states.shape[0] ++ past_cross_keys = () ++ for i in range(len(self.cross_key)): ++ past_cross_keys += (self.cross_key[i](hidden_states),) ++ return past_cross_keys ++ ++ ++class EncoderToCrossValue(nn.Module): ++ def __init__(self, cross_value, num_heads, d_kv): ++ super().__init__() ++ self.cross_value = cross_value ++ self.num_heads = num_heads ++ self.d_kv = d_kv ++ ++ ++ def forward(self, hidden_states): ++ batch_size = hidden_states.shape[0] ++ past_cross_values = () ++ for i in range(len(self.cross_value)): ++ past_cross_values += (self.cross_value[i](hidden_states),) ++ return past_cross_values ++ ++ + @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) + class T5ForConditionalGeneration(T5PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [ +@@ -1548,28 +1988,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + ] + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + +- def __init__(self, config: T5Config): ++ def __init__(self, config: T5Config, encoder_path=None, decoder_path=None, device_id=0): + super().__init__(config) +- self.model_dim = config.d_model +- +- self.shared = 
nn.Embedding(config.vocab_size, config.d_model) +- +- encoder_config = copy.deepcopy(config) +- encoder_config.is_decoder = False +- encoder_config.use_cache = False +- encoder_config.is_encoder_decoder = False +- self.encoder = T5Stack(encoder_config, self.shared) +- +- decoder_config = copy.deepcopy(config) +- decoder_config.is_decoder = True +- decoder_config.is_encoder_decoder = False +- decoder_config.num_layers = config.num_decoder_layers +- self.decoder = T5Stack(decoder_config, self.shared) +- +- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) ++ self.encoder_path = encoder_path ++ self.decoder_path = decoder_path ++ self.is_mindie = False ++ if not self.encoder_path or not self.decoder_path: ++ self.model_dim = config.d_model ++ ++ self.shared = nn.Embedding(config.vocab_size, config.d_model) ++ ++ decoder_config = copy.deepcopy(config) ++ decoder_config.is_decoder = True ++ decoder_config.is_encoder_decoder = False ++ decoder_config.num_layers = config.num_decoder_layers ++ ++ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) ++ self.decoder = T5Stack(decoder_config, self.shared, self.lm_head) ++ ++ cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) ++ cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) ++ encodecrosskey = EncoderToCrossKey(cross_key, config.num_heads, config.d_kv) ++ encodecrossvalue = EncoderToCrossValue(cross_value, config.num_heads, config.d_kv) ++ ++ encoder_config = copy.deepcopy(config) ++ encoder_config.is_decoder = False ++ encoder_config.use_cache = False ++ encoder_config.is_encoder_decoder = False ++ self.encoder = T5Stack_Encoder(encoder_config, self.shared, encodecrosskey=encodecrosskey, encodecrossvalue=encodecrossvalue) ++ self.encoder_mindie = None ++ self.decoder_mindie = None ++ if self.encoder_path: ++ self.encoder_mindie = torch.jit.load(self.encoder_path) ++ self.is_mindie = True ++ if self.decoder_path: ++ self.decoder_mindie = torch.jit.load(self.decoder_path) ++ ++ self.stream = torch.npu.Stream(f"npu:{device_id}") ++ self.device_id = device_id ++ ++ ++ def get_device(self): ++ return f"npu:{self.device_id}" + + # Initialize weights and apply final processing +- self.post_init() ++ # self.post_init() + + # Model parallel + self.model_parallel = False +@@ -1637,25 +2100,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + + @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) +- def forward( +- self, +- input_ids: Optional[torch.LongTensor] = None, +- attention_mask: Optional[torch.FloatTensor] = None, +- decoder_input_ids: Optional[torch.LongTensor] = None, +- decoder_attention_mask: Optional[torch.BoolTensor] = None, +- head_mask: Optional[torch.FloatTensor] = None, +- decoder_head_mask: Optional[torch.FloatTensor] = None, +- cross_attn_head_mask: Optional[torch.Tensor] = None, +- encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- inputs_embeds: Optional[torch.FloatTensor] = None, +- decoder_inputs_embeds: Optional[torch.FloatTensor] = None, +- labels: Optional[torch.LongTensor] = None, +- use_cache: Optional[bool] = None, +- output_attentions: Optional[bool] = None, +- output_hidden_states: Optional[bool] = None, +- return_dict: Optional[bool] = None, +- ) -> 
Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: ++ def forward(self,*args) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., +@@ -1687,113 +2132,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) + >>> # studies have shown that owning a dog is good for you. + ```""" +- use_cache = use_cache if use_cache is not None else self.config.use_cache +- return_dict = return_dict if return_dict is not None else self.config.use_return_dict +- +- # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +- if head_mask is not None and decoder_head_mask is None: +- if self.config.num_layers == self.config.num_decoder_layers: +- warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) +- decoder_head_mask = head_mask +- +- # Encode if needed (training, first prediction pass) +- if encoder_outputs is None: +- # Convert encoder inputs in embeddings if needed +- encoder_outputs = self.encoder( +- input_ids=input_ids, +- attention_mask=attention_mask, +- inputs_embeds=inputs_embeds, +- head_mask=head_mask, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): +- encoder_outputs = BaseModelOutput( +- last_hidden_state=encoder_outputs[0], +- hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, +- attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, +- ) +- +- hidden_states = encoder_outputs[0] +- +- if self.model_parallel: +- torch.cuda.set_device(self.decoder.first_device) +- +- if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: +- # get decoder inputs from shifting lm labels to the right +- decoder_input_ids = self._shift_right(labels) +- +- # Set device for model parallelism +- if self.model_parallel: +- torch.cuda.set_device(self.decoder.first_device) +- hidden_states = hidden_states.to(self.decoder.first_device) +- if decoder_input_ids is not None: +- decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) +- if attention_mask is not None: +- attention_mask = attention_mask.to(self.decoder.first_device) +- if decoder_attention_mask is not None: +- decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) +- +- # Decode +- decoder_outputs = self.decoder( +- input_ids=decoder_input_ids, +- attention_mask=decoder_attention_mask, +- inputs_embeds=decoder_inputs_embeds, +- past_key_values=past_key_values, +- encoder_hidden_states=hidden_states, +- encoder_attention_mask=attention_mask, +- head_mask=decoder_head_mask, +- cross_attn_head_mask=cross_attn_head_mask, +- use_cache=use_cache, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- +- sequence_output = decoder_outputs[0] +- +- # Set device for model parallelism +- if self.model_parallel: +- torch.cuda.set_device(self.encoder.first_device) +- self.lm_head = self.lm_head.to(self.encoder.first_device) +- sequence_output = sequence_output.to(self.lm_head.weight.device) +- +- if self.config.tie_word_embeddings: +- # Rescale output before projecting on vocab +- # See 
https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 +- sequence_output = sequence_output * (self.model_dim**-0.5) +- +- lm_logits = self.lm_head(sequence_output) ++ if self.is_mindie: ++ with torch.npu.stream(self.stream): # set stream ++ decoder_outputs = self.decoder_mindie.forward(*args) ++ self.stream.synchronize() # synchronize ++ else: ++ hidden_states = args[0] ++ past_cross_keys = args[1:self.config.num_decoder_layers+1] ++ past_cross_values = args[self.config.num_decoder_layers+1:2*self.config.num_decoder_layers+1] ++ past_keys= args[2*self.config.num_decoder_layers+1:3*self.config.num_decoder_layers+1] ++ past_values= args[3*self.config.num_decoder_layers+1:4*self.config.num_decoder_layers+1] ++ encoder_attention_mask = args[-2] ++ decoder_input_ids = args[-1] ++ decoder_outputs = self.decoder(input_ids=decoder_input_ids, ++ encoder_hidden_states=hidden_states, ++ past_keys=past_keys, ++ past_values=past_values, ++ past_cross_keys=past_cross_keys, ++ past_cross_values=past_cross_values, ++ encoder_attention_mask=encoder_attention_mask) ++ + + loss = None +- if labels is not None: +- loss_fct = CrossEntropyLoss(ignore_index=-100) +- # move labels to correct device to enable PP +- labels = labels.to(lm_logits.device) +- loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) +- # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 +- +- if not return_dict: +- output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs +- return ((loss,) + output) if loss is not None else output +- +- return Seq2SeqLMOutput( +- loss=loss, +- logits=lm_logits, +- past_key_values=decoder_outputs.past_key_values, +- decoder_hidden_states=decoder_outputs.hidden_states, +- decoder_attentions=decoder_outputs.attentions, +- cross_attentions=decoder_outputs.cross_attentions, +- encoder_last_hidden_state=encoder_outputs.last_hidden_state, +- encoder_hidden_states=encoder_outputs.hidden_states, +- encoder_attentions=encoder_outputs.attentions, +- ) ++ return (decoder_outputs[0],decoder_outputs[1],decoder_outputs[2]) + + def prepare_inputs_for_generation( + self, + input_ids, +- past_key_values=None, ++ past_cross_keys=None, ++ past_cross_values=None, ++ past_keys=None, ++ past_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, +@@ -1804,8 +2173,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + **kwargs, + ): + # cut decoder_input_ids if past_key_values is used +- if past_key_values is not None: +- past_length = past_key_values[0][0].shape[2] ++ if past_keys is not None: ++ past_length = past_keys[0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: +@@ -1813,12 +2182,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 +- + input_ids = input_ids[:, remove_prefix_length:] + + return { + "decoder_input_ids": input_ids, +- "past_key_values": past_key_values, ++ "past_cross_keys":past_cross_keys, ++ "past_cross_values":past_cross_values, ++ "past_keys":past_keys, ++ "past_values":past_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, +@@ -1826,6 +2197,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + "decoder_attention_mask": decoder_attention_mask, + 
"cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, ++ + } + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): +@@ -1861,6 +2233,460 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) + return reordered_decoder_past + ++ def _prepare_encoder_decoder_kwargs_for_generation( ++ self, ++ inputs_tensor: torch.Tensor, ++ model_kwargs, ++ model_input_name, ++ generation_config, ++ ): ++ irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"] ++ encoder_kwargs = { ++ argument: value ++ for argument, value in model_kwargs.items() ++ if not any(argument.startswith(p) for p in irrelevant_prefix) ++ } ++ encoder_kwargs["output_attentions"] = generation_config.output_attentions ++ encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states ++ model_input_name = model_input_name if model_input_name is not None else self.main_input_name ++ encoder_kwargs["return_dict"] = True ++ encoder_kwargs[model_input_name] = inputs_tensor ++ import time ++ start_time = time.time() ++ with torch.npu.stream(self.stream): # set stream ++ encoder_outputs=self.encoder_mindie.forward(encoder_kwargs["input_ids"]) ++ self.stream.synchronize() # synchronize ++ model_kwargs["encoder_outputs"]={"last_hidden_state":encoder_outputs[0]} ++ model_kwargs["past_cross_keys"] = encoder_outputs[1] ++ model_kwargs["past_cross_values"] =encoder_outputs[2] ++ return model_kwargs ++ ++ def _update_model_kwargs_for_generation( ++ self, ++ outputs, ++ model_kwargs, ++ is_encoder_decoder = False, ++ standardize_cache_format = False, ++ num_new_tokens = 1, ++ ): ++ # update past_key_values keeping its naming used in model code ++ cache_name, cache = self._extract_past_from_model_output( ++ outputs, standardize_cache_format=standardize_cache_format ++ ) ++ model_kwargs[cache_name] = cache ++ if "past_keys" in outputs: ++ past_keys = outputs.past_keys ++ model_kwargs["past_keys"] = past_keys ++ if "past_values" in outputs: ++ past_values = outputs.past_values ++ model_kwargs["past_values"] = past_values ++ # update decoder attention mask ++ if "decoder_attention_mask" in model_kwargs: ++ decoder_attention_mask = model_kwargs["decoder_attention_mask"] ++ model_kwargs["decoder_attention_mask"] = torch.cat( ++ [decoder_attention_mask, decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1))], ++ dim=-1, ++ ) ++ return model_kwargs ++ ++ @torch.no_grad() ++ def generate( ++ self, ++ inputs = None, ++ generation_config = None, ++ logits_processor = None, ++ stopping_criteria = None, ++ prefix_allowed_tokens_fn = None, ++ assistant_model = None, ++ negative_prompt_ids = None, ++ negative_prompt_attention_mask = None, ++ **kwargs, ++ ): ++ # 1. 
Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call ++ import time ++ start_time = time.time() ++ self._validate_model_class() ++ tokenizer = kwargs.pop("tokenizer", None) # Pull this out first, we only use it for stopping criteria ++ generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs) ++ self._validate_model_kwargs(model_kwargs.copy()) ++ ++ ++ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() ++ stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() ++ ++ accepts_attention_mask = True ++ requires_attention_mask = "encoder_outputs" not in model_kwargs ++ kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None ++ ++ # 3. Define model inputs ++ inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs( ++ inputs, generation_config.bos_token_id, model_kwargs ++ ) ++ batch_size = inputs_tensor.shape[0] ++ ++ device = inputs_tensor.device ++ self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=device) ++ ++ # 4. Define other model kwargs ++ # decoder-only models with inputs_embeds forwarding must use caching (otherwise we can't detect whether we are ++ # generating the first new token or not, and we only want to use the embeddings for the first new token) ++ if not self.config.is_encoder_decoder and model_input_name == "inputs_embeds": ++ model_kwargs["use_cache"] = True ++ else: ++ model_kwargs["use_cache"] = generation_config.use_cache ++ if not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask: ++ model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( ++ inputs_tensor, generation_config.pad_token_id, generation_config.eos_token_id ++ ) ++ ++ if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs: ++ # if model is encoder decoder encoder_outputs are created and added to `model_kwargs` ++ model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation( ++ inputs_tensor, model_kwargs, model_input_name, generation_config ++ ) ++ ++ # 5. Prepare `input_ids` which will be used for auto-regressive generation ++ if self.config.is_encoder_decoder: ++ input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation( ++ batch_size=batch_size, ++ model_input_name=model_input_name, ++ model_kwargs=model_kwargs, ++ decoder_start_token_id=generation_config.decoder_start_token_id, ++ device=inputs_tensor.device, ++ ) ++ else: ++ input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids") ++ ++ if generation_config.token_healing: ++ input_ids = self.heal_tokens(input_ids, tokenizer) ++ ++ # 6. Prepare `max_length` depending on other stopping criteria. 
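# [Editor's note -- illustrative annotation, not part of the patch being added]
# The helpers called below (_prepare_generated_length, _get_logits_processor,
# _get_stopping_criteria, _expand_inputs_for_generation) are inherited unchanged from
# transformers' GenerationMixin; this override only swaps in the MindIE-compiled
# encoder/decoder dispatch and the flattened per-layer KV cache handled by _sample.
# A hypothetical end-to-end call under those assumptions (the model directory and the
# compiled TorchScript file names below are placeholders, not taken from this patch):
#
#     import torch
#     import torch_npu  # registers the NPU backend so .npu() and torch.npu work
#     from transformers import T5Config, T5Tokenizer
#
#     config = T5Config.from_pretrained("./t5-model")
#     tokenizer = T5Tokenizer.from_pretrained("./t5-model")
#     model = T5ForConditionalGeneration(
#         config,
#         encoder_path="./models/encoder_compiled.pt",   # mindietorch-compiled encoder
#         decoder_path="./models/decoder_compiled.pt",   # mindietorch-compiled decoder
#         device_id=0,
#     )
#     input_ids = tokenizer("translate English to German: hello",
#                           return_tensors="pt").input_ids.npu()
#     output_ids = model.generate(input_ids, max_new_tokens=64)
#     print(tokenizer.decode(output_ids[0], skip_special_tokens=True))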
++ input_ids_length = input_ids.shape[-1] ++ has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None ++ has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None ++ generation_config = self._prepare_generated_length( ++ generation_config=generation_config, ++ has_default_max_length=has_default_max_length, ++ has_default_min_length=has_default_min_length, ++ model_input_name=model_input_name, ++ inputs_tensor=inputs_tensor, ++ input_ids_length=input_ids_length, ++ ) ++ ++ use_dynamic_cache_by_default = False ++ if generation_config.cache_implementation is not None and model_kwargs.get("past_key_values") is not None: ++ raise ValueError( ++ "Passing both `cache_implementation` (used to initialize certain caches) and `past_key_values` (a " ++ "Cache object) is unsupported. Please use only one of the two." ++ ) ++ elif generation_config.cache_implementation is not None: ++ if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING: ++ if generation_config.cache_implementation == "static" and not self._supports_static_cache: ++ raise ValueError( ++ "This model does not support `cache_implementation='static'`. Please check the following " ++ "issue: https://github.com/huggingface/transformers/issues/28981" ++ ) ++ model_kwargs["past_key_values"] = self._get_cache( ++ generation_config.cache_implementation, ++ getattr(generation_config, "num_beams", 1) * batch_size, ++ generation_config.max_length, ++ ) ++ elif generation_config.cache_implementation == "quantized": ++ if not self._supports_quantized_cache: ++ raise ValueError( ++ "This model does not support the quantized cache. If you want your model to support quantized " ++ "cache, please open an issue." ++ ) ++ ++ cache_config = ( ++ generation_config.cache_config ++ if generation_config.cache_config is not None ++ else QuantizedCacheConfig() ++ ) ++ cache_class = QUANT_BACKEND_CLASSES_MAPPING[cache_config.backend] ++ ++ if cache_config.backend == "quanto" and not is_quanto_available(): ++ raise ImportError( ++ "You need to install `quanto` in order to use KV cache quantization with quanto backend. " ++ "Please install it via with `pip install quanto`" ++ ) ++ elif cache_config.backend == "HQQ" and not is_hqq_available(): ++ raise ImportError( ++ "You need to install `HQQ` in order to use KV cache quantization with HQQ backend. " ++ "Please install it via with `pip install hqq`" ++ ) ++ ++ model_kwargs["past_key_values"] = cache_class(cache_config) ++ # Use DynamicCache() instance by default. This will avoid back and forth from legacy format that ++ # keeps copying the cache thus using much more memory ++ elif generation_config.cache_implementation is None and self._supports_default_dynamic_cache(): ++ past = model_kwargs.get("past_key_values", None) ++ if past is None: ++ model_kwargs["past_key_values"] = DynamicCache() ++ use_dynamic_cache_by_default = True ++ elif isinstance(past, tuple): ++ model_kwargs["past_key_values"] = DynamicCache.from_legacy_cache(past) ++ use_dynamic_cache_by_default = True ++ ++ self._validate_generated_length(generation_config, input_ids_length, has_default_max_length) ++ ++ # 7. determine generation mode ++ generation_mode = generation_config.get_generation_mode(assistant_model) ++ # 8. 
prepare distribution pre_processing samplers ++ prepared_logits_processor = self._get_logits_processor( ++ generation_config=generation_config, ++ input_ids_seq_length=input_ids_length, ++ encoder_input_ids=inputs_tensor, ++ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, ++ logits_processor=logits_processor, ++ device=inputs_tensor.device, ++ model_kwargs=model_kwargs, ++ negative_prompt_ids=negative_prompt_ids, ++ negative_prompt_attention_mask=negative_prompt_attention_mask, ++ ) ++ ++ # 9. prepare stopping criteria ++ prepared_stopping_criteria = self._get_stopping_criteria( ++ generation_config=generation_config, stopping_criteria=stopping_criteria, tokenizer=tokenizer, **kwargs ++ ) ++ ++ if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): ++ # 11. prepare logits warper ++ prepared_logits_warper = ( ++ self._get_logits_warper(generation_config, device=input_ids.device) ++ if generation_config.do_sample ++ else None ++ ) ++ ++ # 12. expand input_ids with `num_return_sequences` additional sequences per batch ++ input_ids, model_kwargs = self._expand_inputs_for_generation( ++ input_ids=input_ids, ++ expand_size=generation_config.num_return_sequences, ++ is_encoder_decoder=self.config.is_encoder_decoder, ++ **model_kwargs, ++ ) ++ # 13. run sample (it degenerates to greedy search when `generation_config.do_sample=False`) ++ result = self._sample( ++ input_ids, ++ logits_processor=prepared_logits_processor, ++ logits_warper=prepared_logits_warper, ++ stopping_criteria=prepared_stopping_criteria, ++ generation_config=generation_config, ++ **model_kwargs, ++ ) ++ return result ++ ++ def _sample( ++ self, ++ input_ids, ++ logits_processor, ++ stopping_criteria, ++ generation_config, ++ logits_warper = None, ++ **model_kwargs, ++ ): ++ # init values ++ pad_token_id = generation_config.pad_token_id ++ output_attentions = generation_config.output_attentions ++ output_hidden_states = generation_config.output_hidden_states ++ output_scores = generation_config.output_scores ++ output_logits = generation_config.output_logits ++ return_dict_in_generate = generation_config.return_dict_in_generate ++ has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria) ++ do_sample = generation_config.do_sample ++ if do_sample is True and not isinstance(logits_warper, LogitsProcessorList): ++ raise ValueError( ++ "`do_sample` is set to `True`, `logits_warper` must be a `LogitsProcessorList` instance (it is " ++ f"{logits_warper})." 
++ ) ++ ++ # init attention / hidden states / scores tuples ++ scores = () if (return_dict_in_generate and output_scores) else None ++ raw_logits = () if (return_dict_in_generate and output_logits) else None ++ decoder_attentions = () if (return_dict_in_generate and output_attentions) else None ++ cross_attentions = () if (return_dict_in_generate and output_attentions) else None ++ decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None ++ ++ # if model is an encoder-decoder, retrieve encoder attention weights and hidden states ++ if return_dict_in_generate and self.config.is_encoder_decoder: ++ encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None ++ encoder_hidden_states = ( ++ model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None ++ ) ++ ++ this_peer_finished = False ++ batch_size = input_ids.shape[0] ++ unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) ++ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) ++ ++ # keep track of which sequences are already finished ++ if self.is_mindie or self.config.architectures[0]=="T5ForConditionalGeneration": ++ num_layers = self.config.num_layers ++ num_heads = self.config.num_heads ++ d_kv = self.config.d_kv ++ model_kwargs["past_keys"] = [torch.randn(batch_size, num_heads, 0, d_kv).half().npu() for _ in range(num_layers)] ++ model_kwargs["past_values"] = [torch.randn(batch_size, num_heads, 0, d_kv).half().npu() for _ in range(num_layers)] ++ ++ ++ while self._has_unfinished_sequences(this_peer_finished, False, device=input_ids.device): ++ # prepare model inputs ++ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) ++ model_args = [model_kwargs["encoder_outputs"]["last_hidden_state"]] ++ model_args.extend(model_kwargs["past_cross_keys"]) ++ model_args.extend(model_kwargs["past_cross_values"]) ++ model_args.extend(model_inputs["past_keys"]) ++ model_args.extend(model_inputs["past_values"]) ++ model_args.append(model_inputs["attention_mask"]) ++ model_args.append(model_inputs["decoder_input_ids"]) ++ ++ # forward pass to get next token ++ outputs = self(*model_args) ++ outputs = Seq2SeqLMOutput(logits=outputs[0], ++ past_keys=outputs[1], ++ past_values=outputs[2]) ++ ++ # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration ++ # (the clone itself is always small) ++ next_token_logits = outputs.logits[:, -1, :].clone() ++ ++ # pre-process distribution ++ next_token_scores = logits_processor(input_ids, next_token_logits) ++ if do_sample: ++ next_token_scores = logits_warper(input_ids, next_token_scores) ++ ++ # Store scores, attentions and hidden_states when required ++ if return_dict_in_generate: ++ if output_scores: ++ scores += (next_token_scores,) ++ if output_logits: ++ raw_logits += (next_token_logits,) ++ if output_attentions: ++ decoder_attentions += ( ++ (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) ++ ) ++ if self.config.is_encoder_decoder: ++ cross_attentions += (outputs.cross_attentions,) ++ ++ if output_hidden_states: ++ decoder_hidden_states += ( ++ (outputs.decoder_hidden_states,) ++ if self.config.is_encoder_decoder ++ else (outputs.hidden_states,) ++ ) ++ ++ # token selection ++ if do_sample: ++ probs = nn.functional.softmax(next_token_scores, dim=-1) ++ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) ++ else: ++ 
next_tokens = torch.argmax(next_token_scores, dim=-1)
++
++ # finished sentences should have their next token be a padding token
++ if has_eos_stopping_criteria:
++ next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
++
++ # update generated ids, model inputs, and length for next step
++ input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
++ model_kwargs = self._update_model_kwargs_for_generation(
++ outputs,
++ model_kwargs,
++ is_encoder_decoder=self.config.is_encoder_decoder,
++ )
++ unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
++ this_peer_finished = unfinished_sequences.max() == 0
++ # This is needed to properly delete outputs.logits which may be very large for first iteration
++ # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration
++ del outputs
++ return input_ids
++
++ def invert_attention_mask(self, encoder_attention_mask):
++ """
++ Invert an attention mask (e.g., switches 0. and 1.).
++
++ Args:
++ encoder_attention_mask (`torch.Tensor`): An attention mask.
++
++ Returns:
++ `torch.Tensor`: The inverted attention mask.
++ """
++ if encoder_attention_mask.dim() == 3:
++ encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
++ if encoder_attention_mask.dim() == 2:
++ encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
++ # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
++ # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow
++ # /transformer/transformer_layers.py#L270
++ # encoder_extended_attention_mask = (encoder_extended_attention_mask ==
++ # encoder_extended_attention_mask.transpose(-1, -2))
++ encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
++ #encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min
++ encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000
++
++ return encoder_extended_attention_mask
++
++ @property
++ def device(self) -> torch.device:
++ """
++ `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
++ device).
++ """
++ return self.get_device()
++
++ def get_extended_attention_mask(
++ self, attention_mask, input_shape, device=None, dtype=None
++ ):
++ """
++ Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
++
++ Arguments:
++ attention_mask (`torch.Tensor`):
++ Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
++ input_shape (`Tuple[int]`):
++ The shape of the input to the model.
++
++ Returns:
++ `torch.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`.
++ """
++ if dtype is None:
++ dtype = self.dtype
++
++ if not (attention_mask.dim() == 2 and self.config.is_decoder):
++ # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder`
++ if device is not None:
++ warnings.warn(
++ "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
++ )
++ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
++ # ourselves in which case we just need to make it broadcastable to all heads.
++ if attention_mask.dim() == 3: ++ extended_attention_mask = attention_mask[:, None, :, :] ++ elif attention_mask.dim() == 2: ++ # Provided a padding mask of dimensions [batch_size, seq_length] ++ # - if the model is a decoder, apply a causal mask in addition to the padding mask ++ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] ++ if self.config.is_decoder: ++ extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( ++ input_shape, attention_mask, device ++ ) ++ else: ++ extended_attention_mask = attention_mask[:, None, None, :] ++ else: ++ raise ValueError( ++ f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ++ ) ++ ++ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for ++ # masked positions, this operation will create a tensor which is 0.0 for ++ # positions we want to attend and the dtype's smallest value for masked positions. ++ # Since we are adding it to the raw scores before the softmax, this is ++ # effectively the same as removing these entirely. ++ extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility ++ #extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min ++ extended_attention_mask = (1.0 - extended_attention_mask) * -1000 ++ return extended_attention_mask ++ ++ ++ + + @add_start_docstrings( + "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", +@@ -1967,7 +2793,6 @@ class T5EncoderModel(T5PreTrainedModel): + >>> last_hidden_states = outputs.last_hidden_state + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict +- + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, -- Gitee From 989e8f4c2c52a5b1ff1894bed74ad14426eeca6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 09:22:30 +0000 Subject: [PATCH 058/110] update MindIE/MindIE-Torch/built-in/T5/export_t5.py. 
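The export_t5.py update below drops the stacked, tuple-style past_key_values interface and re-exports the decoder with one flat tensor input per per-layer key/value cache, then passes `default_buffer_size_vec` to `mindietorch.compile` so each output gets a pre-sized buffer: FP16 buffers for the present key/value tensors and an FP32 buffer for the single-step logits. A minimal sketch of that sizing arithmetic, using illustrative values (`d_model=4096`, `vocab_size=32128`, `num_layers=24`, roughly t5-v1_1-xxl) that are assumptions here rather than numbers taken from the patch:

```python
import math

# Illustrative config values; export_t5.py reads the real ones from model.config.
max_batchsize, max_input_seq_len = 1, 256
d_model, vocab_size, num_layers = 4096, 32128, 24

# Present key/value outputs are FP16 (2 bytes per element), rounded up to whole MiB.
kv_buffer_mib = math.ceil(max_batchsize * max_input_seq_len * d_model * 2 / 1024 / 1024)  # -> 2
# The decoder returns logits for one step in FP32 (4 bytes per element).
logits_buffer_mib = math.ceil(max_batchsize * 1 * vocab_size * 4 / 1024 / 1024)           # -> 1

buffer = [kv_buffer_mib] * (2 * num_layers) + [logits_buffer_mib]
print(len(buffer), buffer[0], buffer[-1])  # 49 2 1
```

The list therefore has `2 * num_layers + 1` entries, one per decoder output (the per-layer present keys and values plus the single-step logits).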
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/export_t5.py | 93 +++++++++++--------- 1 file changed, 52 insertions(+), 41 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5.py b/MindIE/MindIE-Torch/built-in/T5/export_t5.py index af67451d69..e152265ae9 100644 --- a/MindIE/MindIE-Torch/built-in/T5/export_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/export_t5.py @@ -3,6 +3,7 @@ import torch import torch_npu import argparse import os +import math import mindietorch from transformers import T5ForConditionalGeneration @@ -58,17 +59,8 @@ class TextDecoderExport(torch.nn.Module): self.textdecoder_model = textdecoder_model def forward(self, - input_ids, - encoder_hidden_states, - encoder_attention_mask, - past_key_values, - past_cross_key_values): - return self.textdecoder_model(input_ids=input_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - past_cross_key_values=past_cross_key_values, - return_dict=True) + *args): + return self.textdecoder_model(*args) def export_textencoder(args, model, save_dir, batch_size): encoder_path = os.path.join(save_dir, "encoder") @@ -88,7 +80,6 @@ def export_textencoder(args, model, save_dir, batch_size): traced_model = torch.jit.load(traced_path).eval() inputs0 = [] - # inputs1 = [] inputs0.append(mindietorch.Input(min_shape = (1,1), max_shape= (args.max_batchsize, args.max_input_seq_len), dtype=torch.int64)) print("compiling encoder") compiled_model = mindietorch.compile( @@ -112,48 +103,70 @@ def export_textdecoder(args, model, save_dir, batch_size): model_path = args.model_path max_lenth = 120 if not os.path.exists(traced_path): - text_decoder = model.decoder - dummy_input = ( - torch.ones([1, 1], dtype=torch.int64).npu(), - torch.randn(1, 16, model.config.d_model).to(torch.float16).npu(), - torch.ones(1,16).npu(), - torch.randn(model.config.num_layers, 2, 1, model.config.num_heads, 1, model.config.d_kv).to(torch.float16).npu(), - torch.randn(model.config.num_layers, 2, 1, 16, model.config.d_kv*model.config.num_heads).to(torch.float16).npu() - ) - decoder = TextDecoderExport(text_decoder).npu() + text_decoder = model + all_past_keys = [torch.randn([1, model.config.num_heads, 1, model.config.d_kv]).to(torch.float16).npu()] * model.config.num_layers + all_past_values = [torch.randn([1, model.config.num_heads, 1, model.config.d_kv]).to(torch.float16).npu()] * model.config.num_layers + all_past_cross_keys = [torch.randn([1, 16, model.config.d_model]).to(torch.float16).npu()] * model.config.num_layers + all_past_cross_values = [torch.randn([1, 16, model.config.d_model]).to(torch.float16).npu()] * model.config.num_layers + dummy_input = [torch.randn(1, 16, model.config.d_model).to(torch.float16).npu()] + dummy_input.extend(all_past_cross_keys) + dummy_input.extend(all_past_cross_values) + dummy_input.extend(all_past_keys) + dummy_input.extend(all_past_values) + dummy_input.append(torch.ones(1,16).npu()) + dummy_input.append(torch.ones([1, 1], dtype=torch.int64).npu()) + decoder = TextDecoderExport(text_decoder).npu() decoder.eval() torch.jit.trace(decoder, dummy_input,strict=False).save(traced_path) if not os.path.exists(compiled_path): traced_model = torch.jit.load(traced_path).eval() print("compiling decoder") + input_info = [mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), + 
dtype=mindietorch.dtype.FLOAT16)] + past_cross_key_infos = [mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_cross_value_infos = [mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_key_infos = [mindietorch.Input(min_shape =(1, model.config.num_heads, 0, model.config.d_kv), + max_shape=(args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_value_infos = [mindietorch.Input(min_shape =(1, model.config.num_heads, 0, model.config.d_kv), + max_shape=(args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + decoder_input_ids_info = [mindietorch.Input(min_shape =(1, 1), + max_shape = (args.max_batchsize,1), + dtype=mindietorch.dtype.INT64)] + encoder_attention_mask_info = [mindietorch.Input(min_shape =(1, 1), + max_shape = (args.max_batchsize,args.max_input_seq_len), + dtype=mindietorch.dtype.INT64)] + input_info.extend(past_cross_key_infos) + input_info.extend(past_cross_value_infos) + input_info.extend(past_key_infos) + input_info.extend(past_value_infos) + input_info.extend(encoder_attention_mask_info) + input_info.extend(decoder_input_ids_info) + buffer = [] + for _ in range(2*model.config.num_layers): + buffer.append(math.ceil((args.max_batchsize * args.max_input_seq_len * model.config.d_model * 2) / 1024 / 1024)) + buffer_size0 = math.ceil((args.max_batchsize * 1 * model.config.vocab_size * 4) / 1024 / 1024) + buffer.append(buffer_size0) + print("buffer=",buffer) compiled_model = mindietorch.compile( traced_model, - inputs=[mindietorch.Input(min_shape =(1, 1), - max_shape = (args.max_batchsize,1), - dtype=mindietorch.dtype.INT64), - - mindietorch.Input(min_shape =(1, 1, model.config.d_model), - max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), - dtype=mindietorch.dtype.FLOAT16), - - mindietorch.Input(min_shape = (1,1), - max_shape =(args.max_batchsize,args.max_input_seq_len), - dtype=mindietorch.dtype.INT64), - mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, model.config.num_heads, 0, model.config.d_kv), - max_shape = (model.config.num_layers, 2, args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), - dtype=mindietorch.dtype.FLOAT16), - - mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, 1, model.config.d_kv*model.config.num_heads), - max_shape = (model.config.num_layers, 2, args.max_batchsize, args.max_input_seq_len, model.config.d_kv*model.config.num_heads), - dtype=mindietorch.dtype.FLOAT16)], + inputs=input_info, allow_tensor_replace_int=True, require_full_compilation=False, truncate_long_and_double=True, precision_policy=mindietorch.PrecisionPolicy.FP16, soc_version="Ascend910B4", + default_buffer_size_vec=buffer, optimization_level=0 ) compiled_model.save(compiled_path) + def main(): args = parse_arguments() @@ -175,7 +188,5 @@ def main(): print("export decoder_model done!") - - if __name__ == "__main__": main() -- Gitee From 1b0910e7859a1879c53ea3bbd8dbef4729845f99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 09:24:40 +0000 Subject: [PATCH 
059/110] update MindIE/MindIE-Torch/built-in/T5/main.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/main.py b/MindIE/MindIE-Torch/built-in/T5/main.py index e1ec51d66a..8ac34ceec5 100644 --- a/MindIE/MindIE-Torch/built-in/T5/main.py +++ b/MindIE/MindIE-Torch/built-in/T5/main.py @@ -1,7 +1,6 @@ import torch import time import argparse -import torch_npu from transformers import T5ForConditionalGeneration, AutoTokenizer, T5Config def parse_args(): -- Gitee From b99351f5b9be33faaf1c5183962b80f63d90128d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 09:27:02 +0000 Subject: [PATCH 060/110] add MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/T5_modeling_t5_patch.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py new file mode 100644 index 0000000000..e304f4f9f2 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -0,0 +1,28 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import transformers + + +def main(): + transformers_path = transformers.__path__ + transformers_version = transformers.__version__ + + assert transformers_version =='4.42.0', "expectation transformers==4.42.0" + os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') + + +if __name__ == '__main__': + main() -- Gitee From f3a88de8c3514363f2a01044f8788750ce5ebd31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 09:35:51 +0000 Subject: [PATCH 061/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index f518880708..b677c10796 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -67,21 +67,9 @@ 3. 
代码修改,在T5目录下 执行命令: - - ```bash - python T5_modeling_outputs_patch.py - ``` - ```bash python T5_modeling_t5_patch.py ``` - - ```bash - python T5_modeling_utils_patch.py - ``` - ```bash - python T5_utils_patch.py - ``` 4.导出mindietorch模型 ```bash python export_t5.py --output_dir {output_path} --model_path {model_path} --max_batchsize {max_batchsize} --max_input_seq_len {max_input_seq_len} --device_id {device_id} -- Gitee From 29e928df2600fe854fc83dc8fabe7dc583879101 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 10:37:15 +0000 Subject: [PATCH 062/110] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20MT5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/MT5/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/MT5/.keep diff --git a/MindIE/MindIE-Torch/built-in/MT5/.keep b/MindIE/MindIE-Torch/built-in/MT5/.keep new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From b35c9a5cd3710ac2f6a65f2f20533c02b6c57942 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 11:27:16 +0000 Subject: [PATCH 063/110] add MindIE/MindIE-Torch/built-in/MT5/export_mt5.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../MindIE-Torch/built-in/MT5/export_mt5.py | 192 ++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/MT5/export_mt5.py diff --git a/MindIE/MindIE-Torch/built-in/MT5/export_mt5.py b/MindIE/MindIE-Torch/built-in/MT5/export_mt5.py new file mode 100644 index 0000000000..138728fc16 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/MT5/export_mt5.py @@ -0,0 +1,192 @@ + +import torch +import torch_npu +import argparse +import os +import math +import mindietorch +from transformers import MT5ForConditionalGeneration + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--output_dir", + type=str, + default="./models", + help="save dir" + ) + parser.add_argument( + "--model_path", + type=str, + default="./MT5-Small", + help="T5 model path" + ) + parser.add_argument( + "--max_batchsize", + type=int, + default=1, + help="max batchsize when running" + ) + + parser.add_argument( + "--max_input_seq_len", + type=int, + default=256, + help="max input_sequence length when running" + ) + + + parser.add_argument( + "--device_id", + type=int, + default=0, + help="npu device id" + ) + return parser.parse_args() + + +class TextEncoderExport(torch.nn.Module): + def __init__(self, textencoder_model): + super(TextEncoderExport, self).__init__() + self.textencoder_model = textencoder_model + + def forward(self, input_ids): + return self.textencoder_model(input_ids=input_ids) + +class TextDecoderExport(torch.nn.Module): + def __init__(self, textdecoder_model): + super(TextDecoderExport, self).__init__() + self.textdecoder_model = textdecoder_model + + def forward(self, + *args): + return self.textdecoder_model(*args) + +def export_textencoder(args, model, save_dir, batch_size): + encoder_path = os.path.join(save_dir, "encoder") + if not os.path.exists(encoder_path): + os.makedirs(encoder_path, mode=0o640) + traced_path = os.path.join(encoder_path, "encoder.pt") + compiled_path = os.path.join(encoder_path, "encoder_compiled.pt") + if not os.path.exists(traced_path): + text_encoder = model.encoder + dummy_input = ( + torch.ones([1, 128], dtype=torch.int64).npu() + ) + encoder = 
TextEncoderExport(text_encoder) + encoder.eval() + torch.jit.trace(encoder, dummy_input, strict=False).save(traced_path) + if not os.path.exists(compiled_path): + traced_model = torch.jit.load(traced_path).eval() + + inputs0 = [] + inputs0.append(mindietorch.Input(min_shape = (1,1), max_shape= (args.max_batchsize, args.max_input_seq_len), dtype=torch.int64)) + print("compiling encoder") + compiled_model = mindietorch.compile( + traced_model, + inputs=inputs0, + allow_tensor_replace_int=True, + require_full_compilation=False, + truncate_long_and_double=True, + precision_policy=mindietorch.PrecisionPolicy.FP16, + soc_version="Ascend910B4", + optimization_level=0 + ) + compiled_model.save(compiled_path) + +def export_textdecoder(args, model, save_dir, batch_size): + decoder_path = os.path.join(save_dir, "decoder") + if not os.path.exists(decoder_path): + os.makedirs(decoder_path, mode=0o640) + traced_path = os.path.join(decoder_path, "decoder.pt") + compiled_path = os.path.join(decoder_path, "decoder_compiled.pt") + model_path = args.model_path + max_lenth = 120 + if not os.path.exists(traced_path): + text_decoder = model + all_past_keys = [torch.randn([1, model.config.num_heads, 1, model.config.d_kv]).to(torch.float16).npu()] * model.config.num_layers + all_past_values = [torch.randn([1, model.config.num_heads, 1, model.config.d_kv]).to(torch.float16).npu()] * model.config.num_layers + all_past_cross_keys = [torch.randn([1, 16, model.config.num_heads * model.config.d_kv]).to(torch.float16).npu()] * model.config.num_layers + all_past_cross_values = [torch.randn([1, 16, model.config.num_heads * model.config.d_kv]).to(torch.float16).npu()] * model.config.num_layers + dummy_input = [torch.randn(1, 16, model.config.d_model).to(torch.float16).npu()] + dummy_input.extend(all_past_cross_keys) + dummy_input.extend(all_past_cross_values) + dummy_input.extend(all_past_keys) + dummy_input.extend(all_past_values) + dummy_input.append(torch.ones(1,16).npu()) + dummy_input.append(torch.ones([1, 1], dtype=torch.int64).npu()) + decoder = TextDecoderExport(text_decoder).npu() + decoder.eval() + torch.jit.trace(decoder, dummy_input,strict=False).save(traced_path) + if not os.path.exists(compiled_path): + traced_model = torch.jit.load(traced_path).eval() + print("compiling decoder") + input_info = [mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), + dtype=mindietorch.dtype.FLOAT16)] + past_cross_key_infos = [mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_cross_value_infos = [mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_key_infos = [mindietorch.Input(min_shape =(1, model.config.num_heads, 0, model.config.d_kv), + max_shape=(args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_value_infos = [mindietorch.Input(min_shape =(1, model.config.num_heads, 0, model.config.d_kv), + max_shape=(args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + decoder_input_ids_info = [mindietorch.Input(min_shape =(1, 1), + max_shape 
= (args.max_batchsize,1), + dtype=mindietorch.dtype.INT64)] + encoder_attention_mask_info = [mindietorch.Input(min_shape =(1, 1), + max_shape = (args.max_batchsize,args.max_input_seq_len), + dtype=mindietorch.dtype.INT64)] + input_info.extend(past_cross_key_infos) + input_info.extend(past_cross_value_infos) + input_info.extend(past_key_infos) + input_info.extend(past_value_infos) + input_info.extend(encoder_attention_mask_info) + input_info.extend(decoder_input_ids_info) + buffer = [] + for _ in range(2*model.config.num_layers): + buffer.append(math.ceil((args.max_batchsize * args.max_input_seq_len * model.config.d_model * 2) / 1024 / 1024)) + buffer_size0 = math.ceil((args.max_batchsize * 1 * model.config.vocab_size * 4) / 1024 / 1024) + buffer.append(buffer_size0) + print("buffer=",buffer) + compiled_model = mindietorch.compile( + traced_model, + inputs=input_info, + allow_tensor_replace_int=True, + require_full_compilation=False, + truncate_long_and_double=True, + precision_policy=mindietorch.PrecisionPolicy.FP16, + soc_version="Ascend910B4", + default_buffer_size_vec=buffer, + optimization_level=0 + ) + compiled_model.save(compiled_path) + + +def main(): + args = parse_arguments() + device_id = args.device_id + save_dir = args.output_dir + torch.npu.set_device(device_id) + batch_size = 1 + model = MT5ForConditionalGeneration.from_pretrained(args.model_path, torch_dtype=torch.float).npu() + encoder_path = os.path.join(save_dir, "encoder") + compiled_path = os.path.join(encoder_path, "encoder_compiled.pt") + if not os.path.exists(compiled_path): + export_textencoder(args, model, save_dir, batch_size) + print("export encoder_model done!") + + decoder_path = os.path.join(save_dir, "decoder") + compiled_path = os.path.join(decoder_path, "decoder_compiled.pt") + if not os.path.exists(compiled_path): + export_textdecoder(args, model, save_dir, batch_size) + print("export decoder_model done!") + + +if __name__ == "__main__": + main() -- Gitee From fae9c1dfe5dfbc790126b3ae40f76a1b507a3c89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 11:28:22 +0000 Subject: [PATCH 064/110] add MindIE/MindIE-Torch/built-in/MT5/test_mt5.py. 
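The test script added below checks the compiled MT5 encoder against the eager float16 encoder by flattening both outputs and requiring a cosine similarity of at least 0.99. A self-contained sketch of that comparison, with a hypothetical helper name and the same threshold (plain CPU tensors stand in for the NPU outputs):

```python
import torch

def cosine_check(golden: torch.Tensor, actual: torch.Tensor, threshold: float = 0.99) -> bool:
    """Return True when the flattened outputs are sufficiently aligned."""
    sim = torch.cosine_similarity(
        golden.float().cpu().flatten(),
        actual.float().cpu().flatten(),
        dim=0,
    )
    return bool(sim >= threshold)

# Usage: a reference output and a slightly perturbed copy pass the check.
golden = torch.randn(8, 10, 512)
actual = golden + 0.01 * torch.randn_like(golden)
assert cosine_check(golden, actual)
```

A flattened cosine similarity is scale-invariant, so this check mainly catches layout mistakes or large precision regressions rather than small elementwise drift.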
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 甄文奇
---
 MindIE/MindIE-Torch/built-in/MT5/test_mt5.py | 48 ++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 MindIE/MindIE-Torch/built-in/MT5/test_mt5.py

diff --git a/MindIE/MindIE-Torch/built-in/MT5/test_mt5.py b/MindIE/MindIE-Torch/built-in/MT5/test_mt5.py
new file mode 100644
index 0000000000..c73905875e
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/MT5/test_mt5.py
@@ -0,0 +1,48 @@
+import torch
+import time
+import argparse
+import torch_npu
+from transformers import MT5ForConditionalGeneration, AutoTokenizer, MT5Config
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--hf_model_path", type=str, required=True)
+
+ parser.add_argument("--encoder_aie_path", type=str, required=True)
+ parser.add_argument("--decoder_aie_path", type=str, required=True)
+
+ parser.add_argument("--device_id", type=int, help="NPU device id", default=0)
+
+ args = parser.parse_args()
+ return args
+
+def main():
+ args = parse_args()
+ torch.npu.set_device(args.device_id)
+ model = MT5ForConditionalGeneration.from_pretrained(args.hf_model_path, torch_dtype=torch.float16).npu()
+ encoder = model.encoder
+ decoder = model.decoder
+ encoder_input = torch.randint(0,2000,(8,10), dtype=torch.int64).npu()
+ t5_config = MT5Config.from_pretrained(args.hf_model_path)
+
+ encoder_output = encoder(encoder_input)[0]
+ model = MT5ForConditionalGeneration(config=t5_config,
+ encoder_path=args.encoder_aie_path,
+ decoder_path=args.decoder_aie_path,
+ device_id=args.device_id).half().npu()
+
+ encoder_mindie = model.encoder_mindie
+ decoder_mindie = model.decoder_mindie
+ mindie_stream = model.stream
+ with torch.npu.stream(mindie_stream): # set stream
+ mindie_encoder_output = encoder_mindie(encoder_input)[0]
+ mindie_stream.synchronize() # synchronize
+ if (torch.cosine_similarity(encoder_output.cpu().flatten(), mindie_encoder_output.cpu().flatten(),dim=0)) < 0.99:
+ print("encoder precision failed")
+ else:
+ print("test OK")
+
+
+if __name__ == "__main__":
+ main()
+
-- Gitee

From c4b40d99040f21c419bbaa69565352c89833b6de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= 
Date: Wed, 11 Sep 2024 11:33:36 +0000
Subject: [PATCH 065/110] add MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch.
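The modeling_mt5.patch added below mirrors the earlier T5 patch: each layer's nested `past_key_value` tuple is split into separate `past_keys`, `past_values`, `past_cross_keys` and `past_cross_values` arguments so the traced decoder only ever sees flat tensor inputs, and a local `Seq2SeqLMOutput` gains `past_keys`/`past_values` fields to carry them back out. A minimal sketch of converting between the stock nested layout and the flat lists, assuming the standard Hugging Face 4-tuple-per-layer legacy cache format; the helper names are illustrative and not part of the patch:

```python
from typing import Tuple
import torch

# One entry per layer: (self_key, self_value, cross_key, cross_value)
LegacyCache = Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], ...]

def split_legacy_cache(past_key_values: LegacyCache):
    """Split the nested layout into four flat per-layer lists."""
    past_keys = [layer[0] for layer in past_key_values]
    past_values = [layer[1] for layer in past_key_values]
    past_cross_keys = [layer[2] for layer in past_key_values]
    past_cross_values = [layer[3] for layer in past_key_values]
    return past_keys, past_values, past_cross_keys, past_cross_values

def merge_to_legacy_cache(past_keys, past_values, past_cross_keys, past_cross_values) -> LegacyCache:
    """Rebuild the nested layout from the flat lists."""
    return tuple(
        (k, v, ck, cv)
        for k, v, ck, cv in zip(past_keys, past_values, past_cross_keys, past_cross_values)
    )

# Round-trip with dummy shapes (batch=1, heads=8, seq=4, d_kv=64, 6 layers).
legacy = tuple(tuple(torch.zeros(1, 8, 4, 64) for _ in range(4)) for _ in range(6))
flat = split_legacy_cache(legacy)
roundtrip = merge_to_legacy_cache(*flat)
assert all(
    torch.equal(a, b)
    for layer_a, layer_b in zip(roundtrip, legacy)
    for a, b in zip(layer_a, layer_b)
)
```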
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/MT5/modeling_mt5.patch | 1557 +++++++++++++++++ 1 file changed, 1557 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch diff --git a/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch b/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch new file mode 100644 index 0000000000..0df148b2ea --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch @@ -0,0 +1,1557 @@ +diff --git a/modeling_mt5_origin.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py +index 1336b9196..5b94d69c7 100644 +--- a/modeling_mt5_origin.py ++++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py +@@ -19,22 +19,26 @@ import math + import os + import warnings + from typing import List, Optional, Tuple, Union +- ++from dataclasses import dataclass + import torch + from torch import nn + from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss ++# import torch_npu ++import mindietorch ++ ++ ++ + + from ...activations import ACT2FN + from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, +- Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, + TokenClassifierOutput, + ) +-from ...modeling_utils import PreTrainedModel ++from ...modeling_utils import PreTrainedModel,ModuleUtilsMixin + from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer + from ...utils import ( + DUMMY_INPUTS, +@@ -47,8 +51,44 @@ from ...utils import ( + ) + from ...utils.model_parallel_utils import assert_device_map, get_device_map + from .configuration_mt5 import MT5Config ++from transformers.generation.logits_process import LogitsProcessorList ++from transformers.generation.stopping_criteria import StoppingCriteriaList ++from transformers.generation.configuration_utils import GenerationMode ++from transformers.utils.generic import ModelOutput + + ++@dataclass ++class Seq2SeqLMOutput(ModelOutput): ++ """ ++ Base class for model's outputs, with potential hidden states and attentions. ++ ++ Args: ++ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): ++ Sequence of hidden-states at the output of the last layer of the model. ++ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): ++ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + ++ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. ++ ++ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. ++ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): ++ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, ++ sequence_length)`. ++ ++ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention ++ heads. 
++ """ ++ loss: Optional[torch.FloatTensor] = None ++ logits: torch.FloatTensor = None ++ past_keys: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ past_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None ++ decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None ++ cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None ++ encoder_last_hidden_state: Optional[torch.FloatTensor] = None ++ encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None ++ encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None ++ + logger = logging.get_logger(__name__) + + _CONFIG_FOR_DOC = "MT5Config" +@@ -323,7 +363,10 @@ class MT5Attention(nn.Module): + mask=None, + key_value_states=None, + position_bias=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, ++ past_cross_key=None, ++ past_cross_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, +@@ -339,17 +382,15 @@ class MT5Attention(nn.Module): + + real_seq_length = seq_length + +- if past_key_value is not None: +- if len(past_key_value) != 2: +- raise ValueError( +- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" +- ) +- real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length ++ if past_key is not None: ++ real_seq_length += past_key.shape[2] if query_length is None else query_length + + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] + + def shape(states): + """projection""" ++ # import pdb ++ # pdb.set_trace() + return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + def unshape(states): +@@ -368,16 +409,17 @@ class MT5Attention(nn.Module): + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: ++ past_key_value = shape(past_key_value) + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) + hidden_states = torch.cat([past_key_value, hidden_states], dim=2) +- elif past_key_value.shape[2] != key_value_states.shape[1]: +- # checking that the `sequence_length` of the `past_key_value` is the same as +- # the provided `key_value_states` to support prefix tuning +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(key_value_states)) ++ # elif past_key_value.shape[2] != key_value_states.shape[1]: ++ # # checking that the `sequence_length` of the `past_key_value` is the same as ++ # # the provided `key_value_states` to support prefix tuning ++ # # cross-attn ++ # # (batch_size, n_heads, seq_length, dim_per_head) ++ # hidden_states = shape(proj_layer(key_value_states)) + else: + # cross-attn + hidden_states = past_key_value +@@ -388,10 +430,10 @@ class MT5Attention(nn.Module): + + # get key/value states + key_states = project( +- hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None ++ hidden_states, self.k, key_value_states, past_key if past_key is not None else None + ) + value_states = project( +- hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None ++ hidden_states, self.v, key_value_states, past_value if past_value is not None else None + ) + + # compute scores +@@ -411,7 +453,7 @@ class MT5Attention(nn.Module): + + # if key and values are already 
calculated + # we want only the last query position bias +- if past_key_value is not None: ++ if past_key is not None: + position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + + if mask is not None: +@@ -439,14 +481,124 @@ class MT5Attention(nn.Module): + attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) + +- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None +- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- ++ # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None ++ present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None ++ present_value_state = (value_states.half(),) if (self.is_decoder and use_cache) else None ++ outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) ++ + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + ++class MT5SelfAttention(MT5Attention): ++ def __init__(self, config: MT5Config, has_relative_attention_bias=False): ++ super().__init__(config, has_relative_attention_bias) ++ ++ def forward( ++ self, ++ hidden_states, ++ mask=None, ++ position_bias=None, ++ past_key=None, ++ past_value=None, ++ layer_head_mask=None, ++ use_cache=False, ++ output_attentions=False, ++ ): ++ """ ++ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). ++ """ ++ # Input is (batch_size, seq_length, dim) ++ # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) ++ # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) ++ batch_size, seq_length = hidden_states.shape[:2] ++ ++ real_seq_length = seq_length ++ ++ if past_key is not None: ++ real_seq_length += past_key.shape[2] ++ key_length = real_seq_length ++ def shape(states): ++ """projection""" ++ return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) ++ ++ def unshape(states): ++ """reshape""" ++ return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) ++ ++ def project(hidden_states, proj_layer, past_key_value): ++ """projects hidden states correctly to key/query states""" ++ if past_key_value is None: ++ # cross-attn ++ # (batch_size, n_heads, seq_length, dim_per_head) ++ hidden_states = shape(proj_layer(hidden_states)) ++ ++ if past_key_value is not None: ++ hidden_states = shape(proj_layer(hidden_states)) ++ hidden_states = torch.cat([past_key_value, hidden_states], dim=2) ++ return hidden_states ++ ++ # get query states ++ query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) ++ ++ # get key/value states ++ key_states = project( ++ hidden_states, self.k, past_key if past_key is not None else None ++ ) ++ value_states = project( ++ hidden_states, self.v, past_value if past_value is not None else None ++ ) ++ # compute scores ++ scores = torch.matmul( ++ query_states, key_states.transpose(3, 2) ++ ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 ++ if position_bias is None: ++ if not self.has_relative_attention_bias: ++ position_bias = torch.zeros( ++ (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype ++ ) ++ if self.gradient_checkpointing and self.training: ++ position_bias.requires_grad = True ++ else: ++ position_bias = 
self.compute_bias(real_seq_length, key_length, device=scores.device) ++ ++ # if key and values are already calculated ++ # we want only the last query position bias ++ if past_key is not None: ++ position_bias = position_bias[:, :, -hidden_states.size(1) :, :] ++ if mask is not None: ++ position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) ++ ++ if self.pruned_heads: ++ mask = torch.ones(position_bias.shape[1]) ++ mask[list(self.pruned_heads)] = 0 ++ position_bias_masked = position_bias[:, mask.bool()] ++ else: ++ position_bias_masked = position_bias ++ scores += position_bias_masked ++ attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( ++ scores ++ ) # (batch_size, n_heads, seq_length, key_length) ++ attn_weights = nn.functional.dropout( ++ attn_weights, p=self.dropout, training=self.training ++ ) # (batch_size, n_heads, seq_length, key_length) ++ ++ # Mask heads if we want to ++ if layer_head_mask is not None: ++ attn_weights = attn_weights * layer_head_mask ++ ++ attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) ++ attn_output = self.o(attn_output) ++ ++ # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None ++ present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None ++ present_value_state = (value_states.half(), ) if (self.is_decoder and use_cache) else None ++ outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) ++ if output_attentions: ++ outputs = outputs + (attn_weights,) ++ return outputs ++ + # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->MT5 + class MT5LayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): +@@ -461,7 +613,8 @@ class MT5LayerSelfAttention(nn.Module): + attention_mask=None, + position_bias=None, + layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, + use_cache=False, + output_attentions=False, + ): +@@ -471,7 +624,8 @@ class MT5LayerSelfAttention(nn.Module): + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -495,7 +649,8 @@ class MT5LayerCrossAttention(nn.Module): + attention_mask=None, + position_bias=None, + layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, + use_cache=False, + query_length=None, + output_attentions=False, +@@ -507,7 +662,8 @@ class MT5LayerCrossAttention(nn.Module): + key_value_states=key_value_states, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, + use_cache=use_cache, + query_length=query_length, + output_attentions=output_attentions, +@@ -539,39 +695,34 @@ class MT5Block(nn.Module): + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, ++ past_cross_key=None, ++ past_cross_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): +- if past_key_value is not None: +- if not self.is_decoder: +- logger.warning("`past_key_values` is passed to the encoder. 
Please make sure this is intended.") +- expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 +- +- if len(past_key_value) != expected_num_past_key_values: +- raise ValueError( +- f"There should be {expected_num_past_key_values} past states. " +- f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" +- f"Got {len(past_key_value)} past key / value states" +- ) +- +- self_attn_past_key_value = past_key_value[:2] +- cross_attn_past_key_value = past_key_value[2:] ++ if past_key is not None: ++ self_attn_past_key = past_key ++ self_attn_past_value = past_value ++ cross_attn_past_key = past_cross_key ++ cross_attn_past_value = past_cross_value + else: +- self_attn_past_key_value, cross_attn_past_key_value = None, None ++ self_attn_past_key, self_attn_past_value, cross_attn_past_key, cross_attn_past_value = None, None, None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=self_attn_past_key_value, ++ past_key=self_attn_past_key, ++ past_value=self_attn_past_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +- hidden_states, present_key_value_state = self_attention_outputs[:2] +- attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights ++ hidden_states, present_key_state, present_value_state = self_attention_outputs[:3] ++ attention_outputs = self_attention_outputs[3:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: +@@ -586,8 +737,8 @@ class MT5Block(nn.Module): + if do_cross_attention: + # the actual query length is unknown for cross attention + # if using past key value states. 
Need to inject it here +- if present_key_value_state is not None: +- query_length = present_key_value_state[0].shape[2] ++ if present_key_state is not None: ++ query_length = present_key_state[0].shape[2] + else: + query_length = None + +@@ -597,7 +748,8 @@ class MT5Block(nn.Module): + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, +- past_key_value=cross_attn_past_key_value, ++ past_key=cross_attn_past_key, ++ past_value=cross_attn_past_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, +@@ -614,11 +766,9 @@ class MT5Block(nn.Module): + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states +- if present_key_value_state is not None: +- present_key_value_state = present_key_value_state + cross_attention_outputs[1] +- ++ # cross_attn_past_key_values = cross_attention_outputs[1] + # Keep cross-attention outputs and relative position weights +- attention_outputs = attention_outputs + cross_attention_outputs[2:] ++ attention_outputs = attention_outputs + cross_attention_outputs[3:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) +@@ -635,7 +785,7 @@ class MT5Block(nn.Module): + outputs = (hidden_states,) + + if use_cache: +- outputs = outputs + (present_key_value_state,) + attention_outputs ++ outputs = outputs + (present_key_state,) +(present_value_state,)+ attention_outputs + else: + outputs = outputs + attention_outputs + +@@ -884,11 +1034,15 @@ class MT5PreTrainedModel(PreTrainedModel): + + # Copied from transformers.models.t5.modeling_t5.T5Stack with T5->MT5 + class MT5Stack(MT5PreTrainedModel): +- def __init__(self, config, embed_tokens=None): ++ def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskey=None, encodecrossvalue=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder ++ self.lm_head=lm_head ++ self.encodecrosskey = encodecrosskey ++ self.encodecrossvalue = encodecrossvalue ++ self.model_dim = config.d_model + + self.block = nn.ModuleList( + [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] +@@ -953,20 +1107,63 @@ class MT5Stack(MT5PreTrainedModel): + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + ++ def invert_attention_mask(self, encoder_attention_mask): ++ if encoder_attention_mask.dim() == 3: ++ encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] ++ if encoder_attention_mask.dim() == 2: ++ encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] ++ encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility ++ ++ encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 ++ ++ return encoder_extended_attention_mask ++ ++ def get_extended_attention_mask( ++ self, attention_mask, input_shape, device=None, dtype=None ++ ): ++ if dtype is None: ++ dtype = self.dtype ++ ++ if not (attention_mask.dim() == 2 and self.config.is_decoder): ++ if device is not None: ++ warnings.warn( ++ "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning ++ ) ++ if attention_mask.dim() == 3: ++ extended_attention_mask = attention_mask[:, None, :, :] ++ elif attention_mask.dim() == 2: ++ if self.config.is_decoder: ++ extended_attention_mask = 
ModuleUtilsMixin.create_extended_attention_mask_for_decoder( ++ input_shape, attention_mask, device ++ ) ++ else: ++ extended_attention_mask = attention_mask[:, None, None, :] ++ else: ++ raise ValueError( ++ f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ++ ) ++ extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility ++ extended_attention_mask = (1.0 - extended_attention_mask) * -1000 ++ return extended_attention_mask ++ + def forward( + self, + input_ids=None, +- attention_mask=None, + encoder_hidden_states=None, ++ past_keys=None, ++ past_values=None, ++ past_cross_keys=None, ++ past_cross_values=None, + encoder_attention_mask=None, ++ attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, +- past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, ++ **model_kwargs + ): + # Model parallel + if self.model_parallel: +@@ -985,8 +1182,10 @@ class MT5Stack(MT5PreTrainedModel): + f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" + ) + elif input_ids is not None: ++ + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) ++ input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: +@@ -999,18 +1198,19 @@ class MT5Stack(MT5PreTrainedModel): + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape +- + # required mask seq length can be calculated via length of past +- mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length ++ mask_seq_length = past_keys[0].shape[2] + seq_length if past_keys is not None else seq_length + + if use_cache is True: + if not self.is_decoder: + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") + + # initialize past_key_values with `None` if past does not exist +- if past_key_values is None: +- past_key_values = [None] * len(self.block) +- ++ if not self.is_decoder: ++ past_keys = [None] * len(self.block) ++ past_values = [None] * len(self.block) ++ past_cross_keys = [None] * len(self.block) ++ past_cross_values = [None] * len(self.block) + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + +@@ -1041,7 +1241,8 @@ class MT5Stack(MT5PreTrainedModel): + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) +- present_key_value_states = () if use_cache else None ++ present_key_states = () if use_cache else None ++ present_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None +@@ -1049,8 +1250,8 @@ class MT5Stack(MT5PreTrainedModel): + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) +- +- for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): ++ # for i, layer_module in enumerate(self.block): ++ for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): + layer_head_mask = head_mask[i] + 
cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel +@@ -1099,7 +1300,10 @@ class MT5Stack(MT5PreTrainedModel): + encoder_decoder_position_bias=encoder_decoder_position_bias, + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, ++ past_cross_key=past_cross_key, ++ past_cross_value=past_cross_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -1107,19 +1311,20 @@ class MT5Stack(MT5PreTrainedModel): + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: +- layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] ++ layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] + +- hidden_states, present_key_value_state = layer_outputs[:2] ++ hidden_states, present_key_state, present_value_state = layer_outputs[:3] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) +- position_bias = layer_outputs[2] ++ position_bias = layer_outputs[3] + if self.is_decoder and encoder_hidden_states is not None: +- encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] ++ encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] + # append next layer key value states + if use_cache: +- present_key_value_states = present_key_value_states + (present_key_value_state,) ++ present_key_states = present_key_states + present_key_state ++ present_value_states = present_value_states + present_value_state + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) +@@ -1133,7 +1338,7 @@ class MT5Stack(MT5PreTrainedModel): + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) +- hidden_states = self.dropout(hidden_states) ++ hidden_states = self.dropout(hidden_states).half() + + # Add last layer + if output_hidden_states: +@@ -1151,13 +1356,216 @@ class MT5Stack(MT5PreTrainedModel): + ] + if v is not None + ) +- return BaseModelOutputWithPastAndCrossAttentions( +- last_hidden_state=hidden_states, +- past_key_values=present_key_value_states, +- hidden_states=all_hidden_states, +- attentions=all_attentions, +- cross_attentions=all_cross_attentions, ++ if not self.is_decoder: ++ cross_keys = None ++ cross_values = None ++ if self.encodecrosskey: ++ cross_keys = self.encodecrosskey(hidden_states) ++ if self.encodecrossvalue: ++ cross_values = self.encodecrossvalue(hidden_states) ++ return tuple((hidden_states, cross_keys, cross_values)) ++ lm_logits = None ++ if self.is_decoder: ++ if self.config.tie_word_embeddings: ++ hidden_states = hidden_states * (self.model_dim ** -0.5) ++ lm_logits = self.lm_head(hidden_states) ++ return tuple((lm_logits, present_key_states, present_value_states)) ++ ++ ++class MT5Stack_Encoder(MT5PreTrainedModel): ++ def __init__(self, config, embed_tokens=None, encodecrosskey=None, encodecrossvalue=None): ++ super().__init__(config) ++ self.embed_tokens = embed_tokens ++ self.is_decoder = config.is_decoder ++ self.encodecrosskey = encodecrosskey ++ self.encodecrossvalue = encodecrossvalue ++ self.model_dim = config.d_model ++ ++ self.block = 
nn.ModuleList( ++ [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + ) ++ self.final_layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) ++ self.dropout = nn.Dropout(config.dropout_rate) ++ ++ # Initialize weights and apply final processing ++ self.post_init() ++ # Model parallel ++ self.model_parallel = False ++ self.device_map = None ++ self.gradient_checkpointing = False ++ ++ def get_input_embeddings(self): ++ return self.embed_tokens ++ ++ def set_input_embeddings(self, new_embeddings): ++ self.embed_tokens = new_embeddings ++ ++ def get_extended_attention_mask( ++ self, attention_mask, input_shape, device=None, dtype=None ++ ): ++ if dtype is None: ++ dtype = self.dtype ++ ++ if not (attention_mask.dim() == 2 and self.config.is_decoder): ++ if device is not None: ++ warnings.warn( ++ "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning ++ ) ++ if attention_mask.dim() == 3: ++ extended_attention_mask = attention_mask[:, None, :, :] ++ elif attention_mask.dim() == 2: ++ if self.config.is_decoder: ++ extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( ++ input_shape, attention_mask, device ++ ) ++ else: ++ extended_attention_mask = attention_mask[:, None, None, :] ++ else: ++ raise ValueError( ++ f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ++ ) ++ extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility ++ extended_attention_mask = (1.0 - extended_attention_mask) * -1000 ++ return extended_attention_mask ++ ++ def forward( ++ self, ++ input_ids=None, ++ attention_mask=None, ++ head_mask=None, ++ cross_attn_head_mask=None, ++ use_cache=None, ++ output_attentions=None, ++ output_hidden_states=None, ++ return_dict=None, ++ **model_kwargs ++ ): ++ # Model parallel ++ use_cache = use_cache if use_cache is not None else self.config.use_cache ++ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions ++ output_hidden_states = ( ++ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ++ ) ++ return_dict = return_dict if return_dict is not None else self.config.use_return_dict ++ ++ input_shape = input_ids.size() ++ input_ids = input_ids.view(-1, input_shape[-1]) ++ ++ inputs_embeds = self.embed_tokens(input_ids) ++ ++ batch_size, seq_length = input_shape ++ # required mask seq length can be calculated via length of past ++ mask_seq_length = seq_length ++ ++ # initialize past_key_values with `None` if past does not exist ++ past_keys = [None] * len(self.block) ++ past_values = [None] * len(self.block) ++ past_cross_keys = [None] * len(self.block) ++ past_cross_values = [None] * len(self.block) ++ if attention_mask is None: ++ attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) ++ ++ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] ++ # ourselves in which case we just need to make it broadcastable to all heads. 
++ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) ++ ++ # If a 2D or 3D attention mask is provided for the cross-attention ++ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] ++ ++ encoder_extended_attention_mask = None ++ ++ # Prepare head mask if needed ++ head_mask = self.get_head_mask(head_mask, self.config.num_layers) ++ cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) ++ present_key_states = () if use_cache else None ++ present_value_states = () if use_cache else None ++ all_hidden_states = () if output_hidden_states else None ++ all_attentions = () if output_attentions else None ++ all_cross_attentions = () if (output_attentions and self.is_decoder) else None ++ position_bias = None ++ encoder_decoder_position_bias = None ++ ++ hidden_states = self.dropout(inputs_embeds) ++ for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): ++ layer_head_mask = head_mask[i] ++ cross_attn_layer_head_mask = cross_attn_head_mask[i] ++ if output_hidden_states: ++ all_hidden_states = all_hidden_states + (hidden_states,) ++ ++ layer_outputs = layer_module( ++ hidden_states, ++ attention_mask=extended_attention_mask, ++ position_bias=position_bias, ++ encoder_hidden_states=None, ++ encoder_attention_mask=encoder_extended_attention_mask, ++ encoder_decoder_position_bias=encoder_decoder_position_bias, ++ layer_head_mask=layer_head_mask, ++ cross_attn_layer_head_mask=cross_attn_layer_head_mask, ++ past_key=past_key, ++ past_value=past_value, ++ past_cross_key=past_cross_key, ++ past_cross_value=past_cross_value, ++ use_cache=use_cache, ++ output_attentions=output_attentions, ++ ) ++ ++ # layer_outputs is a tuple with: ++ # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) ++ if use_cache is False: ++ layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] ++ ++ hidden_states, present_key_state, present_value_state = layer_outputs[:3] ++ ++ # We share the position biases between the layers - the first layer store them ++ # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), ++ # (cross-attention position bias), (cross-attention weights) ++ position_bias = layer_outputs[3] ++ if self.is_decoder and encoder_hidden_states is not None: ++ encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] ++ # append next layer key value states ++ if use_cache: ++ present_key_states = present_key_states + present_key_state ++ present_value_states = present_value_states + present_value_state ++ ++ if output_attentions: ++ all_attentions = all_attentions + (layer_outputs[3],) ++ if self.is_decoder: ++ all_cross_attentions = all_cross_attentions + (layer_outputs[5],) ++ ++ # Model Parallel: If it's the last layer for that device, put things on the next device ++ if self.model_parallel: ++ for k, v in self.device_map.items(): ++ if i == v[-1] and "cuda:" + str(k) != self.last_device: ++ hidden_states = hidden_states.to("cuda:" + str(k + 1)) ++ ++ hidden_states = self.final_layer_norm(hidden_states) ++ hidden_states = self.dropout(hidden_states).half() ++ ++ # Add last layer ++ if output_hidden_states: ++ all_hidden_states = all_hidden_states + (hidden_states,) ++ ++ if not return_dict: ++ return tuple( ++ v ++ for 
v in [ ++ hidden_states, ++ present_key_value_states, ++ all_hidden_states, ++ all_attentions, ++ all_cross_attentions, ++ ] ++ if v is not None ++ ) ++ # present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None ++ if not self.is_decoder: ++ cross_keys = None ++ cross_values = None ++ if self.encodecrosskey: ++ cross_keys = self.encodecrosskey(hidden_states) ++ if self.encodecrossvalue: ++ cross_values = self.encodecrossvalue(hidden_states) ++ return tuple((hidden_states, cross_keys, cross_values)) + + + MT5_START_DOCSTRING = r""" +@@ -1549,6 +1957,39 @@ class MT5Model(MT5PreTrainedModel): + ) + + ++class EncoderToCrossKey(nn.Module): ++ def __init__(self, cross_key, num_heads, d_kv): ++ super().__init__() ++ self.cross_key = cross_key ++ self.num_heads = num_heads ++ self.d_kv = d_kv ++ ++ ++ def forward(self, hidden_states): ++ batch_size = hidden_states.shape[0] ++ past_cross_keys = () ++ for i in range(len(self.cross_key)): ++ past_cross_keys += (self.cross_key[i](hidden_states),) ++ # import pdb ++ # pdb.set_trace() ++ return past_cross_keys ++ ++ ++class EncoderToCrossValue(nn.Module): ++ def __init__(self, cross_value, num_heads, d_kv): ++ super().__init__() ++ self.cross_value = cross_value ++ self.num_heads = num_heads ++ self.d_kv = d_kv ++ ++ ++ def forward(self, hidden_states): ++ batch_size = hidden_states.shape[0] ++ past_cross_values = () ++ for i in range(len(self.cross_value)): ++ past_cross_values += (self.cross_value[i](hidden_states),) ++ return past_cross_values ++ + @add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING) + class MT5ForConditionalGeneration(MT5PreTrainedModel): + r""" +@@ -1573,33 +2014,52 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + + # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5 +- def __init__(self, config: MT5Config): ++ def __init__(self, config: MT5Config, encoder_path=None, decoder_path=None, device_id=0): + super().__init__(config) +- self.model_dim = config.d_model +- +- self.shared = nn.Embedding(config.vocab_size, config.d_model) +- +- encoder_config = copy.deepcopy(config) +- encoder_config.is_decoder = False +- encoder_config.use_cache = False +- encoder_config.is_encoder_decoder = False +- self.encoder = MT5Stack(encoder_config, self.shared) +- +- decoder_config = copy.deepcopy(config) +- decoder_config.is_decoder = True +- decoder_config.is_encoder_decoder = False +- decoder_config.num_layers = config.num_decoder_layers +- self.decoder = MT5Stack(decoder_config, self.shared) +- +- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) ++ self.encoder_path = encoder_path ++ self.decoder_path = decoder_path ++ self.is_mindie = False ++ if not self.encoder_path or not self.decoder_path: ++ self.model_dim = config.d_model ++ ++ self.shared = nn.Embedding(config.vocab_size, config.d_model) ++ ++ decoder_config = copy.deepcopy(config) ++ decoder_config.is_decoder = True ++ decoder_config.is_encoder_decoder = False ++ decoder_config.num_layers = config.num_decoder_layers ++ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) ++ self.decoder = MT5Stack(decoder_config, self.shared, self.lm_head) ++ cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in 
range(config.num_decoder_layers)) ++ cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) ++ encodecrosskey = EncoderToCrossKey(cross_key, config.num_heads, config.d_kv) ++ encodecrossvalue = EncoderToCrossValue(cross_value, config.num_heads, config.d_kv) ++ encoder_config = copy.deepcopy(config) ++ encoder_config.is_decoder = False ++ encoder_config.use_cache = False ++ encoder_config.is_encoder_decoder = False ++ self.encoder = MT5Stack_Encoder(encoder_config, self.shared, encodecrosskey=encodecrosskey, encodecrossvalue=encodecrossvalue) ++ self.encoder_mindie = None ++ self.decoder_mindie = None ++ if self.encoder_path: ++ self.encoder_mindie = torch.jit.load(self.encoder_path) ++ self.is_mindie = True ++ if self.decoder_path: ++ self.decoder_mindie = torch.jit.load(self.decoder_path) ++ self.stream = torch.npu.Stream(f"npu:{device_id}") ++ self.device_id = device_id + + # Initialize weights and apply final processing +- self.post_init() ++ if not self.is_mindie: ++ self.post_init() + + # Model parallel + self.model_parallel = False + self.device_map = None + ++ def get_device(self): ++ return f"npu:{self.device_id}" ++ + @add_start_docstrings(PARALLELIZE_DOCSTRING) + # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.parallelize + def parallelize(self, device_map=None): +@@ -1666,25 +2126,7 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): + @add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward with T5->MT5, t5->mt5 +- def forward( +- self, +- input_ids: Optional[torch.LongTensor] = None, +- attention_mask: Optional[torch.FloatTensor] = None, +- decoder_input_ids: Optional[torch.LongTensor] = None, +- decoder_attention_mask: Optional[torch.BoolTensor] = None, +- head_mask: Optional[torch.FloatTensor] = None, +- decoder_head_mask: Optional[torch.FloatTensor] = None, +- cross_attn_head_mask: Optional[torch.Tensor] = None, +- encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- inputs_embeds: Optional[torch.FloatTensor] = None, +- decoder_inputs_embeds: Optional[torch.FloatTensor] = None, +- labels: Optional[torch.LongTensor] = None, +- use_cache: Optional[bool] = None, +- output_attentions: Optional[bool] = None, +- output_hidden_states: Optional[bool] = None, +- return_dict: Optional[bool] = None, +- ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: ++ def forward(self,*args) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., +@@ -1716,114 +2158,37 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): + >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) + >>> # studies have shown that owning a dog is good for you. 
+ ```""" +- use_cache = use_cache if use_cache is not None else self.config.use_cache +- return_dict = return_dict if return_dict is not None else self.config.use_return_dict +- +- # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +- if head_mask is not None and decoder_head_mask is None: +- if self.config.num_layers == self.config.num_decoder_layers: +- warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) +- decoder_head_mask = head_mask +- +- # Encode if needed (training, first prediction pass) +- if encoder_outputs is None: +- # Convert encoder inputs in embeddings if needed +- encoder_outputs = self.encoder( +- input_ids=input_ids, +- attention_mask=attention_mask, +- inputs_embeds=inputs_embeds, +- head_mask=head_mask, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): +- encoder_outputs = BaseModelOutput( +- last_hidden_state=encoder_outputs[0], +- hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, +- attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, +- ) +- +- hidden_states = encoder_outputs[0] +- +- if self.model_parallel: +- torch.cuda.set_device(self.decoder.first_device) +- +- if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: +- # get decoder inputs from shifting lm labels to the right +- decoder_input_ids = self._shift_right(labels) +- +- # Set device for model parallelism +- if self.model_parallel: +- torch.cuda.set_device(self.decoder.first_device) +- hidden_states = hidden_states.to(self.decoder.first_device) +- if decoder_input_ids is not None: +- decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) +- if attention_mask is not None: +- attention_mask = attention_mask.to(self.decoder.first_device) +- if decoder_attention_mask is not None: +- decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) +- +- # Decode +- decoder_outputs = self.decoder( +- input_ids=decoder_input_ids, +- attention_mask=decoder_attention_mask, +- inputs_embeds=decoder_inputs_embeds, +- past_key_values=past_key_values, +- encoder_hidden_states=hidden_states, +- encoder_attention_mask=attention_mask, +- head_mask=decoder_head_mask, +- cross_attn_head_mask=cross_attn_head_mask, +- use_cache=use_cache, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- +- sequence_output = decoder_outputs[0] +- +- # Set device for model parallelism +- if self.model_parallel: +- torch.cuda.set_device(self.encoder.first_device) +- self.lm_head = self.lm_head.to(self.encoder.first_device) +- sequence_output = sequence_output.to(self.lm_head.weight.device) +- +- if self.config.tie_word_embeddings: +- # Rescale output before projecting on vocab +- # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 +- sequence_output = sequence_output * (self.model_dim**-0.5) +- +- lm_logits = self.lm_head(sequence_output) ++ if self.is_mindie: ++ with torch.npu.stream(self.stream): # set stream ++ decoder_outputs = self.decoder_mindie.forward(*args) ++ self.stream.synchronize() # synchronize ++ else: ++ hidden_states = args[0] ++ past_cross_keys = args[1:self.config.num_decoder_layers+1] ++ past_cross_values = args[self.config.num_decoder_layers+1:2*self.config.num_decoder_layers+1] ++ past_keys= 
args[2*self.config.num_decoder_layers+1:3*self.config.num_decoder_layers+1] ++ past_values= args[3*self.config.num_decoder_layers+1:4*self.config.num_decoder_layers+1] ++ encoder_attention_mask = args[-2] ++ decoder_input_ids = args[-1] ++ decoder_outputs = self.decoder(input_ids=decoder_input_ids, ++ encoder_hidden_states=hidden_states, ++ past_keys=past_keys, ++ past_values=past_values, ++ past_cross_keys=past_cross_keys, ++ past_cross_values=past_cross_values, ++ encoder_attention_mask=encoder_attention_mask) ++ + + loss = None +- if labels is not None: +- loss_fct = CrossEntropyLoss(ignore_index=-100) +- # move labels to correct device to enable PP +- labels = labels.to(lm_logits.device) +- loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) +- # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 +- +- if not return_dict: +- output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs +- return ((loss,) + output) if loss is not None else output +- +- return Seq2SeqLMOutput( +- loss=loss, +- logits=lm_logits, +- past_key_values=decoder_outputs.past_key_values, +- decoder_hidden_states=decoder_outputs.hidden_states, +- decoder_attentions=decoder_outputs.attentions, +- cross_attentions=decoder_outputs.cross_attentions, +- encoder_last_hidden_state=encoder_outputs.last_hidden_state, +- encoder_hidden_states=encoder_outputs.hidden_states, +- encoder_attentions=encoder_outputs.attentions, +- ) ++ return (decoder_outputs[0],decoder_outputs[1],decoder_outputs[2]) + +- # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation + def prepare_inputs_for_generation( + self, + input_ids, +- past_key_values=None, ++ past_cross_keys=None, ++ past_cross_values=None, ++ past_keys=None, ++ past_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, +@@ -1834,8 +2199,8 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): + **kwargs, + ): + # cut decoder_input_ids if past_key_values is used +- if past_key_values is not None: +- past_length = past_key_values[0][0].shape[2] ++ if past_keys is not None: ++ past_length = past_keys[0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: +@@ -1848,7 +2213,10 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): + + return { + "decoder_input_ids": input_ids, +- "past_key_values": past_key_values, ++ "past_cross_keys":past_cross_keys, ++ "past_cross_values":past_cross_values, ++ "past_keys":past_keys, ++ "past_values":past_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, +@@ -1893,6 +2261,419 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): + reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) + return reordered_decoder_past + ++ def _prepare_encoder_decoder_kwargs_for_generation( ++ self, ++ inputs_tensor: torch.Tensor, ++ model_kwargs, ++ model_input_name, ++ generation_config, ++ ): ++ irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"] ++ encoder_kwargs = { ++ argument: value ++ for argument, value in model_kwargs.items() ++ if not any(argument.startswith(p) for p in irrelevant_prefix) ++ } ++ encoder_kwargs["output_attentions"] = generation_config.output_attentions ++ encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states ++ model_input_name = model_input_name if model_input_name is not 
None else self.main_input_name ++ encoder_kwargs["return_dict"] = True ++ encoder_kwargs[model_input_name] = inputs_tensor ++ import time ++ start_time = time.time() ++ with torch.npu.stream(self.stream): # set stream ++ encoder_outputs=self.encoder_mindie.forward(encoder_kwargs["input_ids"]) ++ self.stream.synchronize() # synchronize ++ model_kwargs["encoder_outputs"]={"last_hidden_state":encoder_outputs[0]} ++ model_kwargs["past_cross_keys"] = encoder_outputs[1] ++ model_kwargs["past_cross_values"] =encoder_outputs[2] ++ return model_kwargs ++ ++ def _update_model_kwargs_for_generation( ++ self, ++ outputs, ++ model_kwargs, ++ is_encoder_decoder = False, ++ standardize_cache_format = False, ++ num_new_tokens = 1, ++ ): ++ # update past_key_values keeping its naming used in model code ++ cache_name, cache = self._extract_past_from_model_output( ++ outputs, standardize_cache_format=standardize_cache_format ++ ) ++ model_kwargs[cache_name] = cache ++ if "past_keys" in outputs: ++ past_keys = outputs.past_keys ++ model_kwargs["past_keys"] = past_keys ++ if "past_values" in outputs: ++ past_values = outputs.past_values ++ model_kwargs["past_values"] = past_values ++ # update decoder attention mask ++ if "decoder_attention_mask" in model_kwargs: ++ decoder_attention_mask = model_kwargs["decoder_attention_mask"] ++ model_kwargs["decoder_attention_mask"] = torch.cat( ++ [decoder_attention_mask, decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1))], ++ dim=-1, ++ ) ++ return model_kwargs ++ ++ @torch.no_grad() ++ def generate( ++ self, ++ inputs = None, ++ generation_config = None, ++ logits_processor = None, ++ stopping_criteria = None, ++ prefix_allowed_tokens_fn = None, ++ assistant_model = None, ++ negative_prompt_ids = None, ++ negative_prompt_attention_mask = None, ++ **kwargs, ++ ): ++ # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call ++ import time ++ start_time = time.time() ++ self._validate_model_class() ++ tokenizer = kwargs.pop("tokenizer", None) # Pull this out first, we only use it for stopping criteria ++ generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs) ++ self._validate_model_kwargs(model_kwargs.copy()) ++ ++ ++ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() ++ stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() ++ ++ accepts_attention_mask = True ++ requires_attention_mask = "encoder_outputs" not in model_kwargs ++ kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None ++ ++ # 3. Define model inputs ++ inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs( ++ inputs, generation_config.bos_token_id, model_kwargs ++ ) ++ batch_size = inputs_tensor.shape[0] ++ ++ device = inputs_tensor.device ++ self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=device) ++ ++ # 4. 
Define other model kwargs ++ # decoder-only models with inputs_embeds forwarding must use caching (otherwise we can't detect whether we are ++ # generating the first new token or not, and we only want to use the embeddings for the first new token) ++ if not self.config.is_encoder_decoder and model_input_name == "inputs_embeds": ++ model_kwargs["use_cache"] = True ++ else: ++ model_kwargs["use_cache"] = generation_config.use_cache ++ if not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask: ++ model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( ++ inputs_tensor, generation_config.pad_token_id, generation_config.eos_token_id ++ ) ++ ++ if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs: ++ # if model is encoder decoder encoder_outputs are created and added to `model_kwargs` ++ model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation( ++ inputs_tensor, model_kwargs, model_input_name, generation_config ++ ) ++ ++ # 5. Prepare `input_ids` which will be used for auto-regressive generation ++ if self.config.is_encoder_decoder: ++ input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation( ++ batch_size=batch_size, ++ model_input_name=model_input_name, ++ model_kwargs=model_kwargs, ++ decoder_start_token_id=generation_config.decoder_start_token_id, ++ device=inputs_tensor.device, ++ ) ++ else: ++ input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids") ++ ++ if generation_config.token_healing: ++ input_ids = self.heal_tokens(input_ids, tokenizer) ++ ++ # 6. Prepare `max_length` depending on other stopping criteria. ++ input_ids_length = input_ids.shape[-1] ++ has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None ++ has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None ++ generation_config = self._prepare_generated_length( ++ generation_config=generation_config, ++ has_default_max_length=has_default_max_length, ++ has_default_min_length=has_default_min_length, ++ model_input_name=model_input_name, ++ inputs_tensor=inputs_tensor, ++ input_ids_length=input_ids_length, ++ ) ++ ++ use_dynamic_cache_by_default = False ++ if generation_config.cache_implementation is not None and model_kwargs.get("past_key_values") is not None: ++ raise ValueError( ++ "Passing both `cache_implementation` (used to initialize certain caches) and `past_key_values` (a " ++ "Cache object) is unsupported. Please use only one of the two." ++ ) ++ elif generation_config.cache_implementation is not None: ++ if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING: ++ if generation_config.cache_implementation == "static" and not self._supports_static_cache: ++ raise ValueError( ++ "This model does not support `cache_implementation='static'`. Please check the following " ++ "issue: https://github.com/huggingface/transformers/issues/28981" ++ ) ++ model_kwargs["past_key_values"] = self._get_cache( ++ generation_config.cache_implementation, ++ getattr(generation_config, "num_beams", 1) * batch_size, ++ generation_config.max_length, ++ ) ++ elif generation_config.cache_implementation == "quantized": ++ if not self._supports_quantized_cache: ++ raise ValueError( ++ "This model does not support the quantized cache. If you want your model to support quantized " ++ "cache, please open an issue." 
++ ) ++ ++ cache_config = ( ++ generation_config.cache_config ++ if generation_config.cache_config is not None ++ else QuantizedCacheConfig() ++ ) ++ cache_class = QUANT_BACKEND_CLASSES_MAPPING[cache_config.backend] ++ ++ if cache_config.backend == "quanto" and not is_quanto_available(): ++ raise ImportError( ++ "You need to install `quanto` in order to use KV cache quantization with quanto backend. " ++ "Please install it via with `pip install quanto`" ++ ) ++ elif cache_config.backend == "HQQ" and not is_hqq_available(): ++ raise ImportError( ++ "You need to install `HQQ` in order to use KV cache quantization with HQQ backend. " ++ "Please install it via with `pip install hqq`" ++ ) ++ ++ model_kwargs["past_key_values"] = cache_class(cache_config) ++ # Use DynamicCache() instance by default. This will avoid back and forth from legacy format that ++ # keeps copying the cache thus using much more memory ++ elif generation_config.cache_implementation is None and self._supports_default_dynamic_cache(): ++ past = model_kwargs.get("past_key_values", None) ++ if past is None: ++ model_kwargs["past_key_values"] = DynamicCache() ++ use_dynamic_cache_by_default = True ++ elif isinstance(past, tuple): ++ model_kwargs["past_key_values"] = DynamicCache.from_legacy_cache(past) ++ use_dynamic_cache_by_default = True ++ ++ self._validate_generated_length(generation_config, input_ids_length, has_default_max_length) ++ ++ # 7. determine generation mode ++ generation_mode = generation_config.get_generation_mode(assistant_model) ++ # 8. prepare distribution pre_processing samplers ++ prepared_logits_processor = self._get_logits_processor( ++ generation_config=generation_config, ++ input_ids_seq_length=input_ids_length, ++ encoder_input_ids=inputs_tensor, ++ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, ++ logits_processor=logits_processor, ++ device=inputs_tensor.device, ++ model_kwargs=model_kwargs, ++ negative_prompt_ids=negative_prompt_ids, ++ negative_prompt_attention_mask=negative_prompt_attention_mask, ++ ) ++ ++ # 9. prepare stopping criteria ++ prepared_stopping_criteria = self._get_stopping_criteria( ++ generation_config=generation_config, stopping_criteria=stopping_criteria, tokenizer=tokenizer, **kwargs ++ ) ++ ++ if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): ++ # 11. prepare logits warper ++ prepared_logits_warper = ( ++ self._get_logits_warper(generation_config, device=input_ids.device) ++ if generation_config.do_sample ++ else None ++ ) ++ ++ # 12. expand input_ids with `num_return_sequences` additional sequences per batch ++ input_ids, model_kwargs = self._expand_inputs_for_generation( ++ input_ids=input_ids, ++ expand_size=generation_config.num_return_sequences, ++ is_encoder_decoder=self.config.is_encoder_decoder, ++ **model_kwargs, ++ ) ++ # 13. 
run sample (it degenerates to greedy search when `generation_config.do_sample=False`) ++ result = self._sample( ++ input_ids, ++ logits_processor=prepared_logits_processor, ++ logits_warper=prepared_logits_warper, ++ stopping_criteria=prepared_stopping_criteria, ++ generation_config=generation_config, ++ **model_kwargs, ++ ) ++ return result ++ ++ def _sample( ++ self, ++ input_ids, ++ logits_processor, ++ stopping_criteria, ++ generation_config, ++ logits_warper = None, ++ **model_kwargs, ++ ): ++ # init values ++ pad_token_id = generation_config.pad_token_id ++ output_attentions = generation_config.output_attentions ++ output_hidden_states = generation_config.output_hidden_states ++ output_scores = generation_config.output_scores ++ output_logits = generation_config.output_logits ++ return_dict_in_generate = generation_config.return_dict_in_generate ++ has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria) ++ do_sample = generation_config.do_sample ++ if do_sample is True and not isinstance(logits_warper, LogitsProcessorList): ++ raise ValueError( ++ "`do_sample` is set to `True`, `logits_warper` must be a `LogitsProcessorList` instance (it is " ++ f"{logits_warper})." ++ ) ++ ++ # init attention / hidden states / scores tuples ++ scores = () if (return_dict_in_generate and output_scores) else None ++ raw_logits = () if (return_dict_in_generate and output_logits) else None ++ decoder_attentions = () if (return_dict_in_generate and output_attentions) else None ++ cross_attentions = () if (return_dict_in_generate and output_attentions) else None ++ decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None ++ ++ # if model is an encoder-decoder, retrieve encoder attention weights and hidden states ++ if return_dict_in_generate and self.config.is_encoder_decoder: ++ encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None ++ encoder_hidden_states = ( ++ model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None ++ ) ++ ++ this_peer_finished = False ++ batch_size = input_ids.shape[0] ++ unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) ++ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) ++ ++ # keep track of which sequences are already finished ++ if self.is_mindie or self.config.architectures[0]=="MT5ForConditionalGeneration": ++ num_layers = self.config.num_layers ++ num_heads = self.config.num_heads ++ d_kv = self.config.d_kv ++ model_kwargs["past_keys"] = [torch.randn(batch_size, num_heads, 0, d_kv).half().npu() for _ in range(num_layers)] ++ model_kwargs["past_values"] = [torch.randn(batch_size, num_heads, 0, d_kv).half().npu() for _ in range(num_layers)] ++ ++ ++ while self._has_unfinished_sequences(this_peer_finished, False, device=input_ids.device): ++ # prepare model inputs ++ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) ++ model_args = [model_kwargs["encoder_outputs"]["last_hidden_state"]] ++ model_args.extend(model_kwargs["past_cross_keys"]) ++ model_args.extend(model_kwargs["past_cross_values"]) ++ model_args.extend(model_inputs["past_keys"]) ++ model_args.extend(model_inputs["past_values"]) ++ model_args.append(model_inputs["attention_mask"]) ++ model_args.append(model_inputs["decoder_input_ids"]) ++ ++ # forward pass to get next token ++ outputs = self(*model_args) ++ outputs = Seq2SeqLMOutput(logits=outputs[0], ++ past_keys=outputs[1], 
++ past_values=outputs[2]) ++ ++ # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration ++ # (the clone itself is always small) ++ next_token_logits = outputs.logits[:, -1, :].clone() ++ ++ # pre-process distribution ++ next_token_scores = logits_processor(input_ids, next_token_logits) ++ if do_sample: ++ next_token_scores = logits_warper(input_ids, next_token_scores) ++ ++ # Store scores, attentions and hidden_states when required ++ if return_dict_in_generate: ++ if output_scores: ++ scores += (next_token_scores,) ++ if output_logits: ++ raw_logits += (next_token_logits,) ++ if output_attentions: ++ decoder_attentions += ( ++ (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) ++ ) ++ if self.config.is_encoder_decoder: ++ cross_attentions += (outputs.cross_attentions,) ++ ++ if output_hidden_states: ++ decoder_hidden_states += ( ++ (outputs.decoder_hidden_states,) ++ if self.config.is_encoder_decoder ++ else (outputs.hidden_states,) ++ ) ++ ++ # token selection ++ if do_sample: ++ probs = nn.functional.softmax(next_token_scores, dim=-1) ++ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) ++ else: ++ next_tokens = torch.argmax(next_token_scores, dim=-1) ++ ++ # finished sentences should have their next token be a padding token ++ if has_eos_stopping_criteria: ++ next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) ++ ++ # update generated ids, model inputs, and length for next step ++ input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) ++ model_kwargs = self._update_model_kwargs_for_generation( ++ outputs, ++ model_kwargs, ++ is_encoder_decoder=self.config.is_encoder_decoder, ++ ) ++ unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) ++ this_peer_finished = unfinished_sequences.max() == 0 ++ # This is needed to properly delete outputs.logits which may be very large for first iteration ++ # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration ++ del outputs ++ return input_ids ++ ++ def invert_attention_mask(self, encoder_attention_mask): ++ if encoder_attention_mask.dim() == 3: ++ encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] ++ if encoder_attention_mask.dim() == 2: ++ encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] ++ encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility ++ ++ encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 ++ ++ return encoder_extended_attention_mask ++ ++ @property ++ def device(self) -> torch.device: ++ """ ++ `torch.device`: The device on which the module is (assuming that all the module parameters are on the same ++ device). 
++        """
++        return self.get_device()
++
++    def get_extended_attention_mask(
++        self, attention_mask, input_shape, device=None, dtype=None
++    ):
++        if dtype is None:
++            dtype = self.dtype
++
++        if not (attention_mask.dim() == 2 and self.config.is_decoder):
++            if device is not None:
++                warnings.warn(
++                    "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
++                )
++        if attention_mask.dim() == 3:
++            extended_attention_mask = attention_mask[:, None, :, :]
++        elif attention_mask.dim() == 2:
++            if self.config.is_decoder:
++                extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder(
++                    input_shape, attention_mask, device
++                )
++            else:
++                extended_attention_mask = attention_mask[:, None, None, :]
++        else:
++            raise ValueError(
++                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
++            )
++        extended_attention_mask = extended_attention_mask.to(dtype=dtype)  # fp16 compatibility
++        extended_attention_mask = (1.0 - extended_attention_mask) * -1000
++        return extended_attention_mask
++
+
+ @add_start_docstrings(
+     "The bare MT5 Model transformer outputting encoder's raw hidden-states without any specific head on top.",
-- 
Gitee

From 516533d1fc741d8d03c2300df62b2a6cd0e0e155 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Wed, 11 Sep 2024 11:35:07 +0000
Subject: [PATCH 066/110] add MindIE/MindIE-Torch/built-in/MT5.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 甄文奇
---
 MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py

diff --git a/MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py b/MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py
new file mode 100644
index 0000000000..e69de29bb2
-- 
Gitee

From 8a3a411bd493f5800c919891bd7178a9b853a8d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Wed, 11 Sep 2024 11:36:01 +0000
Subject: [PATCH 067/110] update MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 甄文奇
---
 .../built-in/MT5/MT5_modeling_patch.py        | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py b/MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py
index e69de29bb2..35a6ec8613 100644
--- a/MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py
+++ b/MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py
@@ -0,0 +1,28 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import transformers
+
+
+def main():
+    transformers_path = transformers.__path__
+    transformers_version = transformers.__version__
+
+    assert transformers_version == '4.42.0', "transformers==4.42.0 is required"
+    os.system(f'patch -p0 {transformers_path[0]}/models/mt5/modeling_mt5.py modeling_mt5.patch')
+
+
+if __name__ == '__main__':
+    main()
-- 
Gitee

From e115bbf46c896f60a881abe45a3decc58bee355a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Wed, 11 Sep 2024 11:36:10 +0000
Subject: [PATCH 068/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?=
 =?UTF-8?q?MindIE/MindIE-Torch/built-in/MT5/.keep?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 MindIE/MindIE-Torch/built-in/MT5/.keep | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 MindIE/MindIE-Torch/built-in/MT5/.keep

diff --git a/MindIE/MindIE-Torch/built-in/MT5/.keep b/MindIE/MindIE-Torch/built-in/MT5/.keep
deleted file mode 100644
index e69de29bb2..0000000000
-- 
Gitee

From ba3650fe72702dd5aeac61d37d3363b75f7c201f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Wed, 11 Sep 2024 12:02:39 +0000
Subject: [PATCH 069/110] add MindIE/MindIE-Torch/built-in/MT5/readme.md.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 甄文奇
---
 MindIE/MindIE-Torch/built-in/MT5/readme.md | 95 ++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 MindIE/MindIE-Torch/built-in/MT5/readme.md

diff --git a/MindIE/MindIE-Torch/built-in/MT5/readme.md b/MindIE/MindIE-Torch/built-in/MT5/readme.md
new file mode 100644
index 0000000000..3ffa911ed6
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/MT5/readme.md
@@ -0,0 +1,95 @@
+# MT5 Model - Inference Guide
+
+
+- [Overview](#ZH-CN_TOPIC_0000001172161501)
+
+  - [Input and Output Data](#section540883920406)
+
+- [Inference Environment Setup](#ZH-CN_TOPIC_0000001126281702)
+
+- [Quick Start](#ZH-CN_TOPIC_0000001126281700)
+
+  - [Model Inference](#section741711594517)
+
+
+
+# Overview
+
+   T5 (Text-to-Text Transfer Transformer) is a model architecture, or rather a paradigm for solving NLP tasks: every task, such as classification, similarity computation, or text generation, is handled within a single text-to-text framework.
+
+
+## Input and Output Data
+
+- Input data
+
+  | Input    | Shape    | Data Type | Data Format |
+  | -------- | -------- | -------- | ------------ |
+  | input    | batchsize x input_seq_len | FLOAT16 |  NHWC |
+
+
+- Output data
+
+  | Output   | Shape    | Data Type | Data Format |
+  | -------- | -------- | -------- | ------------ |
+  | output  | batchsize x input_seq_len | INT32 | NTHWC |
+
+
+# Inference Environment Setup
+
+- The model requires the following plugins and drivers
+
+  **Table 1** Version compatibility
+-
+  | Dependency | Version | Notes |
+  | ------------------------------------------------------------ |--------| ------------------------------------------------------------ |
+  | Python | 3.10.2 | - |
+  | torch | 2.1.0 | version required for exporting the pt model |
+  | torch_npu | 2.1.0 | version required for model compilation and inference |
+
+
+# Quick Start
+
+
+1. Install transformers 4.42.0.
+   ```bash
+   pip3 install transformers==4.42.0
+   ```
+
+2. Install the MindIE package. It must be used together with torch_npu; configure the environment according to the mindietorch / torch_npu compatibility documentation.
+
+   ```bash
+   # install mindie
+   chmod +x ./Ascend-mindie_xxx.run
+   ./Ascend-mindie_xxx.run --install
+   source /usr/local/Ascend/mindie/set_env.sh
+   ```
+
+3. Modify the code. In the MT5 directory,
+
+   run the following command:
+   ```bash
+   python MT5_modeling_patch.py
+   ```
+4. Export the MindIE-Torch models
+   ```bash
+   python export_mt5.py --output_dir {output_path} --model_path {model_path} --max_batchsize {max_batchsize} --max_input_seq_len {max_input_seq_len} --device_id {device_id}
+   ```
+Parameter description:
+{output_path}: output directory
+{model_path}: directory containing the model
+{max_batchsize}: maximum batch size during inference
+{max_input_seq_len}: maximum input sequence length during inference
+{device_id}: which NPU device to use
+
+Running this command automatically generates the optimized encoder and decoder models.
+
+5. Accuracy test
+ ```bash
+python test_mt5.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path} --decoder_aie_path {decoder_aie_path} --device_id device_id
+```
+
+Parameter description:
+{model_path}: directory containing the model
+{encoder_aie_path}: path to the optimized encoder model, down to the .pt file
+{decoder_aie_path}: path to the optimized decoder model, down to the .pt file
+{device_id}: which NPU device to use
\ No newline at end of file
-- 
Gitee

From 4c4157b5b0868d0901e668e84466070e31f906b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Wed, 11 Sep 2024 12:03:02 +0000
Subject: [PATCH 070/110] update MindIE/MindIE-Torch/built-in/MT5/test_mt5.py.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 甄文奇
---
 MindIE/MindIE-Torch/built-in/MT5/test_mt5.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/MindIE/MindIE-Torch/built-in/MT5/test_mt5.py b/MindIE/MindIE-Torch/built-in/MT5/test_mt5.py
index c73905875e..92717df66f 100644
--- a/MindIE/MindIE-Torch/built-in/MT5/test_mt5.py
+++ b/MindIE/MindIE-Torch/built-in/MT5/test_mt5.py
@@ -37,8 +37,6 @@ def main():
     with torch.npu.stream(mindie_stream): # set stream
         mindie_encoder_output = encoder_mindie(encoder_input)[0]
     mindie_stream.synchronize() # synchronize
-    import pdb
-    pdb.set_trace()
     if (torch.cosine_similarity(encoder_output.cpu().flatten(), mindie_encoder_output.cpu().flatten(),dim=0)) < 0.99:
         print("encoder precision failed")
     else:
-- 
Gitee

From 6fc8a5686c1bf764cd4d25103d66f661ecab8676 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Wed, 11 Sep 2024 12:08:14 +0000
Subject: [PATCH 071/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 甄文奇
---
 MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch
index 95d0455bf5..d0c6a08f48 100644
--- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch
+++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch
@@ -476,7 +476,6 @@ index 224769fdf..6af548437 100644
 +        encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
 +
 +        encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000
-+        print("encoder_extended_attention_mask=",encoder_extended_attention_mask)
 +
 +        return encoder_extended_attention_mask
 +
-- 
Gitee

From dbcbf54c8418ddb2cac75d4e309b71ebaa801dbc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Wed, 11 Sep 2024 12:37:35 +0000
Subject: [PATCH 072/110] update MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch.
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/MT5/modeling_mt5.patch | 280 ++++++++++++------ 1 file changed, 182 insertions(+), 98 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch b/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch index 0df148b2ea..95d0455bf5 100644 --- a/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch +++ b/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch @@ -1,7 +1,7 @@ -diff --git a/modeling_mt5_origin.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py -index 1336b9196..5b94d69c7 100644 ---- a/modeling_mt5_origin.py -+++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py +diff --git a/modeling_t5.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py +index 224769fdf..6af548437 100644 +--- a/modeling_t5.py ++++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py @@ -19,22 +19,26 @@ import math import os import warnings @@ -29,13 +29,13 @@ index 1336b9196..5b94d69c7 100644 ) -from ...modeling_utils import PreTrainedModel +from ...modeling_utils import PreTrainedModel,ModuleUtilsMixin - from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer + from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( DUMMY_INPUTS, @@ -47,8 +51,44 @@ from ...utils import ( ) from ...utils.model_parallel_utils import assert_device_map, get_device_map - from .configuration_mt5 import MT5Config + from .configuration_t5 import T5Config +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList +from transformers.generation.configuration_utils import GenerationMode @@ -76,8 +76,8 @@ index 1336b9196..5b94d69c7 100644 + logger = logging.get_logger(__name__) - _CONFIG_FOR_DOC = "MT5Config" -@@ -323,7 +363,10 @@ class MT5Attention(nn.Module): + _CONFIG_FOR_DOC = "T5Config" +@@ -448,7 +488,10 @@ class T5Attention(nn.Module): mask=None, key_value_states=None, position_bias=None, @@ -89,7 +89,7 @@ index 1336b9196..5b94d69c7 100644 layer_head_mask=None, query_length=None, use_cache=False, -@@ -339,17 +382,15 @@ class MT5Attention(nn.Module): +@@ -464,12 +507,8 @@ class T5Attention(nn.Module): real_seq_length = seq_length @@ -104,14 +104,7 @@ index 1336b9196..5b94d69c7 100644 key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] - def shape(states): - """projection""" -+ # import pdb -+ # pdb.set_trace() - return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - - def unshape(states): -@@ -368,16 +409,17 @@ class MT5Attention(nn.Module): +@@ -493,16 +532,17 @@ class T5Attention(nn.Module): hidden_states = shape(proj_layer(key_value_states)) if past_key_value is not None: @@ -135,7 +128,7 @@ index 1336b9196..5b94d69c7 100644 else: # cross-attn hidden_states = past_key_value -@@ -388,10 +430,10 @@ class MT5Attention(nn.Module): +@@ -513,17 +553,16 @@ class T5Attention(nn.Module): # get key/value states key_states = project( @@ -146,9 +139,17 @@ index 1336b9196..5b94d69c7 100644 - hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None + hidden_states, self.v, key_value_states, past_value if past_value is not None else None ) - +- 
++ # torch.ops.mindie.flash_attention_plugin(query_states, key_states, value_states,) # compute scores -@@ -411,7 +453,7 @@ class MT5Attention(nn.Module): + scores = torch.matmul( + query_states, key_states.transpose(3, 2) + ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 +- + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = torch.zeros( +@@ -536,7 +575,7 @@ class T5Attention(nn.Module): # if key and values are already calculated # we want only the last query position bias @@ -157,25 +158,32 @@ index 1336b9196..5b94d69c7 100644 position_bias = position_bias[:, :, -hidden_states.size(1) :, :] if mask is not None: -@@ -439,14 +481,124 @@ class MT5Attention(nn.Module): +@@ -548,7 +587,6 @@ class T5Attention(nn.Module): + position_bias_masked = position_bias[:, mask.bool()] + else: + position_bias_masked = position_bias +- + scores += position_bias_masked + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( + scores +@@ -564,18 +602,131 @@ class T5Attention(nn.Module): attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) attn_output = self.o(attn_output) - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- + # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None + present_value_state = (value_states.half(),) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) + - if output_attentions: - outputs = outputs + (attn_weights,) - return outputs - - -+class MT5SelfAttention(MT5Attention): -+ def __init__(self, config: MT5Config, has_relative_attention_bias=False): ++ if output_attentions: ++ outputs = outputs + (attn_weights,) ++ return outputs ++ ++ ++class T5SelfAttention(T5Attention): ++ def __init__(self, config: T5Config, has_relative_attention_bias=False): + super().__init__(config, has_relative_attention_bias) + + def forward( @@ -273,19 +281,27 @@ index 1336b9196..5b94d69c7 100644 + + attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) -+ + + # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None + present_value_state = (value_states.half(), ) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) -+ if output_attentions: -+ outputs = outputs + (attn_weights,) -+ return outputs + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + ++ + - # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->MT5 - class MT5LayerSelfAttention(nn.Module): + class T5LayerSelfAttention(nn.Module): def __init__(self, config, has_relative_attention_bias=False): -@@ -461,7 +613,8 @@ class MT5LayerSelfAttention(nn.Module): + super().__init__() +- self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) ++ self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = 
T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + +@@ -585,7 +736,8 @@ class T5LayerSelfAttention(nn.Module): attention_mask=None, position_bias=None, layer_head_mask=None, @@ -295,7 +311,7 @@ index 1336b9196..5b94d69c7 100644 use_cache=False, output_attentions=False, ): -@@ -471,7 +624,8 @@ class MT5LayerSelfAttention(nn.Module): +@@ -595,7 +747,8 @@ class T5LayerSelfAttention(nn.Module): mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, @@ -305,7 +321,7 @@ index 1336b9196..5b94d69c7 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -495,7 +649,8 @@ class MT5LayerCrossAttention(nn.Module): +@@ -618,7 +771,8 @@ class T5LayerCrossAttention(nn.Module): attention_mask=None, position_bias=None, layer_head_mask=None, @@ -315,7 +331,7 @@ index 1336b9196..5b94d69c7 100644 use_cache=False, query_length=None, output_attentions=False, -@@ -507,7 +662,8 @@ class MT5LayerCrossAttention(nn.Module): +@@ -630,7 +784,8 @@ class T5LayerCrossAttention(nn.Module): key_value_states=key_value_states, position_bias=position_bias, layer_head_mask=layer_head_mask, @@ -325,7 +341,7 @@ index 1336b9196..5b94d69c7 100644 use_cache=use_cache, query_length=query_length, output_attentions=output_attentions, -@@ -539,39 +695,34 @@ class MT5Block(nn.Module): +@@ -661,39 +816,34 @@ class T5Block(nn.Module): encoder_decoder_position_bias=None, layer_head_mask=None, cross_attn_layer_head_mask=None, @@ -379,8 +395,11 @@ index 1336b9196..5b94d69c7 100644 # clamp inf values to enable fp16 training if hidden_states.dtype == torch.float16: -@@ -586,8 +737,8 @@ class MT5Block(nn.Module): +@@ -706,22 +856,23 @@ class T5Block(nn.Module): + + do_cross_attention = self.is_decoder and encoder_hidden_states is not None if do_cross_attention: ++ # the actual query length is unknown for cross attention # if using past key value states. 
Need to inject it here - if present_key_value_state is not None: @@ -389,8 +408,10 @@ index 1336b9196..5b94d69c7 100644 + query_length = present_key_state[0].shape[2] else: query_length = None - -@@ -597,7 +748,8 @@ class MT5Block(nn.Module): +- + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, @@ -398,9 +419,12 @@ index 1336b9196..5b94d69c7 100644 + past_key=cross_attn_past_key, + past_value=cross_attn_past_value, query_length=query_length, - use_cache=use_cache, +- use_cache=use_cache, ++ use_cache=use_cache, output_attentions=output_attentions, -@@ -614,11 +766,9 @@ class MT5Block(nn.Module): + ) + hidden_states = cross_attention_outputs[0] +@@ -736,11 +887,9 @@ class T5Block(nn.Module): hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) # Combine self attn and cross attn key value states @@ -414,7 +438,7 @@ index 1336b9196..5b94d69c7 100644 # Apply Feed Forward layer hidden_states = self.layer[-1](hidden_states) -@@ -635,7 +785,7 @@ class MT5Block(nn.Module): +@@ -757,7 +906,7 @@ class T5Block(nn.Module): outputs = (hidden_states,) if use_cache: @@ -423,10 +447,10 @@ index 1336b9196..5b94d69c7 100644 else: outputs = outputs + attention_outputs -@@ -884,11 +1034,15 @@ class MT5PreTrainedModel(PreTrainedModel): +@@ -897,11 +1046,15 @@ class T5PreTrainedModel(PreTrainedModel): - # Copied from transformers.models.t5.modeling_t5.T5Stack with T5->MT5 - class MT5Stack(MT5PreTrainedModel): + + class T5Stack(T5PreTrainedModel): - def __init__(self, config, embed_tokens=None): + def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskey=None, encodecrossvalue=None): super().__init__(config) @@ -439,8 +463,8 @@ index 1336b9196..5b94d69c7 100644 + self.model_dim = config.d_model self.block = nn.ModuleList( - [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -953,20 +1107,63 @@ class MT5Stack(MT5PreTrainedModel): + [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] +@@ -966,20 +1119,64 @@ class T5Stack(T5PreTrainedModel): def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings @@ -452,6 +476,7 @@ index 1336b9196..5b94d69c7 100644 + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 ++ print("encoder_extended_attention_mask=",encoder_extended_attention_mask) + + return encoder_extended_attention_mask + @@ -506,7 +531,7 @@ index 1336b9196..5b94d69c7 100644 ): # Model parallel if self.model_parallel: -@@ -985,8 +1182,10 @@ class MT5Stack(MT5PreTrainedModel): +@@ -998,8 +1195,10 @@ class T5Stack(T5PreTrainedModel): f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: @@ -517,7 +542,7 @@ index 1336b9196..5b94d69c7 100644 elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: -@@ -999,18 +1198,19 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1012,18 +1211,19 @@ class T5Stack(T5PreTrainedModel): inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = input_shape @@ -542,7 +567,7 @@ index 1336b9196..5b94d69c7 100644 if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length, 
device=inputs_embeds.device) -@@ -1041,7 +1241,8 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1054,7 +1254,8 @@ class T5Stack(T5PreTrainedModel): # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) @@ -552,7 +577,7 @@ index 1336b9196..5b94d69c7 100644 all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1049,8 +1250,8 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1062,8 +1263,8 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) @@ -563,7 +588,7 @@ index 1336b9196..5b94d69c7 100644 layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] # Model parallel -@@ -1099,7 +1300,10 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1112,7 +1313,10 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, @@ -575,7 +600,7 @@ index 1336b9196..5b94d69c7 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -1107,19 +1311,20 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1120,19 +1324,20 @@ class T5Stack(T5PreTrainedModel): # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: @@ -601,7 +626,7 @@ index 1336b9196..5b94d69c7 100644 if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) -@@ -1133,7 +1338,7 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1146,7 +1351,7 @@ class T5Stack(T5PreTrainedModel): hidden_states = hidden_states.to("cuda:" + str(k + 1)) hidden_states = self.final_layer_norm(hidden_states) @@ -610,7 +635,7 @@ index 1336b9196..5b94d69c7 100644 # Add last layer if output_hidden_states: -@@ -1151,13 +1356,216 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1164,13 +1369,216 @@ class T5Stack(T5PreTrainedModel): ] if v is not None ) @@ -636,7 +661,7 @@ index 1336b9196..5b94d69c7 100644 + return tuple((lm_logits, present_key_states, present_value_states)) + + -+class MT5Stack_Encoder(MT5PreTrainedModel): ++class T5Stack_Encoder(T5PreTrainedModel): + def __init__(self, config, embed_tokens=None, encodecrosskey=None, encodecrossvalue=None): + super().__init__(config) + self.embed_tokens = embed_tokens @@ -646,9 +671,9 @@ index 1336b9196..5b94d69c7 100644 + self.model_dim = config.d_model + + self.block = nn.ModuleList( -+ [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] ++ [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] ) -+ self.final_layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) ++ self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + # Initialize weights and apply final processing @@ -832,8 +857,8 @@ index 1336b9196..5b94d69c7 100644 + return tuple((hidden_states, cross_keys, cross_values)) - MT5_START_DOCSTRING = r""" -@@ -1549,6 +1957,39 @@ class MT5Model(MT5PreTrainedModel): + T5_START_DOCSTRING = r""" +@@ -1541,6 +1949,38 @@ class T5Model(T5PreTrainedModel): ) @@ -850,8 +875,6 @@ index 1336b9196..5b94d69c7 100644 + past_cross_keys = () 
+ for i in range(len(self.cross_key)): + past_cross_keys += (self.cross_key[i](hidden_states),) -+ # import pdb -+ # pdb.set_trace() + return past_cross_keys + + @@ -870,15 +893,16 @@ index 1336b9196..5b94d69c7 100644 + past_cross_values += (self.cross_value[i](hidden_states),) + return past_cross_values + - @add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING) - class MT5ForConditionalGeneration(MT5PreTrainedModel): - r""" -@@ -1573,33 +2014,52 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): ++ + @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) + class T5ForConditionalGeneration(T5PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [ +@@ -1548,28 +1988,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5 -- def __init__(self, config: MT5Config): -+ def __init__(self, config: MT5Config, encoder_path=None, decoder_path=None, device_id=0): +- def __init__(self, config: T5Config): ++ def __init__(self, config: T5Config, encoder_path=None, decoder_path=None, device_id=0): super().__init__(config) - self.model_dim = config.d_model - @@ -888,13 +912,13 @@ index 1336b9196..5b94d69c7 100644 - encoder_config.is_decoder = False - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False -- self.encoder = MT5Stack(encoder_config, self.shared) +- self.encoder = T5Stack(encoder_config, self.shared) - - decoder_config = copy.deepcopy(config) - decoder_config.is_decoder = True - decoder_config.is_encoder_decoder = False - decoder_config.num_layers = config.num_decoder_layers -- self.decoder = MT5Stack(decoder_config, self.shared) +- self.decoder = T5Stack(decoder_config, self.shared) - - self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + self.encoder_path = encoder_path @@ -909,17 +933,20 @@ index 1336b9196..5b94d69c7 100644 + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers ++ + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) -+ self.decoder = MT5Stack(decoder_config, self.shared, self.lm_head) ++ self.decoder = T5Stack(decoder_config, self.shared, self.lm_head) ++ + cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) + cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) + encodecrosskey = EncoderToCrossKey(cross_key, config.num_heads, config.d_kv) + encodecrossvalue = EncoderToCrossValue(cross_value, config.num_heads, config.d_kv) ++ + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False -+ self.encoder = MT5Stack_Encoder(encoder_config, self.shared, encodecrosskey=encodecrosskey, encodecrossvalue=encodecrossvalue) ++ self.encoder = T5Stack_Encoder(encoder_config, self.shared, encodecrosskey=encodecrosskey, encodecrossvalue=encodecrossvalue) + self.encoder_mindie = None + self.decoder_mindie = None + if self.encoder_path: @@ -927,28 +954,24 @@ index 1336b9196..5b94d69c7 100644 + self.is_mindie = True + if self.decoder_path: + self.decoder_mindie = torch.jit.load(self.decoder_path) ++ + self.stream = 
torch.npu.Stream(f"npu:{device_id}") + self.device_id = device_id ++ ++ ++ def get_device(self): ++ return f"npu:{self.device_id}" # Initialize weights and apply final processing - self.post_init() -+ if not self.is_mindie: -+ self.post_init() ++ # self.post_init() # Model parallel self.model_parallel = False - self.device_map = None +@@ -1637,25 +2100,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): -+ def get_device(self): -+ return f"npu:{self.device_id}" -+ - @add_start_docstrings(PARALLELIZE_DOCSTRING) - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.parallelize - def parallelize(self, device_map=None): -@@ -1666,25 +2126,7 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): - @add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward with T5->MT5, t5->mt5 - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, @@ -972,7 +995,7 @@ index 1336b9196..5b94d69c7 100644 r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., -@@ -1716,114 +2158,37 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): +@@ -1687,113 +2132,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) >>> # studies have shown that owning a dog is good for you. ```""" @@ -1100,7 +1123,6 @@ index 1336b9196..5b94d69c7 100644 - ) + return (decoder_outputs[0],decoder_outputs[1],decoder_outputs[2]) -- # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation def prepare_inputs_for_generation( self, input_ids, @@ -1112,7 +1134,7 @@ index 1336b9196..5b94d69c7 100644 attention_mask=None, head_mask=None, decoder_head_mask=None, -@@ -1834,8 +2199,8 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): +@@ -1804,8 +2173,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): **kwargs, ): # cut decoder_input_ids if past_key_values is used @@ -1123,7 +1145,12 @@ index 1336b9196..5b94d69c7 100644 # Some generation methods already pass only the last input ID if input_ids.shape[1] > past_length: -@@ -1848,7 +2213,10 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): +@@ -1813,12 +2182,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 +- + input_ids = input_ids[:, remove_prefix_length:] return { "decoder_input_ids": input_ids, @@ -1135,7 +1162,15 @@ index 1336b9196..5b94d69c7 100644 "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "head_mask": head_mask, -@@ -1893,6 +2261,419 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): +@@ -1826,6 +2197,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + "decoder_attention_mask": decoder_attention_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, ++ + } + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): +@@ -1861,6 +2233,460 @@ class T5ForConditionalGeneration(T5PreTrainedModel): reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) return reordered_decoder_past @@ -1426,7 +1461,7 @@ index 1336b9196..5b94d69c7 100644 + model_kwargs 
= self._get_initial_cache_position(input_ids, model_kwargs) + + # keep track of which sequences are already finished -+ if self.is_mindie or self.config.architectures[0]=="MT5ForConditionalGeneration": ++ if self.is_mindie or self.config.architectures[0]=="T5ForConditionalGeneration": + num_layers = self.config.num_layers + num_heads = self.config.num_heads + d_kv = self.config.d_kv @@ -1506,12 +1541,26 @@ index 1336b9196..5b94d69c7 100644 + return input_ids + + def invert_attention_mask(self, encoder_attention_mask): ++ """ ++ Invert an attention mask (e.g., switches 0. and 1.). ++ ++ Args: ++ encoder_attention_mask (`torch.Tensor`): An attention mask. ++ ++ Returns: ++ `torch.Tensor`: The inverted attention mask. ++ """ + if encoder_attention_mask.dim() == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.dim() == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] ++ # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition ++ # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow ++ # /transformer/transformer_layers.py#L270 ++ # encoder_extended_attention_mask = (encoder_extended_attention_mask == ++ # encoder_extended_attention_mask.transpose(-1, -2)) + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility -+ ++ #encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 + + return encoder_extended_attention_mask @@ -1525,19 +1574,37 @@ index 1336b9196..5b94d69c7 100644 + return self.get_device() + + def get_extended_attention_mask( -+ self, attention_mask, input_shape, deviceNone, dtype=None ++ self, attention_mask, input_shape, devic=None, dtype=None + ): ++ """ ++ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. ++ ++ Arguments: ++ attention_mask (`torch.Tensor`): ++ Mask with ones indicating tokens to attend to, zeros for tokens to ignore. ++ input_shape (`Tuple[int]`): ++ The shape of the input to the model. ++ ++ Returns: ++ `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. ++ """ + if dtype is None: + dtype = self.dtype + + if not (attention_mask.dim() == 2 and self.config.is_decoder): ++ # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder` + if device is not None: + warnings.warn( + "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning + ) ++ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] ++ # ourselves in which case we just need to make it broadcastable to all heads. 
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: ++ # Provided a padding mask of dimensions [batch_size, seq_length] ++ # - if the model is a decoder, apply a causal mask in addition to the padding mask ++ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder: + extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( + input_shape, attention_mask, device @@ -1548,10 +1615,27 @@ index 1336b9196..5b94d69c7 100644 + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) ++ ++ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for ++ # masked positions, this operation will create a tensor which is 0.0 for ++ # positions we want to attend and the dtype's smallest value for masked positions. ++ # Since we are adding it to the raw scores before the softmax, this is ++ # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility ++ #extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min + extended_attention_mask = (1.0 - extended_attention_mask) * -1000 + return extended_attention_mask ++ ++ + @add_start_docstrings( - "The bare MT5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", + "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", +@@ -1967,7 +2793,6 @@ class T5EncoderModel(T5PreTrainedModel): + >>> last_hidden_states = outputs.last_hidden_state + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict +- + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, -- Gitee From a0a6f76dcbe2ffd3ca97046f9315d1a87551dba6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 12:43:51 +0000 Subject: [PATCH 073/110] update MindIE/MindIE-Torch/built-in/T5/main.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/main.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/main.py b/MindIE/MindIE-Torch/built-in/T5/main.py index 8ac34ceec5..ccad949c44 100644 --- a/MindIE/MindIE-Torch/built-in/T5/main.py +++ b/MindIE/MindIE-Torch/built-in/T5/main.py @@ -19,11 +19,9 @@ def main(): args = parse_args() torch.npu.set_device(args.device_id) tokenizer = AutoTokenizer.from_pretrained(args.hf_model_path) - text = [ - "translate English to German: The house is wonderful.", - "summarize: I am a high-performance inference optimizer and runtime.", - "During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world", - ] + text = ["贵州毛台现在多少钱一瓶啊,想买两瓶尝尝味道。", + "能不能帮我买点淇淋,好久没吃了", + "脑子有点胡涂了,这道题冥冥学过还没有做出来"] t5_config = T5Config.from_pretrained(args.hf_model_path) model = T5ForConditionalGeneration(config=t5_config, encoder_path=args.encoder_aie_path, -- Gitee From 155a808d014aea535aece353d3fbfa1a88afa404 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 12 Sep 2024 03:38:38 +0000 Subject: [PATCH 074/110] update MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/MT5/modeling_mt5.patch | 280 ++++++------------ 1 file changed, 98 insertions(+), 182 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch b/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch index 95d0455bf5..a5afef98e2 100644 --- a/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch +++ b/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch @@ -1,7 +1,7 @@ -diff --git a/modeling_t5.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py -index 224769fdf..6af548437 100644 ---- a/modeling_t5.py -+++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py +diff --git a/modeling_mt5.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py +index 1336b9196..5b94d69c7 100644 +--- a/modeling_mt5.py ++++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py @@ -19,22 +19,26 @@ import math import os import warnings @@ -29,13 +29,13 @@ index 224769fdf..6af548437 100644 ) -from ...modeling_utils import PreTrainedModel +from ...modeling_utils import PreTrainedModel,ModuleUtilsMixin - from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer + from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( DUMMY_INPUTS, @@ -47,8 +51,44 @@ from ...utils import ( ) from ...utils.model_parallel_utils import assert_device_map, get_device_map - from .configuration_t5 import T5Config + from .configuration_mt5 import MT5Config +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList +from transformers.generation.configuration_utils import GenerationMode @@ -76,8 +76,8 @@ index 224769fdf..6af548437 100644 + logger = logging.get_logger(__name__) - _CONFIG_FOR_DOC = "T5Config" -@@ -448,7 +488,10 @@ class T5Attention(nn.Module): + _CONFIG_FOR_DOC = "MT5Config" +@@ -323,7 +363,10 @@ class MT5Attention(nn.Module): mask=None, key_value_states=None, position_bias=None, @@ -89,7 +89,7 @@ index 224769fdf..6af548437 100644 layer_head_mask=None, query_length=None, use_cache=False, -@@ -464,12 +507,8 @@ class T5Attention(nn.Module): +@@ -339,17 +382,15 @@ class MT5Attention(nn.Module): real_seq_length = seq_length @@ -104,7 +104,14 @@ index 224769fdf..6af548437 100644 key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] -@@ -493,16 +532,17 @@ class T5Attention(nn.Module): + def shape(states): + """projection""" ++ # import pdb ++ # pdb.set_trace() + return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + def unshape(states): +@@ -368,16 +409,17 @@ class MT5Attention(nn.Module): hidden_states = shape(proj_layer(key_value_states)) if past_key_value is not None: @@ -128,7 +135,7 @@ index 224769fdf..6af548437 100644 else: # cross-attn hidden_states = past_key_value -@@ -513,17 +553,16 @@ class T5Attention(nn.Module): +@@ -388,10 +430,10 @@ class MT5Attention(nn.Module): # get key/value states key_states = project( @@ -139,17 +146,9 @@ index 224769fdf..6af548437 100644 - hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None + hidden_states, self.v, key_value_states, past_value if past_value is not None else None ) -- -+ # 
torch.ops.mindie.flash_attention_plugin(query_states, key_states, value_states,) + # compute scores - scores = torch.matmul( - query_states, key_states.transpose(3, 2) - ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 -- - if position_bias is None: - if not self.has_relative_attention_bias: - position_bias = torch.zeros( -@@ -536,7 +575,7 @@ class T5Attention(nn.Module): +@@ -411,7 +453,7 @@ class MT5Attention(nn.Module): # if key and values are already calculated # we want only the last query position bias @@ -158,32 +157,25 @@ index 224769fdf..6af548437 100644 position_bias = position_bias[:, :, -hidden_states.size(1) :, :] if mask is not None: -@@ -548,7 +587,6 @@ class T5Attention(nn.Module): - position_bias_masked = position_bias[:, mask.bool()] - else: - position_bias_masked = position_bias -- - scores += position_bias_masked - attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( - scores -@@ -564,18 +602,131 @@ class T5Attention(nn.Module): +@@ -439,14 +481,124 @@ class MT5Attention(nn.Module): attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) attn_output = self.o(attn_output) - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- + # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None + present_value_state = (value_states.half(),) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) + -+ if output_attentions: -+ outputs = outputs + (attn_weights,) -+ return outputs -+ -+ -+class T5SelfAttention(T5Attention): -+ def __init__(self, config: T5Config, has_relative_attention_bias=False): + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + ++class MT5SelfAttention(MT5Attention): ++ def __init__(self, config: MT5Config, has_relative_attention_bias=False): + super().__init__(config, has_relative_attention_bias) + + def forward( @@ -281,27 +273,19 @@ index 224769fdf..6af548437 100644 + + attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) - ++ + # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None + present_value_state = (value_states.half(), ) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) - if output_attentions: - outputs = outputs + (attn_weights,) - return outputs - - -+ ++ if output_attentions: ++ outputs = outputs + (attn_weights,) ++ return outputs + - class T5LayerSelfAttention(nn.Module): + # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->MT5 + class MT5LayerSelfAttention(nn.Module): def __init__(self, config, has_relative_attention_bias=False): - super().__init__() -- self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) -+ self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = 
nn.Dropout(config.dropout_rate) - -@@ -585,7 +736,8 @@ class T5LayerSelfAttention(nn.Module): +@@ -461,7 +613,8 @@ class MT5LayerSelfAttention(nn.Module): attention_mask=None, position_bias=None, layer_head_mask=None, @@ -311,7 +295,7 @@ index 224769fdf..6af548437 100644 use_cache=False, output_attentions=False, ): -@@ -595,7 +747,8 @@ class T5LayerSelfAttention(nn.Module): +@@ -471,7 +624,8 @@ class MT5LayerSelfAttention(nn.Module): mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, @@ -321,7 +305,7 @@ index 224769fdf..6af548437 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -618,7 +771,8 @@ class T5LayerCrossAttention(nn.Module): +@@ -495,7 +649,8 @@ class MT5LayerCrossAttention(nn.Module): attention_mask=None, position_bias=None, layer_head_mask=None, @@ -331,7 +315,7 @@ index 224769fdf..6af548437 100644 use_cache=False, query_length=None, output_attentions=False, -@@ -630,7 +784,8 @@ class T5LayerCrossAttention(nn.Module): +@@ -507,7 +662,8 @@ class MT5LayerCrossAttention(nn.Module): key_value_states=key_value_states, position_bias=position_bias, layer_head_mask=layer_head_mask, @@ -341,7 +325,7 @@ index 224769fdf..6af548437 100644 use_cache=use_cache, query_length=query_length, output_attentions=output_attentions, -@@ -661,39 +816,34 @@ class T5Block(nn.Module): +@@ -539,39 +695,34 @@ class MT5Block(nn.Module): encoder_decoder_position_bias=None, layer_head_mask=None, cross_attn_layer_head_mask=None, @@ -395,11 +379,8 @@ index 224769fdf..6af548437 100644 # clamp inf values to enable fp16 training if hidden_states.dtype == torch.float16: -@@ -706,22 +856,23 @@ class T5Block(nn.Module): - - do_cross_attention = self.is_decoder and encoder_hidden_states is not None +@@ -586,8 +737,8 @@ class MT5Block(nn.Module): if do_cross_attention: -+ # the actual query length is unknown for cross attention # if using past key value states. 
Need to inject it here - if present_key_value_state is not None: @@ -408,10 +389,8 @@ index 224769fdf..6af548437 100644 + query_length = present_key_state[0].shape[2] else: query_length = None -- - cross_attention_outputs = self.layer[1]( - hidden_states, - key_value_states=encoder_hidden_states, + +@@ -597,7 +748,8 @@ class MT5Block(nn.Module): attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, @@ -419,12 +398,9 @@ index 224769fdf..6af548437 100644 + past_key=cross_attn_past_key, + past_value=cross_attn_past_value, query_length=query_length, -- use_cache=use_cache, -+ use_cache=use_cache, + use_cache=use_cache, output_attentions=output_attentions, - ) - hidden_states = cross_attention_outputs[0] -@@ -736,11 +887,9 @@ class T5Block(nn.Module): +@@ -614,11 +766,9 @@ class MT5Block(nn.Module): hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) # Combine self attn and cross attn key value states @@ -438,7 +414,7 @@ index 224769fdf..6af548437 100644 # Apply Feed Forward layer hidden_states = self.layer[-1](hidden_states) -@@ -757,7 +906,7 @@ class T5Block(nn.Module): +@@ -635,7 +785,7 @@ class MT5Block(nn.Module): outputs = (hidden_states,) if use_cache: @@ -447,10 +423,10 @@ index 224769fdf..6af548437 100644 else: outputs = outputs + attention_outputs -@@ -897,11 +1046,15 @@ class T5PreTrainedModel(PreTrainedModel): +@@ -884,11 +1034,15 @@ class MT5PreTrainedModel(PreTrainedModel): - - class T5Stack(T5PreTrainedModel): + # Copied from transformers.models.t5.modeling_t5.T5Stack with T5->MT5 + class MT5Stack(MT5PreTrainedModel): - def __init__(self, config, embed_tokens=None): + def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskey=None, encodecrossvalue=None): super().__init__(config) @@ -463,8 +439,8 @@ index 224769fdf..6af548437 100644 + self.model_dim = config.d_model self.block = nn.ModuleList( - [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -966,20 +1119,64 @@ class T5Stack(T5PreTrainedModel): + [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] +@@ -953,20 +1107,63 @@ class MT5Stack(MT5PreTrainedModel): def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings @@ -476,7 +452,6 @@ index 224769fdf..6af548437 100644 + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 -+ print("encoder_extended_attention_mask=",encoder_extended_attention_mask) + + return encoder_extended_attention_mask + @@ -531,7 +506,7 @@ index 224769fdf..6af548437 100644 ): # Model parallel if self.model_parallel: -@@ -998,8 +1195,10 @@ class T5Stack(T5PreTrainedModel): +@@ -985,8 +1182,10 @@ class MT5Stack(MT5PreTrainedModel): f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: @@ -542,7 +517,7 @@ index 224769fdf..6af548437 100644 elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: -@@ -1012,18 +1211,19 @@ class T5Stack(T5PreTrainedModel): +@@ -999,18 +1198,19 @@ class MT5Stack(MT5PreTrainedModel): inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = input_shape @@ -567,7 +542,7 @@ index 224769fdf..6af548437 100644 if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length, 
device=inputs_embeds.device) -@@ -1054,7 +1254,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1041,7 +1241,8 @@ class MT5Stack(MT5PreTrainedModel): # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) @@ -577,7 +552,7 @@ index 224769fdf..6af548437 100644 all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1062,8 +1263,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1049,8 +1250,8 @@ class MT5Stack(MT5PreTrainedModel): encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) @@ -588,7 +563,7 @@ index 224769fdf..6af548437 100644 layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] # Model parallel -@@ -1112,7 +1313,10 @@ class T5Stack(T5PreTrainedModel): +@@ -1099,7 +1300,10 @@ class MT5Stack(MT5PreTrainedModel): encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, @@ -600,7 +575,7 @@ index 224769fdf..6af548437 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -1120,19 +1324,20 @@ class T5Stack(T5PreTrainedModel): +@@ -1107,19 +1311,20 @@ class MT5Stack(MT5PreTrainedModel): # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: @@ -626,7 +601,7 @@ index 224769fdf..6af548437 100644 if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) -@@ -1146,7 +1351,7 @@ class T5Stack(T5PreTrainedModel): +@@ -1133,7 +1338,7 @@ class MT5Stack(MT5PreTrainedModel): hidden_states = hidden_states.to("cuda:" + str(k + 1)) hidden_states = self.final_layer_norm(hidden_states) @@ -635,7 +610,7 @@ index 224769fdf..6af548437 100644 # Add last layer if output_hidden_states: -@@ -1164,13 +1369,216 @@ class T5Stack(T5PreTrainedModel): +@@ -1151,13 +1356,216 @@ class MT5Stack(MT5PreTrainedModel): ] if v is not None ) @@ -661,7 +636,7 @@ index 224769fdf..6af548437 100644 + return tuple((lm_logits, present_key_states, present_value_states)) + + -+class T5Stack_Encoder(T5PreTrainedModel): ++class MT5Stack_Encoder(MT5PreTrainedModel): + def __init__(self, config, embed_tokens=None, encodecrosskey=None, encodecrossvalue=None): + super().__init__(config) + self.embed_tokens = embed_tokens @@ -671,9 +646,9 @@ index 224769fdf..6af548437 100644 + self.model_dim = config.d_model + + self.block = nn.ModuleList( -+ [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] ++ [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] ) -+ self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) ++ self.final_layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + # Initialize weights and apply final processing @@ -857,8 +832,8 @@ index 224769fdf..6af548437 100644 + return tuple((hidden_states, cross_keys, cross_values)) - T5_START_DOCSTRING = r""" -@@ -1541,6 +1949,38 @@ class T5Model(T5PreTrainedModel): + MT5_START_DOCSTRING = r""" +@@ -1549,6 +1957,39 @@ class MT5Model(MT5PreTrainedModel): ) @@ -875,6 +850,8 @@ index 224769fdf..6af548437 100644 + past_cross_keys = () 
+ for i in range(len(self.cross_key)): + past_cross_keys += (self.cross_key[i](hidden_states),) ++ # import pdb ++ # pdb.set_trace() + return past_cross_keys + + @@ -893,16 +870,15 @@ index 224769fdf..6af548437 100644 + past_cross_values += (self.cross_value[i](hidden_states),) + return past_cross_values + -+ - @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) - class T5ForConditionalGeneration(T5PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [ -@@ -1548,28 +1988,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): - ] + @add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING) + class MT5ForConditionalGeneration(MT5PreTrainedModel): + r""" +@@ -1573,33 +2014,52 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] -- def __init__(self, config: T5Config): -+ def __init__(self, config: T5Config, encoder_path=None, decoder_path=None, device_id=0): + # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5 +- def __init__(self, config: MT5Config): ++ def __init__(self, config: MT5Config, encoder_path=None, decoder_path=None, device_id=0): super().__init__(config) - self.model_dim = config.d_model - @@ -912,13 +888,13 @@ index 224769fdf..6af548437 100644 - encoder_config.is_decoder = False - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False -- self.encoder = T5Stack(encoder_config, self.shared) +- self.encoder = MT5Stack(encoder_config, self.shared) - - decoder_config = copy.deepcopy(config) - decoder_config.is_decoder = True - decoder_config.is_encoder_decoder = False - decoder_config.num_layers = config.num_decoder_layers -- self.decoder = T5Stack(decoder_config, self.shared) +- self.decoder = MT5Stack(decoder_config, self.shared) - - self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + self.encoder_path = encoder_path @@ -933,20 +909,17 @@ index 224769fdf..6af548437 100644 + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers -+ + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) -+ self.decoder = T5Stack(decoder_config, self.shared, self.lm_head) -+ ++ self.decoder = MT5Stack(decoder_config, self.shared, self.lm_head) + cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) + cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) + encodecrosskey = EncoderToCrossKey(cross_key, config.num_heads, config.d_kv) + encodecrossvalue = EncoderToCrossValue(cross_value, config.num_heads, config.d_kv) -+ + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False -+ self.encoder = T5Stack_Encoder(encoder_config, self.shared, encodecrosskey=encodecrosskey, encodecrossvalue=encodecrossvalue) ++ self.encoder = MT5Stack_Encoder(encoder_config, self.shared, encodecrosskey=encodecrosskey, encodecrossvalue=encodecrossvalue) + self.encoder_mindie = None + self.decoder_mindie = None + if self.encoder_path: @@ -954,24 +927,28 @@ index 224769fdf..6af548437 100644 + self.is_mindie = True + if self.decoder_path: + self.decoder_mindie = torch.jit.load(self.decoder_path) -+ + self.stream = 
torch.npu.Stream(f"npu:{device_id}") + self.device_id = device_id -+ -+ -+ def get_device(self): -+ return f"npu:{self.device_id}" # Initialize weights and apply final processing - self.post_init() -+ # self.post_init() ++ if not self.is_mindie: ++ self.post_init() # Model parallel self.model_parallel = False -@@ -1637,25 +2100,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + self.device_map = None - @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) ++ def get_device(self): ++ return f"npu:{self.device_id}" ++ + @add_start_docstrings(PARALLELIZE_DOCSTRING) + # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.parallelize + def parallelize(self, device_map=None): +@@ -1666,25 +2126,7 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): + @add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward with T5->MT5, t5->mt5 - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, @@ -995,7 +972,7 @@ index 224769fdf..6af548437 100644 r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., -@@ -1687,113 +2132,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1716,114 +2158,37 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) >>> # studies have shown that owning a dog is good for you. ```""" @@ -1123,6 +1100,7 @@ index 224769fdf..6af548437 100644 - ) + return (decoder_outputs[0],decoder_outputs[1],decoder_outputs[2]) +- # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation def prepare_inputs_for_generation( self, input_ids, @@ -1134,7 +1112,7 @@ index 224769fdf..6af548437 100644 attention_mask=None, head_mask=None, decoder_head_mask=None, -@@ -1804,8 +2173,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1834,8 +2199,8 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): **kwargs, ): # cut decoder_input_ids if past_key_values is used @@ -1145,12 +1123,7 @@ index 224769fdf..6af548437 100644 # Some generation methods already pass only the last input ID if input_ids.shape[1] > past_length: -@@ -1813,12 +2182,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 -- - input_ids = input_ids[:, remove_prefix_length:] +@@ -1848,7 +2213,10 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): return { "decoder_input_ids": input_ids, @@ -1162,15 +1135,7 @@ index 224769fdf..6af548437 100644 "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "head_mask": head_mask, -@@ -1826,6 +2197,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): - "decoder_attention_mask": decoder_attention_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, -+ - } - - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): -@@ -1861,6 +2233,460 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1893,6 +2261,419 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) return reordered_decoder_past @@ -1461,7 +1426,7 @@ index 224769fdf..6af548437 100644 + model_kwargs 
= self._get_initial_cache_position(input_ids, model_kwargs) + + # keep track of which sequences are already finished -+ if self.is_mindie or self.config.architectures[0]=="T5ForConditionalGeneration": ++ if self.is_mindie or self.config.architectures[0]=="MT5ForConditionalGeneration": + num_layers = self.config.num_layers + num_heads = self.config.num_heads + d_kv = self.config.d_kv @@ -1541,26 +1506,12 @@ index 224769fdf..6af548437 100644 + return input_ids + + def invert_attention_mask(self, encoder_attention_mask): -+ """ -+ Invert an attention mask (e.g., switches 0. and 1.). -+ -+ Args: -+ encoder_attention_mask (`torch.Tensor`): An attention mask. -+ -+ Returns: -+ `torch.Tensor`: The inverted attention mask. -+ """ + if encoder_attention_mask.dim() == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.dim() == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] -+ # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition -+ # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow -+ # /transformer/transformer_layers.py#L270 -+ # encoder_extended_attention_mask = (encoder_extended_attention_mask == -+ # encoder_extended_attention_mask.transpose(-1, -2)) + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility -+ #encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min ++ + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 + + return encoder_extended_attention_mask @@ -1574,37 +1525,19 @@ index 224769fdf..6af548437 100644 + return self.get_device() + + def get_extended_attention_mask( -+ self, attention_mask, input_shape, devic=None, dtype=None ++ self, attention_mask, input_shape, deviceNone, dtype=None + ): -+ """ -+ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. -+ -+ Arguments: -+ attention_mask (`torch.Tensor`): -+ Mask with ones indicating tokens to attend to, zeros for tokens to ignore. -+ input_shape (`Tuple[int]`): -+ The shape of the input to the model. -+ -+ Returns: -+ `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. -+ """ + if dtype is None: + dtype = self.dtype + + if not (attention_mask.dim() == 2 and self.config.is_decoder): -+ # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder` + if device is not None: + warnings.warn( + "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning + ) -+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] -+ # ourselves in which case we just need to make it broadcastable to all heads. 
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: -+ # Provided a padding mask of dimensions [batch_size, seq_length] -+ # - if the model is a decoder, apply a causal mask in addition to the padding mask -+ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder: + extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( + input_shape, attention_mask, device @@ -1615,27 +1548,10 @@ index 224769fdf..6af548437 100644 + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) -+ -+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for -+ # masked positions, this operation will create a tensor which is 0.0 for -+ # positions we want to attend and the dtype's smallest value for masked positions. -+ # Since we are adding it to the raw scores before the softmax, this is -+ # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility -+ #extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min + extended_attention_mask = (1.0 - extended_attention_mask) * -1000 + return extended_attention_mask -+ -+ + @add_start_docstrings( - "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", -@@ -1967,7 +2793,6 @@ class T5EncoderModel(T5PreTrainedModel): - >>> last_hidden_states = outputs.last_hidden_state - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict -- - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, + "The bare MT5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", -- Gitee From 8076e460d04572e199c4b835d812d617e6162fde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 12 Sep 2024 04:32:49 +0000 Subject: [PATCH 075/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_t5.patch | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch index d0c6a08f48..e4bd899bde 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch @@ -1,5 +1,5 @@ diff --git a/modeling_t5.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py -index 224769fdf..6af548437 100644 +index 224769fdf..1c2d8d185 100644 --- a/modeling_t5.py +++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py @@ -19,22 +19,26 @@ import math @@ -464,7 +464,7 @@ index 224769fdf..6af548437 100644 self.block = nn.ModuleList( [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -966,20 +1119,64 @@ class T5Stack(T5PreTrainedModel): +@@ -966,20 +1119,63 @@ class T5Stack(T5PreTrainedModel): def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings @@ -530,7 +530,7 @@ index 224769fdf..6af548437 100644 ): # Model parallel if self.model_parallel: -@@ -998,8 +1195,10 @@ class T5Stack(T5PreTrainedModel): +@@ -998,8 +1194,10 @@ class T5Stack(T5PreTrainedModel): f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: @@ -541,7 +541,7 @@ index 224769fdf..6af548437 100644 elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: -@@ -1012,18 +1211,19 @@ class T5Stack(T5PreTrainedModel): +@@ -1012,18 +1210,19 @@ class T5Stack(T5PreTrainedModel): inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = input_shape @@ -566,7 +566,7 @@ index 224769fdf..6af548437 100644 if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) -@@ -1054,7 +1254,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1054,7 +1253,8 @@ class T5Stack(T5PreTrainedModel): # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) @@ -576,7 +576,7 @@ index 224769fdf..6af548437 100644 all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1062,8 +1263,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1062,8 +1262,8 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) @@ -587,7 +587,7 @@ index 224769fdf..6af548437 100644 layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] # Model parallel -@@ -1112,7 +1313,10 @@ class T5Stack(T5PreTrainedModel): +@@ -1112,7 +1312,10 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, @@ -599,7 +599,7 @@ index 224769fdf..6af548437 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -1120,19 +1324,20 @@ class T5Stack(T5PreTrainedModel): +@@ -1120,19 +1323,20 @@ class T5Stack(T5PreTrainedModel): # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention position bias), 
(self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: @@ -625,7 +625,7 @@ index 224769fdf..6af548437 100644 if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) -@@ -1146,7 +1351,7 @@ class T5Stack(T5PreTrainedModel): +@@ -1146,7 +1350,7 @@ class T5Stack(T5PreTrainedModel): hidden_states = hidden_states.to("cuda:" + str(k + 1)) hidden_states = self.final_layer_norm(hidden_states) @@ -634,7 +634,7 @@ index 224769fdf..6af548437 100644 # Add last layer if output_hidden_states: -@@ -1164,13 +1369,216 @@ class T5Stack(T5PreTrainedModel): +@@ -1164,13 +1368,216 @@ class T5Stack(T5PreTrainedModel): ] if v is not None ) @@ -857,7 +857,7 @@ index 224769fdf..6af548437 100644 T5_START_DOCSTRING = r""" -@@ -1541,6 +1949,38 @@ class T5Model(T5PreTrainedModel): +@@ -1541,6 +1948,38 @@ class T5Model(T5PreTrainedModel): ) @@ -896,7 +896,7 @@ index 224769fdf..6af548437 100644 @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) class T5ForConditionalGeneration(T5PreTrainedModel): _keys_to_ignore_on_load_unexpected = [ -@@ -1548,28 +1988,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1548,28 +1987,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] @@ -967,7 +967,7 @@ index 224769fdf..6af548437 100644 # Model parallel self.model_parallel = False -@@ -1637,25 +2100,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1637,25 +2099,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) @@ -994,7 +994,7 @@ index 224769fdf..6af548437 100644 r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., -@@ -1687,113 +2132,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1687,113 +2131,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) >>> # studies have shown that owning a dog is good for you. 
```""" @@ -1133,7 +1133,7 @@ index 224769fdf..6af548437 100644 attention_mask=None, head_mask=None, decoder_head_mask=None, -@@ -1804,8 +2173,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1804,8 +2172,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): **kwargs, ): # cut decoder_input_ids if past_key_values is used @@ -1144,7 +1144,7 @@ index 224769fdf..6af548437 100644 # Some generation methods already pass only the last input ID if input_ids.shape[1] > past_length: -@@ -1813,12 +2182,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1813,12 +2181,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): else: # Default to old behavior: keep only final ID remove_prefix_length = input_ids.shape[1] - 1 @@ -1161,7 +1161,7 @@ index 224769fdf..6af548437 100644 "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "head_mask": head_mask, -@@ -1826,6 +2197,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1826,6 +2196,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): "decoder_attention_mask": decoder_attention_mask, "cross_attn_head_mask": cross_attn_head_mask, "use_cache": use_cache, @@ -1169,7 +1169,7 @@ index 224769fdf..6af548437 100644 } def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): -@@ -1861,6 +2233,460 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1861,6 +2232,460 @@ class T5ForConditionalGeneration(T5PreTrainedModel): reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) return reordered_decoder_past @@ -1630,7 +1630,7 @@ index 224769fdf..6af548437 100644 @add_start_docstrings( "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", -@@ -1967,7 +2793,6 @@ class T5EncoderModel(T5PreTrainedModel): +@@ -1967,7 +2792,6 @@ class T5EncoderModel(T5PreTrainedModel): >>> last_hidden_states = outputs.last_hidden_state ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict -- Gitee From c99410d6e6c9018eeb9cc7975291fd242a343315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 12 Sep 2024 07:08:13 +0000 Subject: [PATCH 076/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/export=5Fmt5.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/T5/export_mt5.py | 181 ------------------ 1 file changed, 181 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/export_mt5.py diff --git a/MindIE/MindIE-Torch/built-in/T5/export_mt5.py b/MindIE/MindIE-Torch/built-in/T5/export_mt5.py deleted file mode 100644 index dc8308e362..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/export_mt5.py +++ /dev/null @@ -1,181 +0,0 @@ - -import torch -import torch_npu -import argparse -import os -import mindietorch -from transformers import MT5ForConditionalGeneration - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--output_dir", - type=str, - default="./models", - help="save dir" - ) - parser.add_argument( - "--model_path", - type=str, - default="./T5-Small", - help="T5 model path" - ) - parser.add_argument( - "--max_batchsize", - type=int, - default=1, - help="max batchsize when running" - ) - - parser.add_argument( - "--max_input_seq_len", - type=int, - default=256, - help="max input_sequence length when running" - ) - - - parser.add_argument( - "--device_id", - type=int, - default=0, - help="npu 
device id" - ) - return parser.parse_args() - - -class TextEncoderExport(torch.nn.Module): - def __init__(self, textencoder_model): - super(TextEncoderExport, self).__init__() - self.textencoder_model = textencoder_model - - def forward(self, input_ids): - return self.textencoder_model(input_ids=input_ids) - -class TextDecoderExport(torch.nn.Module): - def __init__(self, textdecoder_model): - super(TextDecoderExport, self).__init__() - self.textdecoder_model = textdecoder_model - - def forward(self, - input_ids, - encoder_hidden_states, - encoder_attention_mask, - past_key_values, - past_cross_key_values): - return self.textdecoder_model(input_ids=input_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - past_cross_key_values=past_cross_key_values, - return_dict=True) - -def export_textencoder(args, model, save_dir, batch_size): - encoder_path = os.path.join(save_dir, "encoder") - if not os.path.exists(encoder_path): - os.makedirs(encoder_path, mode=0o640) - traced_path = os.path.join(encoder_path, "encoder.pt") - compiled_path = os.path.join(encoder_path, "encoder_compiled.pt") - if not os.path.exists(traced_path): - text_encoder = model.encoder - dummy_input = ( - torch.ones([1, 128], dtype=torch.int64).npu() - ) - encoder = TextEncoderExport(text_encoder) - encoder.eval() - torch.jit.trace(encoder, dummy_input, strict=False).save(traced_path) - if not os.path.exists(compiled_path): - traced_model = torch.jit.load(traced_path).eval() - - inputs0 = [] - # inputs1 = [] - inputs0.append(mindietorch.Input(min_shape = (1,1), max_shape= (args.max_batchsize, args.max_input_seq_len), dtype=torch.int64)) - print("compiling encoder") - compiled_model = mindietorch.compile( - traced_model, - inputs=inputs0, - allow_tensor_replace_int=True, - require_full_compilation=False, - truncate_long_and_double=True, - precision_policy=mindietorch.PrecisionPolicy.FP16, - soc_version="Ascend910B4", - optimization_level=0 - ) - compiled_model.save(compiled_path) - -def export_textdecoder(args, model, save_dir, batch_size): - decoder_path = os.path.join(save_dir, "decoder") - if not os.path.exists(decoder_path): - os.makedirs(decoder_path, mode=0o640) - traced_path = os.path.join(decoder_path, "decoder.pt") - compiled_path = os.path.join(decoder_path, "decoder_compiled.pt") - model_path = args.model_path - max_lenth = 120 - if not os.path.exists(traced_path): - text_decoder = model.decoder - dummy_input = ( - torch.ones([1, 1], dtype=torch.int64).npu(), - torch.randn(1, 16, model.config.d_model).to(torch.float16).npu(), - torch.ones(1,16).npu(), - torch.randn(model.config.num_layers, 2, 1, model.config.num_heads, 1, model.config.d_kv).to(torch.float16).npu(), - torch.randn(model.config.num_layers, 2, 1, 16, model.config.d_kv*model.config.num_heads).to(torch.float16).npu() - ) - decoder = TextDecoderExport(text_decoder).npu() - decoder.eval() - torch.jit.trace(decoder, dummy_input,strict=False).save(traced_path) - if not os.path.exists(compiled_path): - traced_model = torch.jit.load(traced_path).eval() - print("compiling decoder") - compiled_model = mindietorch.compile( - traced_model, - inputs=[mindietorch.Input(min_shape =(1, 1), - max_shape = (args.max_batchsize,1), - dtype=mindietorch.dtype.INT64), - - mindietorch.Input(min_shape =(1, 1, model.config.d_model), - max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), - dtype=mindietorch.dtype.FLOAT16), - - mindietorch.Input(min_shape = (1,1), - max_shape 
=(args.max_batchsize,args.max_input_seq_len), - dtype=mindietorch.dtype.INT64), - mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, model.config.num_heads, 0, model.config.d_kv), - max_shape = (model.config.num_layers, 2, args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), - dtype=mindietorch.dtype.FLOAT16), - - mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, 1, model.config.d_kv*model.config.num_heads), - max_shape = (model.config.num_layers, 2, args.max_batchsize, args.max_input_seq_len,model.config.d_kv*model.config.num_heads), - dtype=mindietorch.dtype.FLOAT16)], - allow_tensor_replace_int=True, - require_full_compilation=False, - truncate_long_and_double=True, - precision_policy=mindietorch.PrecisionPolicy.FP16, - soc_version="Ascend910B4", - optimization_level=0 - ) - compiled_model.save(compiled_path) - -def main(): - args = parse_arguments() - device_id = args.device_id - save_dir = args.output_dir - torch.npu.set_device(device_id) - batch_size = 1 - model = MT5ForConditionalGeneration.from_pretrained(args.model_path, torch_dtype=torch.float).npu() - encoder_path = os.path.join(save_dir, "encoder") - compiled_path = os.path.join(encoder_path, "encoder_compiled.pt") - if not os.path.exists(compiled_path): - export_textencoder(args, model, save_dir, batch_size) - print("export encoder_model done!") - - decoder_path = os.path.join(save_dir, "decoder") - compiled_path = os.path.join(decoder_path, "decoder_compiled.pt") - if not os.path.exists(compiled_path): - export_textdecoder(args, model, save_dir, batch_size) - print("export decoder_model done!") - - - - -if __name__ == "__main__": - main() -- Gitee From 216d9ee41b56e57fea983cb995f0367f8b4c99fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 12 Sep 2024 07:08:30 +0000 Subject: [PATCH 077/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/test=5Fmt5.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/T5/test_mt5.py | 54 --------------------- 1 file changed, 54 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/test_mt5.py diff --git a/MindIE/MindIE-Torch/built-in/T5/test_mt5.py b/MindIE/MindIE-Torch/built-in/T5/test_mt5.py deleted file mode 100644 index af441392d4..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/test_mt5.py +++ /dev/null @@ -1,54 +0,0 @@ -import torch -import time -import argparse -import torch_npu -from transformers import MT5ForConditionalGeneration, AutoTokenizer, MT5Config - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--hf_model_path", type=str, required=True) - - parser.add_argument("--encoder_aie_path", type=str, required=True) - parser.add_argument("--decoder_aie_path", type=str, required=True) - - parser.add_argument("--device_id", type=int, help="NPU device id", default=0) - - args = parser.parse_args() - return args - -def main(): - args = parse_args() - torch.npu.set_device(args.device_id) - tokenizer = AutoTokenizer.from_pretrained(args.hf_model_path) - text = [ - "translate English to German: The house is wonderful.", - "summarize: I am a high-performance inference optimizer and runtime.", - "During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world", - ] - model = MT5ForConditionalGeneration.from_pretrained(args.hf_model_path, torch_dtype=torch.float16).npu() - 
encoder = model.encoder - decoder = model.decoder - encoder_input = torch.randint(0,2000,(8,10), dtype=torch.int64).npu() - t5_config = MT5Config.from_pretrained(args.hf_model_path) - - encoder_output = encoder(encoder_input)[0] - model = MT5ForConditionalGeneration(config=t5_config, - encoder_path=args.encoder_aie_path, - decoder_path=args.decoder_aie_path, - device_id=args.device_id).half().npu() - - encoder_mindie = model.encoder_mindie - decoder_mindie = model.decoder_mindie - mindie_stream = model.stream - with torch.npu.stream(mindie_stream): # set stream - mindie_encoder_output = encoder_mindie(encoder_input)[0] - mindie_stream.synchronize() # synchronize - if (torch.cosine_similarity(encoder_output.cpu().flatten(), mindie_encoder_output.cpu().flatten(),dim=0)) < 0.99: - print("encoder precision failed") - else: - print("test OK") - - -if __name__ == "__main__": - main() - -- Gitee From e3358cabfb41bbd3b2c4d9a9fcaeb2ceb01394ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 23 Sep 2024 11:05:31 +0000 Subject: [PATCH 078/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/perf=5Ftest=5Faie.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../MindIE-Torch/built-in/T5/perf_test_aie.py | 115 ------------------ 1 file changed, 115 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py diff --git a/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py b/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py deleted file mode 100644 index 97c02916fe..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import time -import argparse -import json - -import numpy as np -import torch -import torch_npu -import mindietorch -from tqdm import tqdm - -def test_encoder(aie_path, args, device_id = 0): - batch_size = args.batch_size - device_id = args.device_id - seq_len = args.seq_len - device = f'npu:{device_id}' - stream = torch.npu.Stream(f"npu:{device_id}") - print("Start loading ts module...") - ts = torch.jit.load(aie_path) - print("Ts module loaded.") - ts.eval() - dummy_input = (torch.ones([batch_size, seq_len], dtype=torch.int64).npu()) - print("Start infering...") - # warmup - for _ in range(10): - with torch.npu.stream(stream): - ts(dummy_input) - stream.synchronize() - - # performance test - num_infer = 100 - - start = time.time() - for _ in tqdm(range(num_infer)): - with torch.npu.stream(stream): - ts(dummy_input) - stream.synchronize() - end = time.time() - print(f"Encoder latency: {(end - start) / num_infer * 1000:.2f} ms") - print(f"Encoder throughput: {num_infer * batch_size / (end - start):.2f} fps") - - -def test_decoder(aie_path, args): - batch_size = args.batch_size - device_id = args.device_id - seq_len = args.seq_len - device = f'npu:{device_id}' - stream = torch.npu.Stream(f"npu:{device_id}") - print("Start loading ts module...") - ts = torch.jit.load(aie_path) - print("Ts module loaded.") - ts.eval() - dummy_input = ( - torch.ones([batch_size, 1], dtype=torch.int64).npu(), - torch.randn(batch_size,seq_len,512).to(torch.float16).npu(), - torch.ones(batch_size,seq_len, dtype=torch.int64).npu(), - torch.randn(6,2,batch_size,8,1,64).to(torch.float16).npu(), - torch.randn(6,2,batch_size,8,24,64).to(torch.float16).npu() - ) - - # warmup - for _ in range(10): - with torch.npu.stream(stream): - ts.forward(dummy_input[0],dummy_input[1],dummy_input[2],dummy_input[3],dummy_input[4]) - stream.synchronize() - - # performance test - num_infer = 100 - start = time.time() - for _ in tqdm(range(num_infer)): - with torch.npu.stream(stream): - ts.forward(dummy_input[0],dummy_input[1],dummy_input[2],dummy_input[3],dummy_input[4]) - stream.synchronize() - end = time.time() - - print(f"Decoder latency: {(end - start) / num_infer * 1000:.2f} ms") - print(f"Decoder throughput: {num_infer * batch_size / (end - start):.2f} fps") - -def parse_args(): - parser = argparse.ArgumentParser() - - parser.add_argument("--encoder_aie_path", type=str, required=True) - parser.add_argument("--decoder_aie_path", type=str, required=True) - parser.add_argument("--batch_size", type=int, help="NPU device id", default=1) - parser.add_argument("--seq_len", type=int, help="NPU device id", default=128) - - parser.add_argument("--device_id", type=int, help="NPU device id", default=0) - - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - torch.npu.set_device(args.device_id) - test_encoder(args.encoder_aie_path, args) - test_decoder(args.decoder_aie_path, args) - - -if __name__ == "__main__": - main() -- Gitee From d26ac1dd029258a464c01bd5f967881596a6149e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 23 Sep 2024 11:14:20 +0000 Subject: [PATCH 079/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_t5.patch | 115 +++++++++--------- 1 file changed, 55 insertions(+), 60 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch index e4bd899bde..74bda2bb86 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch @@ -1,8 +1,8 @@ -diff --git a/modeling_t5.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py -index 224769fdf..1c2d8d185 100644 ---- a/modeling_t5.py +diff --git a/modeling_t5_origin.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py +index 224769fdf..cfa27e8c6 100644 +--- a/modeling_t5_origin.py +++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py -@@ -19,22 +19,26 @@ import math +@@ -19,7 +19,7 @@ import math import os import warnings from typing import List, Optional, Tuple, Union @@ -11,13 +11,7 @@ index 224769fdf..1c2d8d185 100644 import torch from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -+# import torch_npu -+import mindietorch -+ -+ -+ - - from ...activations import ACT2FN +@@ -28,13 +28,12 @@ from ...activations import ACT2FN from ...modeling_outputs import ( BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, @@ -32,7 +26,7 @@ index 224769fdf..1c2d8d185 100644 from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( DUMMY_INPUTS, -@@ -47,8 +51,44 @@ from ...utils import ( +@@ -47,7 +46,43 @@ from ...utils import ( ) from ...utils.model_parallel_utils import assert_device_map, get_device_map from .configuration_t5 import T5Config @@ -40,13 +34,13 @@ index 224769fdf..1c2d8d185 100644 +from transformers.generation.stopping_criteria import StoppingCriteriaList +from transformers.generation.configuration_utils import GenerationMode +from transformers.utils.generic import ModelOutput - - ++ ++ +@dataclass +class Seq2SeqLMOutput(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. -+ + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
@@ -73,11 +67,10 @@ index 224769fdf..1c2d8d185 100644 + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None -+ + logger = logging.get_logger(__name__) - _CONFIG_FOR_DOC = "T5Config" -@@ -448,7 +488,10 @@ class T5Attention(nn.Module): +@@ -448,7 +483,10 @@ class T5Attention(nn.Module): mask=None, key_value_states=None, position_bias=None, @@ -89,7 +82,7 @@ index 224769fdf..1c2d8d185 100644 layer_head_mask=None, query_length=None, use_cache=False, -@@ -464,12 +507,8 @@ class T5Attention(nn.Module): +@@ -464,12 +502,8 @@ class T5Attention(nn.Module): real_seq_length = seq_length @@ -104,7 +97,7 @@ index 224769fdf..1c2d8d185 100644 key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] -@@ -493,16 +532,17 @@ class T5Attention(nn.Module): +@@ -493,16 +527,17 @@ class T5Attention(nn.Module): hidden_states = shape(proj_layer(key_value_states)) if past_key_value is not None: @@ -128,7 +121,7 @@ index 224769fdf..1c2d8d185 100644 else: # cross-attn hidden_states = past_key_value -@@ -513,17 +553,16 @@ class T5Attention(nn.Module): +@@ -513,17 +548,16 @@ class T5Attention(nn.Module): # get key/value states key_states = project( @@ -149,7 +142,7 @@ index 224769fdf..1c2d8d185 100644 if position_bias is None: if not self.has_relative_attention_bias: position_bias = torch.zeros( -@@ -536,7 +575,7 @@ class T5Attention(nn.Module): +@@ -536,7 +570,7 @@ class T5Attention(nn.Module): # if key and values are already calculated # we want only the last query position bias @@ -158,7 +151,7 @@ index 224769fdf..1c2d8d185 100644 position_bias = position_bias[:, :, -hidden_states.size(1) :, :] if mask is not None: -@@ -548,7 +587,6 @@ class T5Attention(nn.Module): +@@ -548,7 +582,6 @@ class T5Attention(nn.Module): position_bias_masked = position_bias[:, mask.bool()] else: position_bias_masked = position_bias @@ -166,7 +159,7 @@ index 224769fdf..1c2d8d185 100644 scores += position_bias_masked attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( scores -@@ -564,18 +602,131 @@ class T5Attention(nn.Module): +@@ -564,18 +597,131 @@ class T5Attention(nn.Module): attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) attn_output = self.o(attn_output) @@ -224,7 +217,7 @@ index 224769fdf..1c2d8d185 100644 + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(hidden_states)) -+ + + if past_key_value is not None: + hidden_states = shape(proj_layer(hidden_states)) + hidden_states = torch.cat([past_key_value, hidden_states], dim=2) @@ -281,7 +274,7 @@ index 224769fdf..1c2d8d185 100644 + + attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) - ++ + # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None + present_value_state = (value_states.half(), ) if (self.is_decoder and use_cache) else None @@ -301,7 +294,7 @@ index 224769fdf..1c2d8d185 100644 self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) -@@ -585,7 +736,8 @@ class T5LayerSelfAttention(nn.Module): +@@ -585,7 +731,8 @@ class T5LayerSelfAttention(nn.Module): attention_mask=None, position_bias=None, 
layer_head_mask=None, @@ -311,7 +304,7 @@ index 224769fdf..1c2d8d185 100644 use_cache=False, output_attentions=False, ): -@@ -595,7 +747,8 @@ class T5LayerSelfAttention(nn.Module): +@@ -595,7 +742,8 @@ class T5LayerSelfAttention(nn.Module): mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, @@ -321,7 +314,7 @@ index 224769fdf..1c2d8d185 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -618,7 +771,8 @@ class T5LayerCrossAttention(nn.Module): +@@ -618,7 +766,8 @@ class T5LayerCrossAttention(nn.Module): attention_mask=None, position_bias=None, layer_head_mask=None, @@ -331,7 +324,7 @@ index 224769fdf..1c2d8d185 100644 use_cache=False, query_length=None, output_attentions=False, -@@ -630,7 +784,8 @@ class T5LayerCrossAttention(nn.Module): +@@ -630,7 +779,8 @@ class T5LayerCrossAttention(nn.Module): key_value_states=key_value_states, position_bias=position_bias, layer_head_mask=layer_head_mask, @@ -341,7 +334,7 @@ index 224769fdf..1c2d8d185 100644 use_cache=use_cache, query_length=query_length, output_attentions=output_attentions, -@@ -661,39 +816,34 @@ class T5Block(nn.Module): +@@ -661,39 +811,34 @@ class T5Block(nn.Module): encoder_decoder_position_bias=None, layer_head_mask=None, cross_attn_layer_head_mask=None, @@ -395,7 +388,7 @@ index 224769fdf..1c2d8d185 100644 # clamp inf values to enable fp16 training if hidden_states.dtype == torch.float16: -@@ -706,22 +856,23 @@ class T5Block(nn.Module): +@@ -706,22 +851,23 @@ class T5Block(nn.Module): do_cross_attention = self.is_decoder and encoder_hidden_states is not None if do_cross_attention: @@ -424,7 +417,7 @@ index 224769fdf..1c2d8d185 100644 output_attentions=output_attentions, ) hidden_states = cross_attention_outputs[0] -@@ -736,11 +887,9 @@ class T5Block(nn.Module): +@@ -736,11 +882,9 @@ class T5Block(nn.Module): hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) # Combine self attn and cross attn key value states @@ -438,7 +431,7 @@ index 224769fdf..1c2d8d185 100644 # Apply Feed Forward layer hidden_states = self.layer[-1](hidden_states) -@@ -757,7 +906,7 @@ class T5Block(nn.Module): +@@ -757,7 +901,7 @@ class T5Block(nn.Module): outputs = (hidden_states,) if use_cache: @@ -447,7 +440,7 @@ index 224769fdf..1c2d8d185 100644 else: outputs = outputs + attention_outputs -@@ -897,11 +1046,15 @@ class T5PreTrainedModel(PreTrainedModel): +@@ -897,11 +1041,15 @@ class T5PreTrainedModel(PreTrainedModel): class T5Stack(T5PreTrainedModel): @@ -464,7 +457,7 @@ index 224769fdf..1c2d8d185 100644 self.block = nn.ModuleList( [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -966,20 +1119,63 @@ class T5Stack(T5PreTrainedModel): +@@ -966,20 +1114,63 @@ class T5Stack(T5PreTrainedModel): def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings @@ -530,7 +523,7 @@ index 224769fdf..1c2d8d185 100644 ): # Model parallel if self.model_parallel: -@@ -998,8 +1194,10 @@ class T5Stack(T5PreTrainedModel): +@@ -998,8 +1189,10 @@ class T5Stack(T5PreTrainedModel): f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: @@ -541,7 +534,7 @@ index 224769fdf..1c2d8d185 100644 elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: -@@ -1012,18 +1210,19 @@ class T5Stack(T5PreTrainedModel): +@@ -1012,18 +1205,19 @@ class T5Stack(T5PreTrainedModel): inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = 
input_shape @@ -566,7 +559,7 @@ index 224769fdf..1c2d8d185 100644 if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) -@@ -1054,7 +1253,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1054,7 +1248,8 @@ class T5Stack(T5PreTrainedModel): # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) @@ -576,7 +569,7 @@ index 224769fdf..1c2d8d185 100644 all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1062,8 +1262,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1062,8 +1257,8 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) @@ -587,7 +580,7 @@ index 224769fdf..1c2d8d185 100644 layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] # Model parallel -@@ -1112,7 +1312,10 @@ class T5Stack(T5PreTrainedModel): +@@ -1112,7 +1307,10 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, @@ -599,7 +592,7 @@ index 224769fdf..1c2d8d185 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -1120,19 +1323,20 @@ class T5Stack(T5PreTrainedModel): +@@ -1120,19 +1318,20 @@ class T5Stack(T5PreTrainedModel): # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: @@ -625,7 +618,7 @@ index 224769fdf..1c2d8d185 100644 if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) -@@ -1146,7 +1350,7 @@ class T5Stack(T5PreTrainedModel): +@@ -1146,7 +1345,7 @@ class T5Stack(T5PreTrainedModel): hidden_states = hidden_states.to("cuda:" + str(k + 1)) hidden_states = self.final_layer_norm(hidden_states) @@ -634,7 +627,7 @@ index 224769fdf..1c2d8d185 100644 # Add last layer if output_hidden_states: -@@ -1164,13 +1368,216 @@ class T5Stack(T5PreTrainedModel): +@@ -1164,13 +1363,216 @@ class T5Stack(T5PreTrainedModel): ] if v is not None ) @@ -857,7 +850,7 @@ index 224769fdf..1c2d8d185 100644 T5_START_DOCSTRING = r""" -@@ -1541,6 +1948,38 @@ class T5Model(T5PreTrainedModel): +@@ -1541,6 +1943,38 @@ class T5Model(T5PreTrainedModel): ) @@ -896,7 +889,7 @@ index 224769fdf..1c2d8d185 100644 @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) class T5ForConditionalGeneration(T5PreTrainedModel): _keys_to_ignore_on_load_unexpected = [ -@@ -1548,28 +1987,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1548,28 +1982,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] @@ -967,7 +960,7 @@ index 224769fdf..1c2d8d185 100644 # Model parallel self.model_parallel = False -@@ -1637,25 +2099,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1637,25 +2094,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) @@ -994,7 +987,7 @@ index 224769fdf..1c2d8d185 100644 r""" labels (`torch.LongTensor` of 
shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., -@@ -1687,113 +2131,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1687,113 +2126,40 @@ class T5ForConditionalGeneration(T5PreTrainedModel): >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) >>> # studies have shown that owning a dog is good for you. ```""" @@ -1077,6 +1070,9 @@ index 224769fdf..1c2d8d185 100644 - - lm_logits = self.lm_head(sequence_output) + if self.is_mindie: ++ print("aaaaaaaaaaaaaaaa") ++ # import pdb ++ # pdb.set_trace() + with torch.npu.stream(self.stream): # set stream + decoder_outputs = self.decoder_mindie.forward(*args) + self.stream.synchronize() # synchronize @@ -1133,7 +1129,7 @@ index 224769fdf..1c2d8d185 100644 attention_mask=None, head_mask=None, decoder_head_mask=None, -@@ -1804,8 +2172,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1804,8 +2170,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): **kwargs, ): # cut decoder_input_ids if past_key_values is used @@ -1144,7 +1140,7 @@ index 224769fdf..1c2d8d185 100644 # Some generation methods already pass only the last input ID if input_ids.shape[1] > past_length: -@@ -1813,12 +2181,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1813,12 +2179,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): else: # Default to old behavior: keep only final ID remove_prefix_length = input_ids.shape[1] - 1 @@ -1161,7 +1157,7 @@ index 224769fdf..1c2d8d185 100644 "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "head_mask": head_mask, -@@ -1826,6 +2196,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1826,6 +2194,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): "decoder_attention_mask": decoder_attention_mask, "cross_attn_head_mask": cross_attn_head_mask, "use_cache": use_cache, @@ -1169,7 +1165,7 @@ index 224769fdf..1c2d8d185 100644 } def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): -@@ -1861,6 +2232,460 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1861,6 +2230,459 @@ class T5ForConditionalGeneration(T5PreTrainedModel): reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) return reordered_decoder_past @@ -1191,11 +1187,12 @@ index 224769fdf..1c2d8d185 100644 + model_input_name = model_input_name if model_input_name is not None else self.main_input_name + encoder_kwargs["return_dict"] = True + encoder_kwargs[model_input_name] = inputs_tensor -+ import time -+ start_time = time.time() -+ with torch.npu.stream(self.stream): # set stream -+ encoder_outputs=self.encoder_mindie.forward(encoder_kwargs["input_ids"]) -+ self.stream.synchronize() # synchronize ++ if self.is_mindie: ++ with torch.npu.stream(self.stream): # set stream ++ encoder_outputs=self.encoder_mindie.forward(encoder_kwargs["input_ids"]) ++ self.stream.synchronize() # synchronize ++ else: ++ encoder_outputs=self.encoder.forward(**encoder_kwargs) + model_kwargs["encoder_outputs"]={"last_hidden_state":encoder_outputs[0]} + model_kwargs["past_cross_keys"] = encoder_outputs[1] + model_kwargs["past_cross_values"] =encoder_outputs[2] @@ -1243,8 +1240,6 @@ index 224769fdf..1c2d8d185 100644 + **kwargs, + ): + # 1. 
Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call -+ import time -+ start_time = time.time() + self._validate_model_class() + tokenizer = kwargs.pop("tokenizer", None) # Pull this out first, we only use it for stopping criteria + generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs) @@ -1630,7 +1625,7 @@ index 224769fdf..1c2d8d185 100644 @add_start_docstrings( "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", -@@ -1967,7 +2792,6 @@ class T5EncoderModel(T5PreTrainedModel): +@@ -1967,7 +2789,6 @@ class T5EncoderModel(T5PreTrainedModel): >>> last_hidden_states = outputs.last_hidden_state ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict -- Gitee From 82ae72a9cf69f468af23edacd67b1c699dcb57ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 23 Sep 2024 11:14:39 +0000 Subject: [PATCH 080/110] update MindIE/MindIE-Torch/built-in/T5/main.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/main.py b/MindIE/MindIE-Torch/built-in/T5/main.py index ccad949c44..28d85df24a 100644 --- a/MindIE/MindIE-Torch/built-in/T5/main.py +++ b/MindIE/MindIE-Torch/built-in/T5/main.py @@ -1,4 +1,6 @@ import torch +import torch_npu +import mindietorch import time import argparse from transformers import T5ForConditionalGeneration, AutoTokenizer, T5Config -- Gitee From 1eb296aea57aec90db68e42e18f046a7c2396470 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 23 Sep 2024 12:26:03 +0000 Subject: [PATCH 081/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../MindIE-Torch/built-in/T5/modeling_t5.patch | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch index 74bda2bb86..26b0ce5e87 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch @@ -1,5 +1,5 @@ diff --git a/modeling_t5_origin.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py -index 224769fdf..cfa27e8c6 100644 +index 224769fdf..65c058e6e 100644 --- a/modeling_t5_origin.py +++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py @@ -19,7 +19,7 @@ import math @@ -987,7 +987,7 @@ index 224769fdf..cfa27e8c6 100644 r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., -@@ -1687,113 +2126,40 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1687,113 +2126,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) >>> # studies have shown that owning a dog is good for you. 
```""" @@ -1070,9 +1070,6 @@ index 224769fdf..cfa27e8c6 100644 - - lm_logits = self.lm_head(sequence_output) + if self.is_mindie: -+ print("aaaaaaaaaaaaaaaa") -+ # import pdb -+ # pdb.set_trace() + with torch.npu.stream(self.stream): # set stream + decoder_outputs = self.decoder_mindie.forward(*args) + self.stream.synchronize() # synchronize @@ -1129,7 +1126,7 @@ index 224769fdf..cfa27e8c6 100644 attention_mask=None, head_mask=None, decoder_head_mask=None, -@@ -1804,8 +2170,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1804,8 +2167,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): **kwargs, ): # cut decoder_input_ids if past_key_values is used @@ -1140,7 +1137,7 @@ index 224769fdf..cfa27e8c6 100644 # Some generation methods already pass only the last input ID if input_ids.shape[1] > past_length: -@@ -1813,12 +2179,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1813,12 +2176,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): else: # Default to old behavior: keep only final ID remove_prefix_length = input_ids.shape[1] - 1 @@ -1157,7 +1154,7 @@ index 224769fdf..cfa27e8c6 100644 "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "head_mask": head_mask, -@@ -1826,6 +2194,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1826,6 +2191,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): "decoder_attention_mask": decoder_attention_mask, "cross_attn_head_mask": cross_attn_head_mask, "use_cache": use_cache, @@ -1165,7 +1162,7 @@ index 224769fdf..cfa27e8c6 100644 } def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): -@@ -1861,6 +2230,459 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1861,6 +2227,459 @@ class T5ForConditionalGeneration(T5PreTrainedModel): reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) return reordered_decoder_past @@ -1625,7 +1622,7 @@ index 224769fdf..cfa27e8c6 100644 @add_start_docstrings( "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", -@@ -1967,7 +2789,6 @@ class T5EncoderModel(T5PreTrainedModel): +@@ -1967,7 +2786,6 @@ class T5EncoderModel(T5PreTrainedModel): >>> last_hidden_states = outputs.last_hidden_state ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict -- Gitee From 25e9d1ce45045cef7a624a136174002cb532097e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 03:31:32 +0000 Subject: [PATCH 082/110] add MindIE/MindIE-Torch/built-in/T5/export_t5_800IA2.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/export_t5_800IA2.py | 202 ++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/export_t5_800IA2.py diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5_800IA2.py b/MindIE/MindIE-Torch/built-in/T5/export_t5_800IA2.py new file mode 100644 index 0000000000..e150e8e93a --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/export_t5_800IA2.py @@ -0,0 +1,202 @@ + +import torch +import torch_npu +import argparse +import os +import math +import mindietorch +from transformers import T5ForConditionalGeneration + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--output_dir", + type=str, + default="./models", + help="save dir" + ) + parser.add_argument( + "--model_path", + type=str, + default="./T5-Small", + help="T5 model path" + ) + parser.add_argument( + "--max_batchsize", + type=int, + default=1, + help="max batchsize when running" + ) + + parser.add_argument( + "--max_input_seq_len", + type=int, + default=256, + help="max input_sequence length when running" + ) + + + parser.add_argument( + "--device_id", + type=int, + default=0, + help="npu device id" + ) + return parser.parse_args() + + +class TextEncoderExport(torch.nn.Module): + def __init__(self, textencoder_model): + super(TextEncoderExport, self).__init__() + self.textencoder_model = textencoder_model + + def forward(self, input_ids,attention_mask): + return self.textencoder_model(input_ids=input_ids,attention_mask=attention_mask) + +class TextDecoderExport(torch.nn.Module): + def __init__(self, textdecoder_model): + super(TextDecoderExport, self).__init__() + self.textdecoder_model = textdecoder_model + + def forward(self, + *args): + return self.textdecoder_model(*args) + +def export_textencoder(args, model, save_dir, batch_size): + encoder_path = os.path.join(save_dir, "encoder") + if not os.path.exists(encoder_path): + os.makedirs(encoder_path, mode=0o640) + traced_path = os.path.join(encoder_path, "encoder.pt") + compiled_path = os.path.join(encoder_path, "encoder_compiled.pt") + if not os.path.exists(traced_path): + text_encoder = model.encoder + dummy_input = ( + torch.ones([1, 128], dtype=torch.int64).npu(), + torch.ones([1, 1,128,128], dtype=torch.bool).npu() + ) + encoder = TextEncoderExport(text_encoder) + encoder.eval() + torch.jit.trace(encoder, dummy_input, strict=False).save(traced_path) + if not os.path.exists(compiled_path): + traced_model = torch.jit.load(traced_path).eval() + + inputs0 = [] + inputs0.append(mindietorch.Input(min_shape = (1,1), max_shape= (args.max_batchsize, args.max_input_seq_len), dtype=torch.int64)) + inputs0.append(mindietorch.Input(min_shape = (1,1,1,1), max_shape= (args.max_batchsize, 1,args.max_input_seq_len,args.max_input_seq_len), dtype=torch.bool)) + print("compiling encoder") + compiled_model = mindietorch.compile( + traced_model, + inputs=inputs0, + allow_tensor_replace_int=True, + require_full_compilation=False, + truncate_long_and_double=True, + precision_policy=mindietorch.PrecisionPolicy.FP16, + soc_version="Ascend910B4", + optimization_level=0 + ) + compiled_model.save(compiled_path) + +def export_textdecoder(args, model, save_dir, batch_size): + decoder_path = os.path.join(save_dir, "decoder") + if not os.path.exists(decoder_path): + os.makedirs(decoder_path, mode=0o640) + traced_path = os.path.join(decoder_path, "decoder.pt") + compiled_path = os.path.join(decoder_path, 
"decoder_compiled.pt") + model_path = args.model_path + max_lenth = 120 + if not os.path.exists(traced_path): + text_decoder = model + all_past_keys = [torch.randn([1, 1, model.config.d_kv*model.config.num_heads]).to(torch.float16).npu()] * model.config.num_layers + all_past_values = [torch.randn([1, 1, model.config.d_kv*model.config.num_heads]).to(torch.float16).npu()] * model.config.num_layers + all_past_cross_keys = [torch.randn([1, 16, model.config.d_kv*model.config.num_heads]).to(torch.float16).npu()] * model.config.num_layers + all_past_cross_values = [torch.randn([1, 16, model.config.d_kv*model.config.num_heads]).to(torch.float16).npu()] * model.config.num_layers + dummy_input = [torch.randn(1, 16, model.config.d_kv*model.config.num_heads).to(torch.float16).npu()] + dummy_input.extend(all_past_cross_keys) + dummy_input.extend(all_past_cross_values) + dummy_input.extend(all_past_keys) + dummy_input.extend(all_past_values) + # encoder_attention_mask + dummy_input.append(torch.ones((1,1,16,16),dtype=torch.bool).npu()) + # decoder_input_ids + dummy_input.append(torch.ones([1, 1], dtype=torch.int64).npu()) + dummy_input.append(torch.ones([1, 1, 1, 1], dtype=torch.bool).npu()) + # decoder_attention_mask + decoder = TextDecoderExport(text_decoder).npu() + decoder.eval() + torch.jit.trace(decoder, dummy_input,strict=False).save(traced_path) + if not os.path.exists(compiled_path): + traced_model = torch.jit.load(traced_path).eval() + print("compiling decoder") + input_info = [mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), + dtype=mindietorch.dtype.FLOAT16)] + past_cross_key_infos = [mindietorch.Input(min_shape =(1, 1, model.config.num_heads*model.config.d_kv), + max_shape=(args.max_batchsize,args.max_input_seq_len, model.config.num_heads*model.config.d_kv), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_cross_value_infos = [mindietorch.Input(min_shape =(1, 1, model.config.d_kv*model.config.num_heads), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_kv*model.config.num_heads), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_key_infos = [mindietorch.Input(min_shape =(1, 0, model.config.d_kv*model.config.num_heads), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_kv*model.config.num_heads), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_value_infos = [mindietorch.Input(min_shape =(1, 0, model.config.d_kv*model.config.num_heads), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_kv*model.config.num_heads), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + decoder_input_ids_info = [mindietorch.Input(min_shape =(1, 1), + max_shape = (args.max_batchsize,1), + dtype=mindietorch.dtype.INT64)] + encoder_attention_mask_info = [mindietorch.Input(min_shape =(1, 1,1, 1), + max_shape = (args.max_batchsize, 1, args.max_input_seq_len,args.max_input_seq_len), + dtype=mindietorch.dtype.BOOL)] + decoder_attention_mask_info = [mindietorch.Input(min_shape =(1, 1,1,1), + max_shape = (args.max_batchsize,1,args.max_input_seq_len,args.max_input_seq_len), + dtype=mindietorch.dtype.BOOL)] + input_info.extend(past_cross_key_infos) + input_info.extend(past_cross_value_infos) + input_info.extend(past_key_infos) + input_info.extend(past_value_infos) + input_info.extend(encoder_attention_mask_info) + input_info.extend(decoder_input_ids_info) + 
input_info.extend(decoder_attention_mask_info) + buffer = [] + for _ in range(2*model.config.num_layers): + buffer.append(math.ceil((args.max_batchsize * args.max_input_seq_len * model.config.d_model * 2) / 1024 / 1024)) + buffer_size0 = math.ceil((args.max_batchsize * 1 * model.config.vocab_size * 4) / 1024 / 1024) + buffer.append(buffer_size0) + print("buffer=",buffer) + compiled_model = mindietorch.compile( + traced_model, + inputs=input_info, + allow_tensor_replace_int=True, + require_full_compilation=False, + truncate_long_and_double=True, + precision_policy=mindietorch.PrecisionPolicy.FP16, + soc_version="Ascend910B4", + default_buffer_size_vec=buffer, + optimization_level=0 + ) + compiled_model.save(compiled_path) + + +def main(): + args = parse_arguments() + device_id = args.device_id + save_dir = args.output_dir + torch.npu.set_device(device_id) + batch_size = 1 + model = T5ForConditionalGeneration.from_pretrained(args.model_path, torch_dtype=torch.float).npu() + encoder_path = os.path.join(save_dir, "encoder") + compiled_path = os.path.join(encoder_path, "encoder_compiled.pt") + if not os.path.exists(compiled_path): + export_textencoder(args, model, save_dir, batch_size) + print("export encoder_model done!") + + decoder_path = os.path.join(save_dir, "decoder") + compiled_path = os.path.join(decoder_path, "decoder_compiled.pt") + if not os.path.exists(compiled_path): + export_textdecoder(args, model, save_dir, batch_size) + print("export decoder_model done!") + + +if __name__ == "__main__": + main() -- Gitee From c00de3f5a57bb3d2d02b2d6b2043370d20863cab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 03:32:36 +0000 Subject: [PATCH 083/110] update MindIE/MindIE-Torch/built-in/T5/export_t5.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/export_t5.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5.py b/MindIE/MindIE-Torch/built-in/T5/export_t5.py index e152265ae9..995274d0bd 100644 --- a/MindIE/MindIE-Torch/built-in/T5/export_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/export_t5.py @@ -89,7 +89,7 @@ def export_textencoder(args, model, save_dir, batch_size): require_full_compilation=False, truncate_long_and_double=True, precision_policy=mindietorch.PrecisionPolicy.FP16, - soc_version="Ascend910B4", + soc_version="Ascend310P3", optimization_level=0 ) compiled_model.save(compiled_path) @@ -161,7 +161,7 @@ def export_textdecoder(args, model, save_dir, batch_size): require_full_compilation=False, truncate_long_and_double=True, precision_policy=mindietorch.PrecisionPolicy.FP16, - soc_version="Ascend910B4", + soc_version="Ascend310P3", default_buffer_size_vec=buffer, optimization_level=0 ) -- Gitee From fc525501a51d52301798d67e0d4f9a81b133ee43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 06:03:48 +0000 Subject: [PATCH 084/110] add MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 538d828573d04f265b703086cc221c0f08988814 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 06:04:12 +0000 Subject: [PATCH 085/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_t5_800IA2.patch | 1594 +++++++++++++++++ 1 file changed, 1594 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch index e69de29bb2..664b4359ce 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch @@ -0,0 +1,1594 @@ +diff --git a/modeling_t5_origin.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py +index 224769fdf..4f9ffd74f 100644 +--- a/modeling_t5_origin.py ++++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py +@@ -19,22 +19,26 @@ import math + import os + import warnings + from typing import List, Optional, Tuple, Union +- ++from dataclasses import dataclass + import torch + from torch import nn + from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss ++# import torch_npu ++# import mindietorch ++ ++ ++ + + from ...activations import ACT2FN + from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, +- Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, + TokenClassifierOutput, + ) +-from ...modeling_utils import PreTrainedModel ++from ...modeling_utils import PreTrainedModel,ModuleUtilsMixin + from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer + from ...utils import ( + DUMMY_INPUTS, +@@ -47,7 +51,43 @@ from ...utils import ( + ) + from ...utils.model_parallel_utils import assert_device_map, get_device_map + from .configuration_t5 import T5Config ++from transformers.generation.logits_process import LogitsProcessorList ++from transformers.generation.stopping_criteria import StoppingCriteriaList ++from transformers.generation.configuration_utils import GenerationMode ++from transformers.utils.generic import ModelOutput ++ ++ ++@dataclass ++class Seq2SeqLMOutput(ModelOutput): ++ """ ++ Base class for model's outputs, with potential hidden states and attentions. + ++ Args: ++ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): ++ Sequence of hidden-states at the output of the last layer of the model. ++ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): ++ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + ++ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. 
++ ++ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. ++ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): ++ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, ++ sequence_length)`. ++ ++ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention ++ heads. ++ """ ++ loss: Optional[torch.FloatTensor] = None ++ logits: torch.FloatTensor = None ++ past_keys: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ past_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None ++ decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None ++ cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None ++ encoder_last_hidden_state: Optional[torch.FloatTensor] = None ++ encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None ++ encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + logger = logging.get_logger(__name__) + +@@ -360,6 +400,7 @@ class T5Attention(nn.Module): + self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) + self.pruned_heads = set() + self.gradient_checkpointing = False ++ self.lay_out = "BSH" + + def prune_heads(self, heads): + if len(heads) == 0: +@@ -448,7 +489,10 @@ class T5Attention(nn.Module): + mask=None, + key_value_states=None, + position_bias=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, ++ past_cross_key=None, ++ past_cross_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, +@@ -464,81 +508,86 @@ class T5Attention(nn.Module): + + real_seq_length = seq_length + +- if past_key_value is not None: +- if len(past_key_value) != 2: +- raise ValueError( +- f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" +- ) +- real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length ++ if past_key is not None: ++ real_seq_length += past_key.shape[1] if query_length is None else query_length + + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] ++ # BSH ++ query_states = self.q(hidden_states) ++ key_states = past_key ++ value_states = past_value ++ attn_output = torch.ops.aie.flash_attention(query_states,key_states,value_states,self.n_heads,attn_mask=mask) ++ # mask = mask.expand(3,1,16,mask.shape[3]).bool() ++ # attn_output = torch_npu.npu_prompt_flash_attention(query_states,key_states,value_states,atten_mask=mask,num_heads=self.n_heads,input_layout="BSH") ++ attn_output = self.o(attn_output) ++ present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None ++ present_value_state = (value_states.half(),) if (self.is_decoder and use_cache) else None ++ outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) ++ return outputs ++ ++ ++class T5SelfAttention(T5Attention): ++ def __init__(self, config: T5Config, has_relative_attention_bias=False): ++ super().__init__(config, has_relative_attention_bias) + +- def shape(states): +- """projection""" +- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) ++ def forward( ++ self, ++ hidden_states, ++ mask=None, ++ position_bias=None, ++ past_key=None, ++ past_value=None, ++ layer_head_mask=None, ++ use_cache=False, ++ output_attentions=False, ++ ): ++ """ ++ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). ++ """ ++ # Input is (batch_size, seq_length, dim) ++ # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) ++ # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) ++ batch_size, seq_length = hidden_states.shape[:2] + +- def unshape(states): +- """reshape""" +- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) ++ real_seq_length = seq_length ++ ++ if past_key is not None: ++ real_seq_length += past_key.shape[1] ++ key_length = real_seq_length + +- def project(hidden_states, proj_layer, key_value_states, past_key_value): ++ def project(hidden_states, proj_layer, past_key_value): + """projects hidden states correctly to key/query states""" +- if key_value_states is None: +- # self-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(hidden_states)) +- elif past_key_value is None: +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(key_value_states)) ++ if past_key_value is None: ++ hidden_states = proj_layer(hidden_states) + + if past_key_value is not None: +- if key_value_states is None: +- # self-attn +- # (batch_size, n_heads, key_length, dim_per_head) +- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) +- elif past_key_value.shape[2] != key_value_states.shape[1]: +- # checking that the `sequence_length` of the `past_key_value` is the same as +- # the provided `key_value_states` to support prefix tuning +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(key_value_states)) +- else: +- # cross-attn +- hidden_states = past_key_value ++ hidden_states = proj_layer(hidden_states) ++ hidden_states = torch.cat([past_key_value, hidden_states], dim=1) + return 
hidden_states + + # get query states +- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) +- ++ query_states = self.q(hidden_states) + # get key/value states + key_states = project( +- hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None ++ hidden_states, self.k, past_key if past_key is not None else None + ) + value_states = project( +- hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None ++ hidden_states, self.v, past_value if past_value is not None else None + ) +- +- # compute scores +- scores = torch.matmul( +- query_states, key_states.transpose(3, 2) +- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 +- + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = torch.zeros( +- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype ++ (1, self.n_heads, real_seq_length, key_length), device=query_states.device, dtype=query_states.dtype + ) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: +- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) ++ position_bias = self.compute_bias(real_seq_length, key_length, device=query_states.device) + + # if key and values are already calculated + # we want only the last query position bias +- if past_key_value is not None: ++ if past_key is not None: + position_bias = position_bias[:, :, -hidden_states.size(1) :, :] +- + if mask is not None: + position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + +@@ -548,34 +597,26 @@ class T5Attention(nn.Module): + position_bias_masked = position_bias[:, mask.bool()] + else: + position_bias_masked = position_bias +- +- scores += position_bias_masked +- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( +- scores +- ) # (batch_size, n_heads, seq_length, key_length) +- attn_weights = nn.functional.dropout( +- attn_weights, p=self.dropout, training=self.training +- ) # (batch_size, n_heads, seq_length, key_length) +- +- # Mask heads if we want to +- if layer_head_mask is not None: +- attn_weights = attn_weights * layer_head_mask +- +- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) ++ # scores += position_bias_masked ++ # attn_output = torch.ops.aie.flash_attention(query_states,key_states,value_states,self.n_heads,pse=position_bias_masked) ++ attn_output = torch.ops.aie.flash_attention(query_states,key_states,value_states,self.n_heads,pse=position_bias_masked,attn_mask=mask) ++ # print("mask=",mask,mask.shape) ++ # mask = mask.expand(3,1,16,mask.shape[3]).bool() ++ # attn_output = torch_npu.npu_prompt_flash_attention(query_states,key_states,value_states,pse_shift=position_bias_masked, atten_mask=mask,num_heads=self.n_heads,input_layout="BSH") + attn_output = self.o(attn_output) ++ # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None ++ present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None ++ present_value_state = (value_states.half(), ) if (self.is_decoder and use_cache) else None ++ outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) ++ return outputs + +- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None +- outputs = 
(attn_output,) + (present_key_value_state,) + (position_bias,) + +- if output_attentions: +- outputs = outputs + (attn_weights,) +- return outputs + + + class T5LayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() +- self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) ++ self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + +@@ -585,7 +626,8 @@ class T5LayerSelfAttention(nn.Module): + attention_mask=None, + position_bias=None, + layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, + use_cache=False, + output_attentions=False, + ): +@@ -595,7 +637,8 @@ class T5LayerSelfAttention(nn.Module): + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -618,7 +661,8 @@ class T5LayerCrossAttention(nn.Module): + attention_mask=None, + position_bias=None, + layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, + use_cache=False, + query_length=None, + output_attentions=False, +@@ -630,7 +674,8 @@ class T5LayerCrossAttention(nn.Module): + key_value_states=key_value_states, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, + use_cache=use_cache, + query_length=query_length, + output_attentions=output_attentions, +@@ -661,39 +706,34 @@ class T5Block(nn.Module): + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, ++ past_cross_key=None, ++ past_cross_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): +- if past_key_value is not None: +- if not self.is_decoder: +- logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") +- expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 +- +- if len(past_key_value) != expected_num_past_key_values: +- raise ValueError( +- f"There should be {expected_num_past_key_values} past states. " +- f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" +- f"Got {len(past_key_value)} past key / value states" +- ) +- +- self_attn_past_key_value = past_key_value[:2] +- cross_attn_past_key_value = past_key_value[2:] ++ if past_key is not None: ++ self_attn_past_key = past_key ++ self_attn_past_value = past_value ++ cross_attn_past_key = past_cross_key ++ cross_attn_past_value = past_cross_value + else: +- self_attn_past_key_value, cross_attn_past_key_value = None, None ++ self_attn_past_key, self_attn_past_value, cross_attn_past_key, cross_attn_past_value = None, None, None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=self_attn_past_key_value, ++ past_key=self_attn_past_key, ++ past_value=self_attn_past_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +- hidden_states, present_key_value_state = self_attention_outputs[:2] +- attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights ++ hidden_states, present_key_state, present_value_state = self_attention_outputs[:3] ++ attention_outputs = self_attention_outputs[3:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: +@@ -706,22 +746,23 @@ class T5Block(nn.Module): + + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: ++ + # the actual query length is unknown for cross attention + # if using past key value states. Need to inject it here +- if present_key_value_state is not None: +- query_length = present_key_value_state[0].shape[2] ++ if present_key_state is not None: ++ query_length = present_key_state[0].shape[1] + else: + query_length = None +- + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, +- past_key_value=cross_attn_past_key_value, ++ past_key=cross_attn_past_key, ++ past_value=cross_attn_past_value, + query_length=query_length, +- use_cache=use_cache, ++ use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] +@@ -736,11 +777,9 @@ class T5Block(nn.Module): + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states +- if present_key_value_state is not None: +- present_key_value_state = present_key_value_state + cross_attention_outputs[1] +- ++ # cross_attn_past_key_values = cross_attention_outputs[1] + # Keep cross-attention outputs and relative position weights +- attention_outputs = attention_outputs + cross_attention_outputs[2:] ++ attention_outputs = attention_outputs + cross_attention_outputs[3:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) +@@ -757,7 +796,7 @@ class T5Block(nn.Module): + outputs = (hidden_states,) + + if use_cache: +- outputs = outputs + (present_key_value_state,) + attention_outputs ++ outputs = outputs + (present_key_state,) +(present_value_state,)+ attention_outputs + else: + outputs = outputs + attention_outputs + +@@ -897,11 +936,15 @@ class T5PreTrainedModel(PreTrainedModel): + + + class T5Stack(T5PreTrainedModel): +- def __init__(self, config, embed_tokens=None): ++ def __init__(self, config, 
embed_tokens=None,lm_head=None, encodecrosskey=None, encodecrossvalue=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder ++ self.lm_head=lm_head ++ self.encodecrosskey = encodecrosskey ++ self.encodecrossvalue = encodecrossvalue ++ self.model_dim = config.d_model + + self.block = nn.ModuleList( + [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] +@@ -966,16 +1009,48 @@ class T5Stack(T5PreTrainedModel): + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + ++ ++ def get_extended_attention_mask( ++ self, attention_mask, input_shape, device=None, dtype=None ++ ): ++ if dtype is None: ++ dtype = self.dtype ++ ++ if not (attention_mask.dim() == 2 and self.config.is_decoder): ++ if device is not None: ++ warnings.warn( ++ "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning ++ ) ++ if attention_mask.dim() == 3: ++ extended_attention_mask = attention_mask[:, None, :, :] ++ elif attention_mask.dim() == 2: ++ if self.config.is_decoder: ++ extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( ++ input_shape, attention_mask, device ++ ) ++ else: ++ extended_attention_mask = attention_mask[:, None, None, :] ++ else: ++ raise ValueError( ++ f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ++ ) ++ extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility ++ extended_attention_mask = (1.0 - extended_attention_mask) * -1000 ++ return extended_attention_mask ++ + def forward( + self, + input_ids=None, +- attention_mask=None, + encoder_hidden_states=None, ++ past_keys=None, ++ past_values=None, ++ past_cross_keys=None, ++ past_cross_values=None, + encoder_attention_mask=None, ++ attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, +- past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, +@@ -998,8 +1073,10 @@ class T5Stack(T5PreTrainedModel): + f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" + ) + elif input_ids is not None: ++ + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) ++ input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: +@@ -1012,25 +1089,29 @@ class T5Stack(T5PreTrainedModel): + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape +- + # required mask seq length can be calculated via length of past +- mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length ++ mask_seq_length = past_keys[0].shape[1] + seq_length if past_keys is not None else seq_length + + if use_cache is True: + if not self.is_decoder: + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") + + # initialize past_key_values with `None` if past does not exist +- if past_key_values is None: +- past_key_values = [None] * len(self.block) +- ++ if not self.is_decoder: ++ past_keys = [None] * len(self.block) ++ past_values = [None] * len(self.block) ++ past_cross_keys = [None] * len(self.block) ++ past_cross_values = [None] * len(self.block) + if attention_mask is None: +- attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) ++ print("aaaaaaaaaaaaaaaaa") ++ 
attention_mask = torch.zeros(batch_size, mask_seq_length, device=inputs_embeds.device) ++ attention_mask = attention_mask[:,None,None,:].expand(batch_size,1,mask_seq_length,mask_seq_length).bool() + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. +- extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) +- ++ # extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) ++ extended_attention_mask = attention_mask ++ # print("extended_attention_mask=",extended_attention_mask) + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.is_decoder and encoder_hidden_states is not None: +@@ -1040,7 +1121,7 @@ class T5Stack(T5PreTrainedModel): + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long + ) +- encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) ++ encoder_extended_attention_mask = encoder_attention_mask + else: + encoder_extended_attention_mask = None + +@@ -1054,7 +1135,8 @@ class T5Stack(T5PreTrainedModel): + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) +- present_key_value_states = () if use_cache else None ++ present_key_states = () if use_cache else None ++ present_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None +@@ -1062,8 +1144,8 @@ class T5Stack(T5PreTrainedModel): + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) +- +- for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): ++ # for i, layer_module in enumerate(self.block): ++ for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel +@@ -1112,7 +1194,10 @@ class T5Stack(T5PreTrainedModel): + encoder_decoder_position_bias=encoder_decoder_position_bias, + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, ++ past_cross_key=past_cross_key, ++ past_cross_value=past_cross_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -1120,19 +1205,20 @@ class T5Stack(T5PreTrainedModel): + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: +- layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] ++ layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] + +- hidden_states, present_key_value_state = layer_outputs[:2] ++ hidden_states, present_key_state, present_value_state = layer_outputs[:3] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states 
(self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) +- position_bias = layer_outputs[2] ++ position_bias = layer_outputs[3] + if self.is_decoder and encoder_hidden_states is not None: +- encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] ++ encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] + # append next layer key value states + if use_cache: +- present_key_value_states = present_key_value_states + (present_key_value_state,) ++ present_key_states = present_key_states + present_key_state ++ present_value_states = present_value_states + present_value_state + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) +@@ -1146,31 +1232,158 @@ class T5Stack(T5PreTrainedModel): + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) +- hidden_states = self.dropout(hidden_states) ++ hidden_states = self.dropout(hidden_states).half() + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) ++ if self.config.tie_word_embeddings: ++ hidden_states = hidden_states * (self.model_dim ** -0.5) ++ lm_logits = self.lm_head(hidden_states) ++ return tuple((lm_logits, present_key_states, present_value_states)) + +- if not return_dict: +- return tuple( +- v +- for v in [ +- hidden_states, +- present_key_value_states, +- all_hidden_states, +- all_attentions, +- all_cross_attentions, +- ] +- if v is not None +- ) +- return BaseModelOutputWithPastAndCrossAttentions( +- last_hidden_state=hidden_states, +- past_key_values=present_key_value_states, +- hidden_states=all_hidden_states, +- attentions=all_attentions, +- cross_attentions=all_cross_attentions, ++ ++class T5Stack_Encoder(T5PreTrainedModel): ++ def __init__(self, config, embed_tokens=None, encodecrosskey=None, encodecrossvalue=None): ++ super().__init__(config) ++ self.embed_tokens = embed_tokens ++ self.is_decoder = config.is_decoder ++ self.encodecrosskey = encodecrosskey ++ self.encodecrossvalue = encodecrossvalue ++ self.model_dim = config.d_model ++ ++ self.block = nn.ModuleList( ++ [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] ++ ) ++ self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) ++ self.dropout = nn.Dropout(config.dropout_rate) ++ ++ # Initialize weights and apply final processing ++ self.post_init() ++ # Model parallel ++ self.model_parallel = False ++ self.device_map = None ++ self.gradient_checkpointing = False ++ ++ def get_input_embeddings(self): ++ return self.embed_tokens ++ ++ def set_input_embeddings(self, new_embeddings): ++ self.embed_tokens = new_embeddings ++ ++ ++ ++ def get_extended_attention_mask( ++ self, attention_mask, input_shape, device=None, dtype=None ++ ): ++ extended_attention_mask = attention_mask[:,None,None,:].expand(input_shape[0],1,input_shape[1],input_shape[1]).bool() ++ extended_attention_mask = ~extended_attention_mask ++ return extended_attention_mask ++ ++ def forward( ++ self, ++ input_ids=None, ++ attention_mask=None, ++ head_mask=None, ++ cross_attn_head_mask=None, ++ use_cache=None, ++ output_attentions=None, ++ output_hidden_states=None, ++ return_dict=None, ++ ): ++ # Model parallel ++ use_cache = use_cache if use_cache is not None else self.config.use_cache ++ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions ++ 
output_hidden_states = ( ++ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) ++ return_dict = return_dict if return_dict is not None else self.config.use_return_dict ++ ++ input_shape = input_ids.size() ++ input_ids = input_ids.view(-1, input_shape[-1]) ++ ++ inputs_embeds = self.embed_tokens(input_ids) ++ ++ batch_size, seq_length = input_shape ++ # required mask seq length can be calculated via length of past ++ mask_seq_length = seq_length ++ ++ # initialize past_key_values with `None` if past does not exist ++ past_keys = [None] * len(self.block) ++ past_values = [None] * len(self.block) ++ past_cross_keys = [None] * len(self.block) ++ past_cross_values = [None] * len(self.block) ++ # print("attention_mask=",attention_mask) ++ if attention_mask is None: ++ attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) ++ encoder_extended_attention_mask = None ++ # Prepare head mask if needed ++ head_mask = self.get_head_mask(head_mask, self.config.num_layers) ++ cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) ++ present_key_states = () if use_cache else None ++ present_value_states = () if use_cache else None ++ all_hidden_states = () if output_hidden_states else None ++ all_attentions = () if output_attentions else None ++ all_cross_attentions = () if (output_attentions and self.is_decoder) else None ++ position_bias = None ++ encoder_decoder_position_bias = None ++ ++ hidden_states = self.dropout(inputs_embeds) ++ for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): ++ layer_head_mask = head_mask[i] ++ cross_attn_layer_head_mask = cross_attn_head_mask[i] ++ if output_hidden_states: ++ all_hidden_states = all_hidden_states + (hidden_states,) ++ layer_outputs = layer_module( ++ hidden_states, ++ attention_mask=attention_mask, ++ position_bias=position_bias, ++ encoder_hidden_states=None, ++ encoder_attention_mask=encoder_extended_attention_mask, ++ encoder_decoder_position_bias=encoder_decoder_position_bias, ++ layer_head_mask=layer_head_mask, ++ cross_attn_layer_head_mask=cross_attn_layer_head_mask, ++ past_key=past_key, ++ past_value=past_value, ++ past_cross_key=past_cross_key, ++ past_cross_value=past_cross_value, ++ use_cache=use_cache, ++ output_attentions=output_attentions, ++ ) ++ ++ # layer_outputs is a tuple with: ++ # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) ++ if use_cache is False: ++ layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] ++ ++ hidden_states, present_key_state, present_value_state = layer_outputs[:3] ++ ++ # We share the position biases between the layers - the first layer store them ++ # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), ++ # (cross-attention position bias), (cross-attention weights) ++ position_bias = layer_outputs[3] ++ # append next layer key value states ++ if use_cache: ++ present_key_states = present_key_states + present_key_state ++ present_value_states = present_value_states + present_value_state ++ ++ if output_attentions: ++ all_attentions = all_attentions + (layer_outputs[3],) ++ if self.is_decoder: ++ all_cross_attentions = all_cross_attentions + (layer_outputs[5],) ++ ++ hidden_states = self.final_layer_norm(hidden_states) 
++ hidden_states = self.dropout(hidden_states).half() ++ ++ # Add last layer ++ if output_hidden_states: ++ all_hidden_states = all_hidden_states + (hidden_states,) ++ ++ if self.encodecrosskey: ++ cross_keys = self.encodecrosskey(hidden_states) ++ if self.encodecrossvalue: ++ cross_values = self.encodecrossvalue(hidden_states) ++ return tuple((hidden_states, cross_keys, cross_values)) + + + T5_START_DOCSTRING = r""" +@@ -1541,6 +1754,41 @@ class T5Model(T5PreTrainedModel): + ) + + ++class EncoderToCrossKey(nn.Module): ++ def __init__(self, cross_key, num_heads, d_kv): ++ super().__init__() ++ self.cross_key = cross_key ++ self.num_heads = num_heads ++ self.d_kv = d_kv ++ ++ ++ def forward(self, hidden_states): ++ batch_size = hidden_states.shape[0] ++ past_cross_keys = () ++ for i in range(len(self.cross_key)): ++ # past_cross_keys += (self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1,2),) ++ past_cross_keys += (self.cross_key[i](hidden_states),) ++ return past_cross_keys ++ ++ ++class EncoderToCrossValue(nn.Module): ++ def __init__(self, cross_value, num_heads, d_kv): ++ super().__init__() ++ self.cross_value = cross_value ++ self.num_heads = num_heads ++ self.d_kv = d_kv ++ ++ ++ def forward(self, hidden_states): ++ batch_size = hidden_states.shape[0] ++ past_cross_values = () ++ for i in range(len(self.cross_value)): ++ # past_cross_values += (self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1,2),) ++ past_cross_values += (self.cross_value[i](hidden_states),) ++ # print("aaa",past_cross_values[0].shape) ++ return past_cross_values ++ ++ + @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) + class T5ForConditionalGeneration(T5PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [ +@@ -1548,28 +1796,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + ] + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + +- def __init__(self, config: T5Config): ++ def __init__(self, config: T5Config, encoder_path=None, decoder_path=None, device_id=0): + super().__init__(config) +- self.model_dim = config.d_model +- +- self.shared = nn.Embedding(config.vocab_size, config.d_model) +- +- encoder_config = copy.deepcopy(config) +- encoder_config.is_decoder = False +- encoder_config.use_cache = False +- encoder_config.is_encoder_decoder = False +- self.encoder = T5Stack(encoder_config, self.shared) +- +- decoder_config = copy.deepcopy(config) +- decoder_config.is_decoder = True +- decoder_config.is_encoder_decoder = False +- decoder_config.num_layers = config.num_decoder_layers +- self.decoder = T5Stack(decoder_config, self.shared) +- +- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) ++ self.encoder_path = encoder_path ++ self.decoder_path = decoder_path ++ self.is_mindie = False ++ if not self.encoder_path or not self.decoder_path: ++ self.model_dim = config.d_model ++ ++ self.shared = nn.Embedding(config.vocab_size, config.d_model) ++ ++ decoder_config = copy.deepcopy(config) ++ decoder_config.is_decoder = True ++ decoder_config.is_encoder_decoder = False ++ decoder_config.num_layers = config.num_decoder_layers ++ ++ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) ++ self.decoder = T5Stack(decoder_config, self.shared, self.lm_head) ++ ++ cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) ++ 
cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) ++ encodecrosskey = EncoderToCrossKey(cross_key, config.num_heads, config.d_kv) ++ encodecrossvalue = EncoderToCrossValue(cross_value, config.num_heads, config.d_kv) ++ ++ encoder_config = copy.deepcopy(config) ++ encoder_config.is_decoder = False ++ encoder_config.use_cache = False ++ encoder_config.is_encoder_decoder = False ++ self.encoder = T5Stack_Encoder(encoder_config, self.shared, encodecrosskey=encodecrosskey, encodecrossvalue=encodecrossvalue) ++ self.encoder_mindie = None ++ self.decoder_mindie = None ++ if self.encoder_path: ++ self.encoder_mindie = torch.jit.load(self.encoder_path) ++ self.is_mindie = True ++ if self.decoder_path: ++ self.decoder_mindie = torch.jit.load(self.decoder_path) ++ ++ self.stream = torch.npu.Stream(f"npu:{device_id}") ++ self.device_id = device_id ++ ++ ++ def get_device(self): ++ return f"npu:{self.device_id}" + + # Initialize weights and apply final processing +- self.post_init() ++ # self.post_init() + + # Model parallel + self.model_parallel = False +@@ -1637,25 +1908,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + + @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) +- def forward( +- self, +- input_ids: Optional[torch.LongTensor] = None, +- attention_mask: Optional[torch.FloatTensor] = None, +- decoder_input_ids: Optional[torch.LongTensor] = None, +- decoder_attention_mask: Optional[torch.BoolTensor] = None, +- head_mask: Optional[torch.FloatTensor] = None, +- decoder_head_mask: Optional[torch.FloatTensor] = None, +- cross_attn_head_mask: Optional[torch.Tensor] = None, +- encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- inputs_embeds: Optional[torch.FloatTensor] = None, +- decoder_inputs_embeds: Optional[torch.FloatTensor] = None, +- labels: Optional[torch.LongTensor] = None, +- use_cache: Optional[bool] = None, +- output_attentions: Optional[bool] = None, +- output_hidden_states: Optional[bool] = None, +- return_dict: Optional[bool] = None, +- ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: ++ def forward(self,*args) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., +@@ -1687,113 +1940,36 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) + >>> # studies have shown that owning a dog is good for you. 
+ ```""" +- use_cache = use_cache if use_cache is not None else self.config.use_cache +- return_dict = return_dict if return_dict is not None else self.config.use_return_dict +- +- # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +- if head_mask is not None and decoder_head_mask is None: +- if self.config.num_layers == self.config.num_decoder_layers: +- warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) +- decoder_head_mask = head_mask +- +- # Encode if needed (training, first prediction pass) +- if encoder_outputs is None: +- # Convert encoder inputs in embeddings if needed +- encoder_outputs = self.encoder( +- input_ids=input_ids, +- attention_mask=attention_mask, +- inputs_embeds=inputs_embeds, +- head_mask=head_mask, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): +- encoder_outputs = BaseModelOutput( +- last_hidden_state=encoder_outputs[0], +- hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, +- attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, +- ) +- +- hidden_states = encoder_outputs[0] +- +- if self.model_parallel: +- torch.cuda.set_device(self.decoder.first_device) +- +- if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: +- # get decoder inputs from shifting lm labels to the right +- decoder_input_ids = self._shift_right(labels) +- +- # Set device for model parallelism +- if self.model_parallel: +- torch.cuda.set_device(self.decoder.first_device) +- hidden_states = hidden_states.to(self.decoder.first_device) +- if decoder_input_ids is not None: +- decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) +- if attention_mask is not None: +- attention_mask = attention_mask.to(self.decoder.first_device) +- if decoder_attention_mask is not None: +- decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) +- +- # Decode +- decoder_outputs = self.decoder( +- input_ids=decoder_input_ids, +- attention_mask=decoder_attention_mask, +- inputs_embeds=decoder_inputs_embeds, +- past_key_values=past_key_values, +- encoder_hidden_states=hidden_states, +- encoder_attention_mask=attention_mask, +- head_mask=decoder_head_mask, +- cross_attn_head_mask=cross_attn_head_mask, +- use_cache=use_cache, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- +- sequence_output = decoder_outputs[0] +- +- # Set device for model parallelism +- if self.model_parallel: +- torch.cuda.set_device(self.encoder.first_device) +- self.lm_head = self.lm_head.to(self.encoder.first_device) +- sequence_output = sequence_output.to(self.lm_head.weight.device) +- +- if self.config.tie_word_embeddings: +- # Rescale output before projecting on vocab +- # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 +- sequence_output = sequence_output * (self.model_dim**-0.5) +- +- lm_logits = self.lm_head(sequence_output) +- +- loss = None +- if labels is not None: +- loss_fct = CrossEntropyLoss(ignore_index=-100) +- # move labels to correct device to enable PP +- labels = labels.to(lm_logits.device) +- loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) +- # TODO(thom): Add z_loss 
https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 +- +- if not return_dict: +- output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs +- return ((loss,) + output) if loss is not None else output +- +- return Seq2SeqLMOutput( +- loss=loss, +- logits=lm_logits, +- past_key_values=decoder_outputs.past_key_values, +- decoder_hidden_states=decoder_outputs.hidden_states, +- decoder_attentions=decoder_outputs.attentions, +- cross_attentions=decoder_outputs.cross_attentions, +- encoder_last_hidden_state=encoder_outputs.last_hidden_state, +- encoder_hidden_states=encoder_outputs.hidden_states, +- encoder_attentions=encoder_outputs.attentions, +- ) ++ if self.is_mindie: ++ with torch.npu.stream(self.stream): # set stream ++ decoder_outputs = self.decoder_mindie.forward(*args) ++ self.stream.synchronize() # synchronize ++ else: ++ hidden_states = args[0] ++ past_cross_keys = args[1:self.config.num_decoder_layers+1] ++ past_cross_values = args[self.config.num_decoder_layers+1:2*self.config.num_decoder_layers+1] ++ past_keys= args[2*self.config.num_decoder_layers+1:3*self.config.num_decoder_layers+1] ++ past_values= args[3*self.config.num_decoder_layers+1:4*self.config.num_decoder_layers+1] ++ encoder_attention_mask = args[-3] ++ decoder_input_ids = args[-2] ++ decoder_attention_mask = args[-1] ++ decoder_outputs = self.decoder(input_ids=decoder_input_ids, ++ encoder_hidden_states=hidden_states, ++ past_keys=past_keys, ++ past_values=past_values, ++ past_cross_keys=past_cross_keys, ++ past_cross_values=past_cross_values, ++ encoder_attention_mask=encoder_attention_mask, ++ attention_mask=decoder_attention_mask) ++ return (decoder_outputs[0],decoder_outputs[1],decoder_outputs[2]) + + def prepare_inputs_for_generation( + self, + input_ids, +- past_key_values=None, ++ past_cross_keys=None, ++ past_cross_values=None, ++ past_keys=None, ++ past_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, +@@ -1804,8 +1980,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + **kwargs, + ): + # cut decoder_input_ids if past_key_values is used +- if past_key_values is not None: +- past_length = past_key_values[0][0].shape[2] ++ if past_keys is not None: ++ past_length = past_keys[0].shape[1] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: +@@ -1813,12 +1989,19 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 +- + input_ids = input_ids[:, remove_prefix_length:] + ++ batch_size, seq_length = input_ids.shape ++ # required mask seq length can be calculated via length of past ++ mask_seq_length = past_keys[0].shape[1] + seq_length if past_keys is not None else seq_length ++ decoder_attention_mask = torch.zeros(batch_size, mask_seq_length, device=input_ids.device) ++ decoder_attention_mask = decoder_attention_mask[:,None,None,:].expand(batch_size,1,mask_seq_length,mask_seq_length).bool() + return { + "decoder_input_ids": input_ids, +- "past_key_values": past_key_values, ++ "past_cross_keys":past_cross_keys, ++ "past_cross_values":past_cross_values, ++ "past_keys":past_keys, ++ "past_values":past_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, +@@ -1826,6 +2009,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + "decoder_attention_mask": decoder_attention_mask, + "cross_attn_head_mask": 
cross_attn_head_mask, + "use_cache": use_cache, ++ + } + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): +@@ -1861,6 +2045,440 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) + return reordered_decoder_past + ++ def _prepare_encoder_decoder_kwargs_for_generation( ++ self, ++ inputs_tensor: torch.Tensor, ++ model_kwargs, ++ model_input_name, ++ generation_config, ++ ): ++ irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"] ++ encoder_kwargs = { ++ argument: value ++ for argument, value in model_kwargs.items() ++ if not any(argument.startswith(p) for p in irrelevant_prefix) ++ } ++ encoder_kwargs["output_attentions"] = generation_config.output_attentions ++ encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states ++ model_input_name = model_input_name if model_input_name is not None else self.main_input_name ++ encoder_kwargs["return_dict"] = True ++ encoder_kwargs[model_input_name] = inputs_tensor ++ encoder_outputs = None ++ if self.is_mindie: ++ with torch.npu.stream(self.stream): # set stream ++ encoder_outputs=self.encoder_mindie.forward(encoder_kwargs["input_ids"],encoder_kwargs["attention_mask"]) ++ self.stream.synchronize() # synchronize ++ else: ++ encoder_outputs=self.encoder.forward(**encoder_kwargs) ++ model_kwargs["encoder_outputs"]={"last_hidden_state":encoder_outputs[0]} ++ model_kwargs["past_cross_keys"] = encoder_outputs[1] ++ model_kwargs["past_cross_values"] =encoder_outputs[2] ++ # print("model_kwargs=",model_kwargs) ++ return model_kwargs ++ ++ def _update_model_kwargs_for_generation( ++ self, ++ outputs, ++ model_kwargs, ++ is_encoder_decoder = False, ++ standardize_cache_format = False, ++ num_new_tokens = 1, ++ ): ++ # update past_key_values keeping its naming used in model code ++ cache_name, cache = self._extract_past_from_model_output( ++ outputs, standardize_cache_format=standardize_cache_format ++ ) ++ model_kwargs[cache_name] = cache ++ if "past_keys" in outputs: ++ past_keys = outputs.past_keys ++ model_kwargs["past_keys"] = past_keys ++ if "past_values" in outputs: ++ past_values = outputs.past_values ++ model_kwargs["past_values"] = past_values ++ # update decoder attention mask ++ if "decoder_attention_mask" in model_kwargs: ++ decoder_attention_mask = model_kwargs["decoder_attention_mask"] ++ model_kwargs["decoder_attention_mask"] = torch.cat( ++ [decoder_attention_mask, decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1))], ++ dim=-1, ++ ) ++ return model_kwargs ++ ++ @torch.no_grad() ++ def generate( ++ self, ++ inputs = None, ++ generation_config = None, ++ logits_processor = None, ++ stopping_criteria = None, ++ prefix_allowed_tokens_fn = None, ++ assistant_model = None, ++ negative_prompt_ids = None, ++ negative_prompt_attention_mask = None, ++ **kwargs, ++ ): ++ # 1. 
Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call ++ self._validate_model_class() ++ tokenizer = kwargs.pop("tokenizer", None) # Pull this out first, we only use it for stopping criteria ++ generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs) ++ self._validate_model_kwargs(model_kwargs.copy()) ++ ++ ++ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() ++ stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() ++ ++ accepts_attention_mask = True ++ requires_attention_mask = "encoder_outputs" not in model_kwargs ++ kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None ++ ++ # 3. Define model inputs ++ inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs( ++ inputs, generation_config.bos_token_id, model_kwargs ++ ) ++ batch_size = inputs_tensor.shape[0] ++ seq_len = inputs_tensor.shape[1] ++ device = inputs_tensor.device ++ self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=device) ++ ++ # 4. Define other model kwargs ++ # decoder-only models with inputs_embeds forwarding must use caching (otherwise we can't detect whether we are ++ # generating the first new token or not, and we only want to use the embeddings for the first new token) ++ if not self.config.is_encoder_decoder and model_input_name == "inputs_embeds": ++ model_kwargs["use_cache"] = True ++ else: ++ model_kwargs["use_cache"] = generation_config.use_cache ++ if not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask: ++ model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( ++ inputs_tensor, generation_config.pad_token_id, generation_config.eos_token_id ++ ) ++ attention_mask = model_kwargs["attention_mask"] ++ attention_mask = attention_mask[:,None,None,:].expand(batch_size,1,seq_len,seq_len).bool() ++ model_kwargs["attention_mask"] = ~attention_mask ++ if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs: ++ # if model is encoder decoder encoder_outputs are created and added to `model_kwargs` ++ model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation( ++ inputs_tensor, model_kwargs, model_input_name, generation_config ++ ) ++ ++ # 5. Prepare `input_ids` which will be used for auto-regressive generation ++ if self.config.is_encoder_decoder: ++ input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation( ++ batch_size=batch_size, ++ model_input_name=model_input_name, ++ model_kwargs=model_kwargs, ++ decoder_start_token_id=generation_config.decoder_start_token_id, ++ device=inputs_tensor.device, ++ ) ++ else: ++ input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids") ++ ++ if generation_config.token_healing: ++ input_ids = self.heal_tokens(input_ids, tokenizer) ++ ++ # 6. Prepare `max_length` depending on other stopping criteria. 
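The patched `generate()` above replaces the usual additive mask with an inverted 4-D boolean mask before handing the inputs to the MindIE-compiled encoder. A minimal sketch of that transformation, assuming plain PyTorch tensors on CPU with illustrative shapes and values:

```python
import torch

# 2-D padding mask as produced by the tokenizer: 1 = real token, 0 = padding
pad_mask = torch.tensor([[1, 1, 1, 0, 0],
                         [1, 1, 1, 1, 1]])
batch_size, seq_len = pad_mask.shape

# Broadcast to [batch, 1, seq_len, seq_len] and invert: True marks positions to ignore
mask_4d = pad_mask[:, None, None, :].expand(batch_size, 1, seq_len, seq_len).bool()
inverted = ~mask_4d
print(inverted.shape)   # torch.Size([2, 1, 5, 5])
print(inverted[0, 0])   # last two columns are True (masked) for the padded sequence
```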
++ input_ids_length = input_ids.shape[-1] ++ has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None ++ has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None ++ generation_config = self._prepare_generated_length( ++ generation_config=generation_config, ++ has_default_max_length=has_default_max_length, ++ has_default_min_length=has_default_min_length, ++ model_input_name=model_input_name, ++ inputs_tensor=inputs_tensor, ++ input_ids_length=input_ids_length, ++ ) ++ ++ use_dynamic_cache_by_default = False ++ if generation_config.cache_implementation is not None and model_kwargs.get("past_key_values") is not None: ++ raise ValueError( ++ "Passing both `cache_implementation` (used to initialize certain caches) and `past_key_values` (a " ++ "Cache object) is unsupported. Please use only one of the two." ++ ) ++ elif generation_config.cache_implementation is not None: ++ if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING: ++ if generation_config.cache_implementation == "static" and not self._supports_static_cache: ++ raise ValueError( ++ "This model does not support `cache_implementation='static'`. Please check the following " ++ "issue: https://github.com/huggingface/transformers/issues/28981" ++ ) ++ model_kwargs["past_key_values"] = self._get_cache( ++ generation_config.cache_implementation, ++ getattr(generation_config, "num_beams", 1) * batch_size, ++ generation_config.max_length, ++ ) ++ elif generation_config.cache_implementation == "quantized": ++ if not self._supports_quantized_cache: ++ raise ValueError( ++ "This model does not support the quantized cache. If you want your model to support quantized " ++ "cache, please open an issue." ++ ) ++ ++ cache_config = ( ++ generation_config.cache_config ++ if generation_config.cache_config is not None ++ else QuantizedCacheConfig() ++ ) ++ cache_class = QUANT_BACKEND_CLASSES_MAPPING[cache_config.backend] ++ ++ if cache_config.backend == "quanto" and not is_quanto_available(): ++ raise ImportError( ++ "You need to install `quanto` in order to use KV cache quantization with quanto backend. " ++ "Please install it via with `pip install quanto`" ++ ) ++ elif cache_config.backend == "HQQ" and not is_hqq_available(): ++ raise ImportError( ++ "You need to install `HQQ` in order to use KV cache quantization with HQQ backend. " ++ "Please install it via with `pip install hqq`" ++ ) ++ ++ model_kwargs["past_key_values"] = cache_class(cache_config) ++ # Use DynamicCache() instance by default. This will avoid back and forth from legacy format that ++ # keeps copying the cache thus using much more memory ++ elif generation_config.cache_implementation is None and self._supports_default_dynamic_cache(): ++ past = model_kwargs.get("past_key_values", None) ++ if past is None: ++ model_kwargs["past_key_values"] = DynamicCache() ++ use_dynamic_cache_by_default = True ++ elif isinstance(past, tuple): ++ model_kwargs["past_key_values"] = DynamicCache.from_legacy_cache(past) ++ use_dynamic_cache_by_default = True ++ ++ self._validate_generated_length(generation_config, input_ids_length, has_default_max_length) ++ ++ # 7. determine generation mode ++ generation_mode = generation_config.get_generation_mode(assistant_model) ++ # 8. 
prepare distribution pre_processing samplers ++ prepared_logits_processor = self._get_logits_processor( ++ generation_config=generation_config, ++ input_ids_seq_length=input_ids_length, ++ encoder_input_ids=inputs_tensor, ++ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, ++ logits_processor=logits_processor, ++ device=inputs_tensor.device, ++ model_kwargs=model_kwargs, ++ negative_prompt_ids=negative_prompt_ids, ++ negative_prompt_attention_mask=negative_prompt_attention_mask, ++ ) ++ ++ # 9. prepare stopping criteria ++ prepared_stopping_criteria = self._get_stopping_criteria( ++ generation_config=generation_config, stopping_criteria=stopping_criteria, tokenizer=tokenizer, **kwargs ++ ) ++ ++ if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): ++ # 11. prepare logits warper ++ prepared_logits_warper = ( ++ self._get_logits_warper(generation_config, device=input_ids.device) ++ if generation_config.do_sample ++ else None ++ ) ++ ++ # 12. expand input_ids with `num_return_sequences` additional sequences per batch ++ input_ids, model_kwargs = self._expand_inputs_for_generation( ++ input_ids=input_ids, ++ expand_size=generation_config.num_return_sequences, ++ is_encoder_decoder=self.config.is_encoder_decoder, ++ **model_kwargs, ++ ) ++ # 13. run sample (it degenerates to greedy search when `generation_config.do_sample=False`) ++ result = self._sample( ++ input_ids, ++ logits_processor=prepared_logits_processor, ++ logits_warper=prepared_logits_warper, ++ stopping_criteria=prepared_stopping_criteria, ++ generation_config=generation_config, ++ **model_kwargs, ++ ) ++ return result ++ ++ def _sample( ++ self, ++ input_ids, ++ logits_processor, ++ stopping_criteria, ++ generation_config, ++ logits_warper = None, ++ **model_kwargs, ++ ): ++ # init values ++ pad_token_id = generation_config.pad_token_id ++ output_attentions = generation_config.output_attentions ++ output_hidden_states = generation_config.output_hidden_states ++ output_scores = generation_config.output_scores ++ output_logits = generation_config.output_logits ++ return_dict_in_generate = generation_config.return_dict_in_generate ++ has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria) ++ do_sample = generation_config.do_sample ++ if do_sample is True and not isinstance(logits_warper, LogitsProcessorList): ++ raise ValueError( ++ "`do_sample` is set to `True`, `logits_warper` must be a `LogitsProcessorList` instance (it is " ++ f"{logits_warper})." 
++ ) ++ ++ # init attention / hidden states / scores tuples ++ scores = () if (return_dict_in_generate and output_scores) else None ++ raw_logits = () if (return_dict_in_generate and output_logits) else None ++ decoder_attentions = () if (return_dict_in_generate and output_attentions) else None ++ cross_attentions = () if (return_dict_in_generate and output_attentions) else None ++ decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None ++ ++ # if model is an encoder-decoder, retrieve encoder attention weights and hidden states ++ if return_dict_in_generate and self.config.is_encoder_decoder: ++ encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None ++ encoder_hidden_states = ( ++ model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None ++ ) ++ ++ this_peer_finished = False ++ batch_size = input_ids.shape[0] ++ unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) ++ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) ++ ++ # keep track of which sequences are already finished ++ if self.is_mindie or self.config.architectures[0]=="T5ForConditionalGeneration": ++ num_layers = self.config.num_layers ++ num_heads = self.config.num_heads ++ d_kv = self.config.d_kv ++ model_kwargs["past_keys"] = [torch.randn(batch_size, 0, num_heads*d_kv).half().npu() for _ in range(num_layers)] ++ model_kwargs["past_values"] = [torch.randn(batch_size, 0, num_heads*d_kv).half().npu() for _ in range(num_layers)] ++ ++ ++ while self._has_unfinished_sequences(this_peer_finished, False, device=input_ids.device): ++ # prepare model inputs ++ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) ++ model_args = [model_kwargs["encoder_outputs"]["last_hidden_state"]] ++ model_args.extend(model_kwargs["past_cross_keys"]) ++ model_args.extend(model_kwargs["past_cross_values"]) ++ model_args.extend(model_inputs["past_keys"]) ++ model_args.extend(model_inputs["past_values"]) ++ model_args.append(model_inputs["attention_mask"]) ++ model_args.append(model_inputs["decoder_input_ids"]) ++ model_args.append(model_inputs["decoder_attention_mask"]) ++ ++ # forward pass to get next token ++ outputs = self(*model_args) ++ outputs = Seq2SeqLMOutput(logits=outputs[0], ++ past_keys=outputs[1], ++ past_values=outputs[2]) ++ ++ # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration ++ # (the clone itself is always small) ++ next_token_logits = outputs.logits[:, -1, :].clone() ++ ++ # pre-process distribution ++ next_token_scores = logits_processor(input_ids, next_token_logits) ++ if do_sample: ++ next_token_scores = logits_warper(input_ids, next_token_scores) ++ ++ # Store scores, attentions and hidden_states when required ++ if return_dict_in_generate: ++ if output_scores: ++ scores += (next_token_scores,) ++ if output_logits: ++ raw_logits += (next_token_logits,) ++ if output_attentions: ++ decoder_attentions += ( ++ (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) ++ ) ++ if self.config.is_encoder_decoder: ++ cross_attentions += (outputs.cross_attentions,) ++ ++ if output_hidden_states: ++ decoder_hidden_states += ( ++ (outputs.decoder_hidden_states,) ++ if self.config.is_encoder_decoder ++ else (outputs.hidden_states,) ++ ) ++ ++ # token selection ++ if do_sample: ++ probs = nn.functional.softmax(next_token_scores, dim=-1) ++ next_tokens = 
torch.multinomial(probs, num_samples=1).squeeze(1) ++ else: ++ next_tokens = torch.argmax(next_token_scores, dim=-1) ++ ++ # finished sentences should have their next token be a padding token ++ if has_eos_stopping_criteria: ++ next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) ++ ++ # update generated ids, model inputs, and length for next step ++ input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) ++ model_kwargs = self._update_model_kwargs_for_generation( ++ outputs, ++ model_kwargs, ++ is_encoder_decoder=self.config.is_encoder_decoder, ++ ) ++ unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) ++ this_peer_finished = unfinished_sequences.max() == 0 ++ # This is needed to properly delete outputs.logits which may be very large for first iteration ++ # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration ++ del outputs ++ return input_ids ++ ++ ++ @property ++ def device(self) -> torch.device: ++ """ ++ `torch.device`: The device on which the module is (assuming that all the module parameters are on the same ++ device). ++ """ ++ return self.get_device() ++ ++ def get_extended_attention_mask( ++ self, attention_mask, input_shape, devic=None, dtype=None ++ ): ++ """ ++ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. ++ ++ Arguments: ++ attention_mask (`torch.Tensor`): ++ Mask with ones indicating tokens to attend to, zeros for tokens to ignore. ++ input_shape (`Tuple[int]`): ++ The shape of the input to the model. ++ ++ Returns: ++ `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. ++ """ ++ if dtype is None: ++ dtype = self.dtype ++ ++ if not (attention_mask.dim() == 2 and self.config.is_decoder): ++ # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder` ++ if device is not None: ++ warnings.warn( ++ "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning ++ ) ++ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] ++ # ourselves in which case we just need to make it broadcastable to all heads. ++ if attention_mask.dim() == 3: ++ extended_attention_mask = attention_mask[:, None, :, :] ++ elif attention_mask.dim() == 2: ++ # Provided a padding mask of dimensions [batch_size, seq_length] ++ # - if the model is a decoder, apply a causal mask in addition to the padding mask ++ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] ++ if self.config.is_decoder: ++ extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( ++ input_shape, attention_mask, device ++ ) ++ else: ++ extended_attention_mask = attention_mask[:, None, None, :] ++ else: ++ raise ValueError( ++ f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ++ ) ++ ++ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for ++ # masked positions, this operation will create a tensor which is 0.0 for ++ # positions we want to attend and the dtype's smallest value for masked positions. ++ # Since we are adding it to the raw scores before the softmax, this is ++ # effectively the same as removing these entirely. 
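The additive-mask trick described in this comment can be seen in isolation below. Note that the patch uses a fixed -1000 instead of `torch.finfo(dtype).min`, presumably to keep fp16 scores well away from overflow; the snippet is a standalone sketch, not part of the patch:

```python
import torch

mask = torch.tensor([[1., 1., 0.]])       # 1 = attend, 0 = masked padding
additive = (1.0 - mask) * -1000           # 0 where we attend, -1000 where we don't
scores = torch.tensor([[0.3, 0.1, 0.9]])  # raw attention scores
weights = torch.softmax(scores + additive, dim=-1)
print(weights)                            # the masked position receives ~0 probability
```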
++ extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility ++ #extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min ++ extended_attention_mask = (1.0 - extended_attention_mask) * -1000 ++ return extended_attention_mask ++ ++ ++ + + @add_start_docstrings( + "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", +@@ -1878,6 +2496,9 @@ class T5EncoderModel(T5PreTrainedModel): + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) ++ self.decoder_mindie = torch.jit.load("encoder_model_path") ++ ++ self.stream = torch.npu.Stream(f"npu:{2}") + + # Initialize weights and apply final processing + self.post_init() +@@ -1966,17 +2587,21 @@ class T5EncoderModel(T5PreTrainedModel): + >>> outputs = model(input_ids=input_ids) + >>> last_hidden_states = outputs.last_hidden_state + ```""" +- return_dict = return_dict if return_dict is not None else self.config.use_return_dict +- +- encoder_outputs = self.encoder( +- input_ids=input_ids, +- attention_mask=attention_mask, +- inputs_embeds=inputs_embeds, +- head_mask=head_mask, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) ++ # return_dict = return_dict if return_dict is not None else self.config.use_return_dict ++ # encoder_outputs = self.encoder( ++ # input_ids=input_ids, ++ # attention_mask=attention_mask, ++ # inputs_embeds=inputs_embeds, ++ # head_mask=head_mask, ++ # output_attentions=output_attentions, ++ # output_hidden_states=output_hidden_states, ++ # return_dict=return_dict, ++ # ) ++ attention_mask = attention_mask[:,None,None,:].expand(attention_mask.shape[0],1,attention_mask.shape[1],attention_mask.shape[1]).bool() ++ attention_mask = ~attention_mask ++ with torch.npu.stream(self.stream): # set stream ++ encoder_outputs = self.decoder_mindie.forward(input_ids,attention_mask) ++ self.stream.synchronize() # synchronize + + return encoder_outputs + -- Gitee From 1e09a5b7aa8d4b41ad09bd0e613439006ef0b10d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 06:07:53 +0000 Subject: [PATCH 086/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. 
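Before the SoC-specific patch selection below, here is a minimal usage sketch of the stream pattern the patch relies on for the compiled encoder: the module is loaded with `torch.jit.load`, executed on a dedicated NPU stream, and synchronized before its outputs are read. The file path, shapes, and device id are placeholders, and `import mindietorch` is assumed to be required so the compiled ops are registered at load time:

```python
import torch
import torch_npu
import mindietorch  # assumption: needed to register MindIE-Torch ops before torch.jit.load

device_id = 0
torch.npu.set_device(device_id)
stream = torch.npu.Stream(f"npu:{device_id}")

compiled_encoder = torch.jit.load("./models/encoder/encoder_compiled.pt")  # placeholder path

input_ids = torch.ones(1, 16, dtype=torch.int64).npu()
# inverted 4-D boolean mask, as built in generate(): True = masked position
attention_mask = torch.zeros(1, 1, 16, 16, dtype=torch.bool).npu()

with torch.npu.stream(stream):        # enqueue the compiled forward on the NPU stream
    encoder_outputs = compiled_encoder.forward(input_ids, attention_mask)
stream.synchronize()                  # block until the NPU has produced the outputs

# per the patched T5Stack_Encoder, outputs are (hidden_states, cross_keys, cross_values)
hidden_states = encoder_outputs[0]
```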
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/T5_modeling_t5_patch.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py index e304f4f9f2..21678e06d2 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -16,13 +16,23 @@ import os import transformers -def main(): +def main(args): transformers_path = transformers.__path__ transformers_version = transformers.__version__ assert transformers_version =='4.42.0', "expectation transformers==4.42.0" - os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') + if args.ascend_soc == "Ascend910B4": + os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5_800IA2.py modeling_t5.patch') + elif args.ascend_soc == "Ascend310P3": + os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--ascend_soc", type=str, default="Ascend910B4",required=True) + return args if __name__ == '__main__': - main() + args = parse_args() + main(args) -- Gitee From 0e8fe3e1672c54480b1715ab812f29b2b1ab13f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 06:15:59 +0000 Subject: [PATCH 087/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py index 21678e06d2..4753b3ee0c 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -14,6 +14,7 @@ import os import transformers +import argparse def main(args): -- Gitee From fdc9df000188876398c910aa025f125330b7bfcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 06:18:06 +0000 Subject: [PATCH 088/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py index 4753b3ee0c..43d0caf25e 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -31,6 +31,7 @@ def main(args): def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--ascend_soc", type=str, default="Ascend910B4",required=True) + args = parser.parse_args() return args -- Gitee From 69b484910a92f239c40de01a7ed2494517e95f72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 06:19:49 +0000 Subject: [PATCH 089/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py index 43d0caf25e..0e3c076ca6 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -23,7 +23,7 @@ def main(args): assert transformers_version =='4.42.0', "expectation transformers==4.42.0" if args.ascend_soc == "Ascend910B4": - os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5_800IA2.py modeling_t5.patch') + os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5_800IA2.patch modeling_t5.patch') elif args.ascend_soc == "Ascend310P3": os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') -- Gitee From 79ac0bf58bc66d01c0efd7e8800d74af540f34ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 06:23:12 +0000 Subject: [PATCH 090/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py index 0e3c076ca6..c6733e6904 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -23,7 +23,7 @@ def main(args): assert transformers_version =='4.42.0', "expectation transformers==4.42.0" if args.ascend_soc == "Ascend910B4": - os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5_800IA2.patch modeling_t5.patch') + os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5_800IA2.patch') elif args.ascend_soc == "Ascend310P3": os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') -- Gitee From 457414d33d921b087149ef2005ebe39c320cf7e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 06:29:37 +0000 Subject: [PATCH 091/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. 
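Taken together, the fixes above (the `argparse` import, the missing `parser.parse_args()` call, and the corrected patch targets in the two follow-up commits) leave `T5_modeling_t5_patch.py` looking roughly like the sketch below, reconstructed from the diffs; whitespace may differ slightly from the actual file:

```python
import os
import transformers
import argparse


def main(args):
    transformers_path = transformers.__path__
    transformers_version = transformers.__version__
    assert transformers_version == '4.42.0', "expectation transformers==4.42.0"

    if args.ascend_soc == "Ascend910B4":
        os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5_800IA2.patch')
    elif args.ascend_soc == "Ascend310P3":
        os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch')


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--ascend_soc", type=str, default="Ascend910B4", required=True)
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()
    main(args)
```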
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index b677c10796..45792728bd 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -68,11 +68,15 @@ 执行命令: ```bash - python T5_modeling_t5_patch.py + python T5_modeling_t5_patch.py --ascend_soc {Ascend910B4 or Ascend310P3} ``` 4.导出mindietorch模型 +300IDUO卡环境下: ```bash python export_t5.py --output_dir {output_path} --model_path {model_path} --max_batchsize {max_batchsize} --max_input_seq_len {max_input_seq_len} --device_id {device_id} +800IA2卡环境下: + ```bash + python export_t5_800IA2.py --output_dir {output_path} --model_path {model_path} --max_batchsize {max_batchsize} --max_input_seq_len {max_input_seq_len} --device_id {device_id} ``` 参数说明: {output_path}是输出的目录 -- Gitee From 88d86bfb27fafdd590228a064aa5b9bd1eccb172 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Fri, 27 Sep 2024 11:43:13 +0000 Subject: [PATCH 092/110] update MindIE/MindIE-Torch/built-in/T5/export_t5.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/export_t5.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5.py b/MindIE/MindIE-Torch/built-in/T5/export_t5.py index 995274d0bd..9c67b7c7ef 100644 --- a/MindIE/MindIE-Torch/built-in/T5/export_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/export_t5.py @@ -50,8 +50,8 @@ class TextEncoderExport(torch.nn.Module): super(TextEncoderExport, self).__init__() self.textencoder_model = textencoder_model - def forward(self, input_ids): - return self.textencoder_model(input_ids=input_ids) + def forward(self, input_ids,attention_mask): + return self.textencoder_model(input_ids=input_ids, attention_mask=attention_mask) class TextDecoderExport(torch.nn.Module): def __init__(self, textdecoder_model): @@ -71,6 +71,7 @@ def export_textencoder(args, model, save_dir, batch_size): if not os.path.exists(traced_path): text_encoder = model.encoder dummy_input = ( + torch.ones([1, 128], dtype=torch.int64).npu(), torch.ones([1, 128], dtype=torch.int64).npu() ) encoder = TextEncoderExport(text_encoder) @@ -81,6 +82,7 @@ def export_textencoder(args, model, save_dir, batch_size): inputs0 = [] inputs0.append(mindietorch.Input(min_shape = (1,1), max_shape= (args.max_batchsize, args.max_input_seq_len), dtype=torch.int64)) + inputs0.append(mindietorch.Input(min_shape = (1,1), max_shape= (args.max_batchsize, args.max_input_seq_len), dtype=torch.int64)) print("compiling encoder") compiled_model = mindietorch.compile( traced_model, -- Gitee From 2db6e6fc1e69aeb9b13e634bbb9be436b5aa4878 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Sun, 29 Sep 2024 01:18:49 +0000 Subject: [PATCH 093/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch index 26b0ce5e87..8923d7b3d4 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch @@ -1,5 +1,5 @@ diff --git a/modeling_t5_origin.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py -index 224769fdf..65c058e6e 100644 +index 224769fdf..8a8f9a23a 100644 --- a/modeling_t5_origin.py +++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py @@ -19,7 +19,7 @@ import math @@ -1186,7 +1186,7 @@ index 224769fdf..65c058e6e 100644 + encoder_kwargs[model_input_name] = inputs_tensor + if self.is_mindie: + with torch.npu.stream(self.stream): # set stream -+ encoder_outputs=self.encoder_mindie.forward(encoder_kwargs["input_ids"]) ++ encoder_outputs=self.encoder_mindie.forward(encoder_kwargs["input_ids"],encoder_kwargs["attention_mask"]) + self.stream.synchronize() # synchronize + else: + encoder_outputs=self.encoder.forward(**encoder_kwargs) -- Gitee From 8296f4f76be970e9d521eb87d5d5141ed29e3753 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Sat, 12 Oct 2024 08:05:19 +0000 Subject: [PATCH 094/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 50 ++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index 45792728bd..95a550f302 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -74,6 +74,7 @@ 300IDUO卡环境下: ```bash python export_t5.py --output_dir {output_path} --model_path {model_path} --max_batchsize {max_batchsize} --max_input_seq_len {max_input_seq_len} --device_id {device_id} + ``` 800IA2卡环境下: ```bash python export_t5_800IA2.py --output_dir {output_path} --model_path {model_path} --max_batchsize {max_batchsize} --max_input_seq_len {max_input_seq_len} --device_id {device_id} @@ -96,4 +97,51 @@ python main.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path {model_path}模型所在目录 {encoder_aie_path}优化后的encoder的模型路径,要具体到.pt文件 {decoder_aie_path}优化后的decoder的模型路径,要具体到.pt文件 -{device_id} 用哪个npu device \ No newline at end of file +{device_id} 用哪个npu device + +6.精度测试 + +6.1 精度验收标准 +数据集:https://github.com/embeddings-benchmark/mteb(英文数据集选一种测试),精度和GPU推理结果对比误差小于1% +6.2 精度测试方法 + +6.2.1安装mteb + + ```bash +pip install mteb +``` +6.2.2 下载mteb数据集(如果机器可以连接外部网络可以跳过这步) +下载链接:https://github.com/embeddings-benchmark/mteb + +6.2.3 修改metb的读取数据集的路径地址(如果机器可以连接外部网络可以跳过这步) +例如如果下载的是Banking77Classification数据集,修改mteb python包里的文件路径,例如 +D:\python3.9\Lib\site-packages\mteb\tasks\Classification\eng\Banking77Classification.py文件里的path路径为6.2.2下载的数据集的路径 + +6.2.4 修改代码 +800IA2卡环境下: +修改transfoermers包下modeling_t5.py下的T5EncoderModel类,将self.decoder_mindie加载路径修改为编译好的encoder的路径 + +300IDUO卡环境下: +修改transfoermers包下modeling_t5.py下的T5EncoderModel类,增加一行,self.decoder_mindie = torch.jit.load("encoder_model_path"),其中encoder_model_path为编译好的encoder的路径,再修改forward接口为 +```bash +with torch.npu.stream(self.stream): # set 
stream + encoder_outputs = self.decoder_mindie.forward(input_ids,attention_mask) +self.stream.synchronize() # synchronize +return encoder_outputs +``` +6.2.5测试代码 + +```bash +import torch + +import mteb +from sentence_transformers import SentenceTransformer + +model_name = "D:\downloads\T5-v2" +model = SentenceTransformer(model_name,model_kwargs={"torch_dtype":torch.float16}) +tasks = mteb.get_tasks(tasks=["CLSClusteringP2P"]) +evaluation = mteb.MTEB(tasks=tasks) +results = evaluation.run(model, output_folder=f"./{model_name}") +``` +6.2.6 结果输出 +会在当前目录输出结果文件 -- Gitee From c1773fd718a61c4355b4f1868aee2889b9727427 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Sat, 12 Oct 2024 08:06:19 +0000 Subject: [PATCH 095/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index 95a550f302..8e8dceb2fe 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -118,6 +118,7 @@ pip install mteb D:\python3.9\Lib\site-packages\mteb\tasks\Classification\eng\Banking77Classification.py文件里的path路径为6.2.2下载的数据集的路径 6.2.4 修改代码 + 800IA2卡环境下: 修改transfoermers包下modeling_t5.py下的T5EncoderModel类,将self.decoder_mindie加载路径修改为编译好的encoder的路径 -- Gitee From 4736ea28dd106afe2c4aa4ceede7810e1a9447f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 14 Oct 2024 10:38:01 +0000 Subject: [PATCH 096/110] update MindIE/MindIE-Torch/built-in/T5/main.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/main.py | 27 ++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/main.py b/MindIE/MindIE-Torch/built-in/T5/main.py index 28d85df24a..6e20f1e05e 100644 --- a/MindIE/MindIE-Torch/built-in/T5/main.py +++ b/MindIE/MindIE-Torch/built-in/T5/main.py @@ -14,29 +14,38 @@ def parse_args(): parser.add_argument("--device_id", type=int, help="NPU device id", default=0) + parser.add_argument("--performance", action="store_true") + args = parser.parse_args() return args def main(): args = parse_args() + torch.npu.set_device(args.device_id) tokenizer = AutoTokenizer.from_pretrained(args.hf_model_path) - text = ["贵州毛台现在多少钱一瓶啊,想买两瓶尝尝味道。", - "能不能帮我买点淇淋,好久没吃了", - "脑子有点胡涂了,这道题冥冥学过还没有做出来"] + text = ["今年2月26日,阿富汗塔里班的最高领秀下令销毁全国范围内所有“非伊斯兰“的古文化遗产,其中包括矗立于巴米扬的世高(大界最约58米)的立式佛像。"] t5_config = T5Config.from_pretrained(args.hf_model_path) + # model = T5ForConditionalGeneration.from_pretrained(args.hf_model_path).half().npu() model = T5ForConditionalGeneration(config=t5_config, encoder_path=args.encoder_aie_path, decoder_path=args.decoder_aie_path, device_id=args.device_id).half().npu() input_ids = tokenizer(text, return_tensors = "pt", padding=True).input_ids - outputs = model.generate(input_ids.npu(),max_new_tokens=24) + if args.performance: + input_ids = torch.randint(0,32000,(1,512)) + outputs = model.generate(input_ids.npu(),max_new_tokens=512) + print("token length : ", input_ids.shape) start_time = time.time() - outputs = model.generate(input_ids.npu(),max_new_tokens=24) - print("time_cost=", time.time()-start_time) - print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) + + outputs = 
model.generate(input_ids.npu(),max_new_tokens=512) + inference_time = time.time()-start_time + print("time_cost=", inference_time) + print("output token length : ", outputs[0].shape[0]) + print("throught output is : ", outputs[0].shape[0] / inference_time) + if not args.performance: + print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) if __name__ == "__main__": - main() - + main() \ No newline at end of file -- Gitee From bc9588c1f908c1e34e6f2753618552e942239900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 14 Oct 2024 10:40:43 +0000 Subject: [PATCH 097/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index 8e8dceb2fe..2be95322d9 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -88,11 +88,16 @@ 运行该命令后会自动生成encoder和decoder优化后的模型 -5.运行 +5.运行与性能测试 +导入环境变量:export TORCH_AIE_NPU_CACHE_MAX_SIZE=32 ```bash python main.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path} --decoder_aie_path {decoder_aie_path} --device_id 2 ``` - +性能测试: + ```bash +python main.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path} --decoder_aie_path {decoder_aie_path} --device_id 2 --performance +``` +打屏可以看到输入长度为512,输出长度为512单batch下的吞吐 参数说明: {model_path}模型所在目录 {encoder_aie_path}优化后的encoder的模型路径,要具体到.pt文件 -- Gitee From 893b55ac9144820e01f6bd6b8a98283b77e4fc59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 28 Oct 2024 05:58:52 +0000 Subject: [PATCH 098/110] update MindIE/MindIE-Torch/built-in/MT5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/MT5/readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/MT5/readme.md b/MindIE/MindIE-Torch/built-in/MT5/readme.md index 3ffa911ed6..f9c6f0ca65 100644 --- a/MindIE/MindIE-Torch/built-in/MT5/readme.md +++ b/MindIE/MindIE-Torch/built-in/MT5/readme.md @@ -16,6 +16,7 @@ # 概述 T5全称是Text-to-Text Transfer Transformer,是一种模型架构或者说是一种解决NLP任务的一种范式。把所有任务,如分类、相似度计算、文本生成都用一个Text-to-text(文本到文本)的框架里进行解决。 + 权重下载:https://huggingface.co/collections/google/t5-release-65005e7c520f8d7b4d037918 ## 输入输出数据 -- Gitee From 7806612346d3b0a0b7f44bd6f477416b9fa195b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 28 Oct 2024 05:59:26 +0000 Subject: [PATCH 099/110] update MindIE/MindIE-Torch/built-in/MT5/readme.md. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/MT5/readme.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/MT5/readme.md b/MindIE/MindIE-Torch/built-in/MT5/readme.md index f9c6f0ca65..96f0c1cb00 100644 --- a/MindIE/MindIE-Torch/built-in/MT5/readme.md +++ b/MindIE/MindIE-Torch/built-in/MT5/readme.md @@ -16,8 +16,7 @@ # 概述 T5全称是Text-to-Text Transfer Transformer,是一种模型架构或者说是一种解决NLP任务的一种范式。把所有任务,如分类、相似度计算、文本生成都用一个Text-to-text(文本到文本)的框架里进行解决。 - 权重下载:https://huggingface.co/collections/google/t5-release-65005e7c520f8d7b4d037918 - + 权重下载:https://huggingface.co/collections/google/mt5-release-65005f1a520f8d7b4d039509 ## 输入输出数据 -- Gitee From 8f4fc16e19840f81019c632371e99a34fb3b2fb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 28 Oct 2024 06:00:06 +0000 Subject: [PATCH 100/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index 2be95322d9..8170a54c3f 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -16,6 +16,7 @@ # 概述 T5的全称为Text to Text Transfer Transformer,是谷歌提出的预训练语言模型领域的通用模型,该模型将所有自然语言问题都转化成文本到文本的形式,并用一个统一的模型解决.T5最核心的理念是:使用前缀任务声明及文本答案生成,统一所有自然语言处理任务的输入和输出。在此之前的几乎所有预训练语言模型,在下游任务微调过程中都需要添加非线性层,将模型的输出转化为任务指定的输出格式。T5不需要对模型做任何改动,只需要提供下游任务的微调数据;不需要添加任何非线性层,唯一需要做的就是在输入数据前加上任务声明前缀.T5将自然语言处理任务都转化成几乎一致的格式,即输入是带有任务前缀声明的文本序列,输出的文本序列是相应任务的结果 +权重下载:https://huggingface.co/collections/google/t5-release-65005e7c520f8d7b4d037918 ## 输入输出数据 -- Gitee From 0d683e3f6791172aaf05b285d7809e44b3e6cc64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 28 Oct 2024 12:31:32 +0000 Subject: [PATCH 101/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index 8170a54c3f..44feeb7415 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -129,10 +129,10 @@ D:\python3.9\Lib\site-packages\mteb\tasks\Classification\eng\Banking77Classifica 修改transfoermers包下modeling_t5.py下的T5EncoderModel类,将self.decoder_mindie加载路径修改为编译好的encoder的路径 300IDUO卡环境下: -修改transfoermers包下modeling_t5.py下的T5EncoderModel类,增加一行,self.decoder_mindie = torch.jit.load("encoder_model_path"),其中encoder_model_path为编译好的encoder的路径,再修改forward接口为 +修改transfoermers包下modeling_t5.py下的T5EncoderModel类,增加一行,self.encoder_mindie = torch.jit.load("encoder_model_path"),其中encoder_model_path为编译好的encoder的路径,再修改forward接口为 ```bash with torch.npu.stream(self.stream): # set stream - encoder_outputs = self.decoder_mindie.forward(input_ids,attention_mask) + encoder_outputs = self.encoder_mindie.forward(input_ids,attention_mask) self.stream.synchronize() # synchronize return encoder_outputs ``` @@ -140,7 +140,7 @@ return encoder_outputs ```bash import torch - +import mindietorch import mteb from sentence_transformers import SentenceTransformer -- Gitee From 55d4b5867adb9455c6ec8c8cd734393acfad2e42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 29 Oct 2024 06:44:42 +0000 Subject: [PATCH 102/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index 44feeb7415..d7ef3160c2 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -108,7 +108,8 @@ python main.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path 6.精度测试 6.1 精度验收标准 -数据集:https://github.com/embeddings-benchmark/mteb(英文数据集选一种测试),精度和GPU推理结果对比误差小于1% +数据集:(英文数据集选一种测试),精度和GPU推理结果对比误差小于1% + 6.2 精度测试方法 6.2.1安装mteb @@ -129,10 +130,15 @@ D:\python3.9\Lib\site-packages\mteb\tasks\Classification\eng\Banking77Classifica 修改transfoermers包下modeling_t5.py下的T5EncoderModel类,将self.decoder_mindie加载路径修改为编译好的encoder的路径 300IDUO卡环境下: -修改transfoermers包下modeling_t5.py下的T5EncoderModel类,增加一行,self.encoder_mindie = torch.jit.load("encoder_model_path"),其中encoder_model_path为编译好的encoder的路径,再修改forward接口为 +修改transfoermers包下modeling_t5.py下的T5EncoderModel类,增加2行, +```bash +self.encoder_mindie = torch.jit.load("encoder_model_path") +self.stream = torch.npu.Stream(f"npu:{device_id}") +``` +其中encoder_model_path为编译好的encoder的路径,device_id为当前设置的npu卡号,再修改forward接口为 ```bash with torch.npu.stream(self.stream): # set stream - encoder_outputs = self.encoder_mindie.forward(input_ids,attention_mask) + encoder_outputs = self.encoder_mindie.forward(input_ids,attention_mask)[0] self.stream.synchronize() # synchronize return encoder_outputs ``` -- Gitee From d049285be09fac63608dba6f5d92d427cc527a01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 29 Oct 2024 07:40:37 +0000 Subject: [PATCH 103/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index d7ef3160c2..f8b822ef23 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -140,7 +140,7 @@ self.stream = torch.npu.Stream(f"npu:{device_id}") with torch.npu.stream(self.stream): # set stream encoder_outputs = self.encoder_mindie.forward(input_ids,attention_mask)[0] self.stream.synchronize() # synchronize -return encoder_outputs +return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=encoder_outputs) ``` 6.2.5测试代码 -- Gitee From 730e775598128eb71b360e0f9ec35c50ac59887d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 30 Oct 2024 01:29:50 +0000 Subject: [PATCH 104/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index f8b822ef23..d2c038770d 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -138,9 +138,9 @@ self.stream = torch.npu.Stream(f"npu:{device_id}") 其中encoder_model_path为编译好的encoder的路径,device_id为当前设置的npu卡号,再修改forward接口为 ```bash with torch.npu.stream(self.stream): # set stream - encoder_outputs = self.encoder_mindie.forward(input_ids,attention_mask)[0] + encoder_outputs = self.encoder_mindie.forward(input_ids,attention_mask) self.stream.synchronize() # synchronize -return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=encoder_outputs) +return encoder_outputs ``` 6.2.5测试代码 -- Gitee From 8e7c39921e83718866965117c4ad3f207be91da5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 31 Oct 2024 12:18:49 +0000 Subject: [PATCH 105/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_t5.patch | 83 ++++++++++--------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch index 8923d7b3d4..15f81df2a4 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch @@ -1,17 +1,20 @@ -diff --git a/modeling_t5_origin.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py -index 224769fdf..8a8f9a23a 100644 ---- a/modeling_t5_origin.py +diff --git a/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5_origin.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py +index 224769f..24f868b 100644 +--- a/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5_origin.py +++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py -@@ -19,7 +19,7 @@ import math +@@ -19,8 +19,10 @@ import math import os import warnings from typing import List, Optional, Tuple, Union - +from dataclasses import dataclass import torch ++import torch_npu ++import mindietorch from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -@@ -28,13 +28,12 @@ from ...activations import ACT2FN + +@@ -28,13 +30,12 @@ from ...activations import ACT2FN from ...modeling_outputs import ( BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, @@ -26,7 +29,7 @@ index 224769fdf..8a8f9a23a 100644 from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( DUMMY_INPUTS, -@@ -47,7 +46,43 @@ from ...utils import ( +@@ -47,7 +48,43 @@ from ...utils import ( ) from ...utils.model_parallel_utils import assert_device_map, get_device_map from .configuration_t5 import T5Config @@ -70,7 +73,7 @@ index 224769fdf..8a8f9a23a 100644 logger = logging.get_logger(__name__) -@@ -448,7 +483,10 @@ class T5Attention(nn.Module): +@@ -448,7 +485,10 @@ class T5Attention(nn.Module): mask=None, key_value_states=None, position_bias=None, @@ -82,7 +85,7 @@ index 224769fdf..8a8f9a23a 100644 layer_head_mask=None, query_length=None, use_cache=False, -@@ -464,12 +502,8 @@ class T5Attention(nn.Module): +@@ -464,12 +504,8 @@ class T5Attention(nn.Module): real_seq_length = seq_length @@ -97,7 +100,7 @@ index 224769fdf..8a8f9a23a 100644 key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] -@@ -493,16 +527,17 @@ class T5Attention(nn.Module): +@@ -493,16 +529,17 @@ class T5Attention(nn.Module): hidden_states = shape(proj_layer(key_value_states)) if past_key_value is not None: @@ -121,7 +124,7 @@ index 224769fdf..8a8f9a23a 100644 else: # cross-attn hidden_states = past_key_value -@@ -513,17 +548,16 @@ class T5Attention(nn.Module): +@@ -513,17 +550,16 @@ class T5Attention(nn.Module): # get key/value states key_states = project( @@ -142,7 +145,7 @@ index 224769fdf..8a8f9a23a 100644 if position_bias is None: if not self.has_relative_attention_bias: position_bias = torch.zeros( -@@ -536,7 +570,7 @@ class T5Attention(nn.Module): +@@ -536,7 +572,7 @@ class T5Attention(nn.Module): # if key and values are already calculated # we want only the last query position bias @@ -151,7 +154,7 @@ index 224769fdf..8a8f9a23a 100644 position_bias = position_bias[:, :, 
-hidden_states.size(1) :, :] if mask is not None: -@@ -548,7 +582,6 @@ class T5Attention(nn.Module): +@@ -548,7 +584,6 @@ class T5Attention(nn.Module): position_bias_masked = position_bias[:, mask.bool()] else: position_bias_masked = position_bias @@ -159,7 +162,7 @@ index 224769fdf..8a8f9a23a 100644 scores += position_bias_masked attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( scores -@@ -564,18 +597,131 @@ class T5Attention(nn.Module): +@@ -564,18 +599,131 @@ class T5Attention(nn.Module): attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) attn_output = self.o(attn_output) @@ -294,7 +297,7 @@ index 224769fdf..8a8f9a23a 100644 self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) -@@ -585,7 +731,8 @@ class T5LayerSelfAttention(nn.Module): +@@ -585,7 +733,8 @@ class T5LayerSelfAttention(nn.Module): attention_mask=None, position_bias=None, layer_head_mask=None, @@ -304,7 +307,7 @@ index 224769fdf..8a8f9a23a 100644 use_cache=False, output_attentions=False, ): -@@ -595,7 +742,8 @@ class T5LayerSelfAttention(nn.Module): +@@ -595,7 +744,8 @@ class T5LayerSelfAttention(nn.Module): mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, @@ -314,7 +317,7 @@ index 224769fdf..8a8f9a23a 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -618,7 +766,8 @@ class T5LayerCrossAttention(nn.Module): +@@ -618,7 +768,8 @@ class T5LayerCrossAttention(nn.Module): attention_mask=None, position_bias=None, layer_head_mask=None, @@ -324,7 +327,7 @@ index 224769fdf..8a8f9a23a 100644 use_cache=False, query_length=None, output_attentions=False, -@@ -630,7 +779,8 @@ class T5LayerCrossAttention(nn.Module): +@@ -630,7 +781,8 @@ class T5LayerCrossAttention(nn.Module): key_value_states=key_value_states, position_bias=position_bias, layer_head_mask=layer_head_mask, @@ -334,7 +337,7 @@ index 224769fdf..8a8f9a23a 100644 use_cache=use_cache, query_length=query_length, output_attentions=output_attentions, -@@ -661,39 +811,34 @@ class T5Block(nn.Module): +@@ -661,39 +813,34 @@ class T5Block(nn.Module): encoder_decoder_position_bias=None, layer_head_mask=None, cross_attn_layer_head_mask=None, @@ -388,7 +391,7 @@ index 224769fdf..8a8f9a23a 100644 # clamp inf values to enable fp16 training if hidden_states.dtype == torch.float16: -@@ -706,22 +851,23 @@ class T5Block(nn.Module): +@@ -706,22 +853,23 @@ class T5Block(nn.Module): do_cross_attention = self.is_decoder and encoder_hidden_states is not None if do_cross_attention: @@ -417,7 +420,7 @@ index 224769fdf..8a8f9a23a 100644 output_attentions=output_attentions, ) hidden_states = cross_attention_outputs[0] -@@ -736,11 +882,9 @@ class T5Block(nn.Module): +@@ -736,11 +884,9 @@ class T5Block(nn.Module): hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) # Combine self attn and cross attn key value states @@ -431,7 +434,7 @@ index 224769fdf..8a8f9a23a 100644 # Apply Feed Forward layer hidden_states = self.layer[-1](hidden_states) -@@ -757,7 +901,7 @@ class T5Block(nn.Module): +@@ -757,7 +903,7 @@ class T5Block(nn.Module): outputs = (hidden_states,) if use_cache: @@ -440,7 +443,7 @@ index 224769fdf..8a8f9a23a 100644 else: outputs = outputs + attention_outputs -@@ -897,11 +1041,15 @@ class T5PreTrainedModel(PreTrainedModel): +@@ -897,11 +1043,15 @@ class T5PreTrainedModel(PreTrainedModel): class T5Stack(T5PreTrainedModel): @@ -457,7 +460,7 @@ index 224769fdf..8a8f9a23a 100644 
self.block = nn.ModuleList( [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -966,20 +1114,63 @@ class T5Stack(T5PreTrainedModel): +@@ -966,20 +1116,63 @@ class T5Stack(T5PreTrainedModel): def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings @@ -523,7 +526,7 @@ index 224769fdf..8a8f9a23a 100644 ): # Model parallel if self.model_parallel: -@@ -998,8 +1189,10 @@ class T5Stack(T5PreTrainedModel): +@@ -998,8 +1191,10 @@ class T5Stack(T5PreTrainedModel): f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: @@ -534,7 +537,7 @@ index 224769fdf..8a8f9a23a 100644 elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: -@@ -1012,18 +1205,19 @@ class T5Stack(T5PreTrainedModel): +@@ -1012,18 +1207,19 @@ class T5Stack(T5PreTrainedModel): inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = input_shape @@ -559,7 +562,7 @@ index 224769fdf..8a8f9a23a 100644 if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) -@@ -1054,7 +1248,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1054,7 +1250,8 @@ class T5Stack(T5PreTrainedModel): # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) @@ -569,7 +572,7 @@ index 224769fdf..8a8f9a23a 100644 all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1062,8 +1257,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1062,8 +1259,8 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) @@ -580,7 +583,7 @@ index 224769fdf..8a8f9a23a 100644 layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] # Model parallel -@@ -1112,7 +1307,10 @@ class T5Stack(T5PreTrainedModel): +@@ -1112,7 +1309,10 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, @@ -592,7 +595,7 @@ index 224769fdf..8a8f9a23a 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -1120,19 +1318,20 @@ class T5Stack(T5PreTrainedModel): +@@ -1120,19 +1320,20 @@ class T5Stack(T5PreTrainedModel): # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: @@ -618,7 +621,7 @@ index 224769fdf..8a8f9a23a 100644 if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) -@@ -1146,7 +1345,7 @@ class T5Stack(T5PreTrainedModel): +@@ -1146,7 +1347,7 @@ class T5Stack(T5PreTrainedModel): hidden_states = hidden_states.to("cuda:" + str(k + 1)) hidden_states = self.final_layer_norm(hidden_states) @@ -627,7 +630,7 @@ index 224769fdf..8a8f9a23a 100644 # Add last layer if output_hidden_states: -@@ -1164,13 +1363,216 @@ class T5Stack(T5PreTrainedModel): +@@ -1164,13 +1365,216 @@ class T5Stack(T5PreTrainedModel): ] if v is not None ) @@ -850,7 +853,7 @@ index 224769fdf..8a8f9a23a 100644 T5_START_DOCSTRING = r""" -@@ -1541,6 +1943,38 @@ class T5Model(T5PreTrainedModel): +@@ -1541,6 +1945,38 @@ class T5Model(T5PreTrainedModel): ) @@ 
-889,7 +892,7 @@ index 224769fdf..8a8f9a23a 100644 @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) class T5ForConditionalGeneration(T5PreTrainedModel): _keys_to_ignore_on_load_unexpected = [ -@@ -1548,28 +1982,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1548,28 +1984,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] @@ -960,7 +963,7 @@ index 224769fdf..8a8f9a23a 100644 # Model parallel self.model_parallel = False -@@ -1637,25 +2094,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1637,25 +2096,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) @@ -987,7 +990,7 @@ index 224769fdf..8a8f9a23a 100644 r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., -@@ -1687,113 +2126,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1687,113 +2128,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) >>> # studies have shown that owning a dog is good for you. ```""" @@ -1126,7 +1129,7 @@ index 224769fdf..8a8f9a23a 100644 attention_mask=None, head_mask=None, decoder_head_mask=None, -@@ -1804,8 +2167,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1804,8 +2169,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): **kwargs, ): # cut decoder_input_ids if past_key_values is used @@ -1137,7 +1140,7 @@ index 224769fdf..8a8f9a23a 100644 # Some generation methods already pass only the last input ID if input_ids.shape[1] > past_length: -@@ -1813,12 +2176,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1813,12 +2178,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): else: # Default to old behavior: keep only final ID remove_prefix_length = input_ids.shape[1] - 1 @@ -1154,7 +1157,7 @@ index 224769fdf..8a8f9a23a 100644 "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "head_mask": head_mask, -@@ -1826,6 +2191,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1826,6 +2193,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): "decoder_attention_mask": decoder_attention_mask, "cross_attn_head_mask": cross_attn_head_mask, "use_cache": use_cache, @@ -1162,7 +1165,7 @@ index 224769fdf..8a8f9a23a 100644 } def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): -@@ -1861,6 +2227,459 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1861,6 +2229,459 @@ class T5ForConditionalGeneration(T5PreTrainedModel): reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) return reordered_decoder_past @@ -1622,7 +1625,7 @@ index 224769fdf..8a8f9a23a 100644 @add_start_docstrings( "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", -@@ -1967,7 +2786,6 @@ class T5EncoderModel(T5PreTrainedModel): +@@ -1967,7 +2788,6 @@ class T5EncoderModel(T5PreTrainedModel): >>> last_hidden_states = outputs.last_hidden_state ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict -- Gitee From a06d09d9f5f23604a72340a31b1d048fd2dc8895 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 31 Oct 2024 12:20:27 +0000 Subject: [PATCH 106/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index d2c038770d..49bd28021c 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -146,6 +146,7 @@ return encoder_outputs ```bash import torch +import torch_npu import mindietorch import mteb from sentence_transformers import SentenceTransformer -- Gitee From c69001ff85670380dc7da2d882c40c1a48a1e92a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 31 Oct 2024 13:05:50 +0000 Subject: [PATCH 107/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index 49bd28021c..3423c7c155 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -150,7 +150,7 @@ import torch_npu import mindietorch import mteb from sentence_transformers import SentenceTransformer - +torch.npu.set_device(0) model_name = "D:\downloads\T5-v2" model = SentenceTransformer(model_name,model_kwargs={"torch_dtype":torch.float16}) tasks = mteb.get_tasks(tasks=["CLSClusteringP2P"]) -- Gitee From 3df3f5bd0dec297e8b768e99d2b6ab690b408cfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 5 Dec 2024 09:29:52 +0000 Subject: [PATCH 108/110] update MindIE/MindIE-Torch/built-in/MT5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/MT5/readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/MT5/readme.md b/MindIE/MindIE-Torch/built-in/MT5/readme.md index 96f0c1cb00..3d5e155a81 100644 --- a/MindIE/MindIE-Torch/built-in/MT5/readme.md +++ b/MindIE/MindIE-Torch/built-in/MT5/readme.md @@ -84,6 +84,7 @@ 运行该命令后会自动生成encoder和decoder优化后的模型 5.精度测试 +sentense-transformers版本必须是3.1.1 ```bash python test_mt5.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path} --decoder_aie_path {decoder_aie_path} --device_id device_id ``` -- Gitee From 3fa2f9067a94978f7e484c03400f68b15072ba63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 10 Dec 2024 06:17:48 +0000 Subject: [PATCH 109/110] update MindIE/MindIE-Torch/built-in/MT5/readme.md. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/MT5/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/MT5/readme.md b/MindIE/MindIE-Torch/built-in/MT5/readme.md index 3d5e155a81..4862823913 100644 --- a/MindIE/MindIE-Torch/built-in/MT5/readme.md +++ b/MindIE/MindIE-Torch/built-in/MT5/readme.md @@ -84,7 +84,7 @@ 运行该命令后会自动生成encoder和decoder优化后的模型 5.精度测试 -sentense-transformers版本必须是3.1.1 + ```bash python test_mt5.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path} --decoder_aie_path {decoder_aie_path} --device_id device_id ``` -- Gitee From 2354286269b76e811e9422137267499cf0480909 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 10 Dec 2024 06:19:48 +0000 Subject: [PATCH 110/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index 3423c7c155..a8cd519940 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -112,9 +112,10 @@ python main.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path 6.2 精度测试方法 -6.2.1安装mteb +6.2.1安装mteb和sentence-transformers ```bash +pip install sentence-transformers==3.1.1 pip install mteb ``` 6.2.2 下载mteb数据集(如果机器可以连接外部网络可以跳过这步) -- Gitee
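For reference, the accuracy-test snippet that the readme patches above assemble step by step (the torch_npu/mindietorch imports, the torch.npu.set_device call, and the sentence-transformers 3.1.1 pin) can be collected into one script. This is a minimal sketch only: the checkpoint path, NPU device id, MTEB task name, and output folder are placeholders, and it assumes the modeling_t5.py edit from step 6.2.4 has already been applied so that T5EncoderModel routes its forward pass through the compiled MindIE encoder.

```python
# Consolidated accuracy-test sketch.
# Assumptions: checkpoint path, NPU id, task name and output folder are placeholders;
# mteb and sentence-transformers==3.1.1 are installed; modeling_t5.py has been
# modified per step 6.2.4 so T5EncoderModel calls the compiled MindIE encoder.
import torch
import torch_npu      # registers the Ascend NPU backend with torch
import mindietorch    # needed so the compiled MindIE-Torch encoder can be deserialized
import mteb
from sentence_transformers import SentenceTransformer

torch.npu.set_device(0)  # NPU card id; match the device used when compiling the encoder

model_path = "/path/to/t5-checkpoint"  # placeholder: local model directory
model = SentenceTransformer(model_path, model_kwargs={"torch_dtype": torch.float16})

# Any task from the MTEB benchmark can be substituted here; the readme example uses CLSClusteringP2P.
tasks = mteb.get_tasks(tasks=["CLSClusteringP2P"])
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder="./mteb_results")  # result files are written to this folder
```

As described in step 6.2.6, the evaluation writes its result files to the output_folder passed to evaluation.run, and those scores can then be compared against the GPU baseline under the 1% tolerance from section 6.1.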