From edb41b343375369ba0b943100177d67a12d987a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Thu, 29 Aug 2024 02:41:56 +0000
Subject: [PATCH 001/110] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20T5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 MindIE/MindIE-Torch/built-in/T5/.keep | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 MindIE/MindIE-Torch/built-in/T5/.keep

diff --git a/MindIE/MindIE-Torch/built-in/T5/.keep b/MindIE/MindIE-Torch/built-in/T5/.keep
new file mode 100644
index 0000000000..e69de29bb2
-- 
Gitee

From 11d9724cff613b632f2bd41e67db0f624f1fd26d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Thu, 29 Aug 2024 02:44:45 +0000
Subject: [PATCH 002/110] add MindIE/MindIE-Torch/built-in/T5/export_t5.py.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 甄文奇
---
 MindIE/MindIE-Torch/built-in/T5/export_t5.py | 181 +++++++++++++++++++
 1 file changed, 181 insertions(+)
 create mode 100644 MindIE/MindIE-Torch/built-in/T5/export_t5.py

diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5.py b/MindIE/MindIE-Torch/built-in/T5/export_t5.py
new file mode 100644
index 0000000000..2b421aff68
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/T5/export_t5.py
@@ -0,0 +1,181 @@
+
+import torch
+import torch_npu
+import argparse
+import os
+import mindietorch
+from transformers import T5ForConditionalGeneration
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="./models",
+        help="save dir"
+    )
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default="./DeepFloyd--t5-v1_1-xxl",
+        help="encoder model path"
+    )
+    parser.add_argument(
+        "--max_batchsize",
+        type=int,
+        default=1,
+        help="max batchsize when running"
+    )
+
+    parser.add_argument(
+        "--max_input_seq_len",
+        type=int,
+        default=256,
+        help="max input_sequence length when running"
+    )
+
+
+    parser.add_argument(
+        "--device_id",
+        type=int,
+        default=0,
+        help="npu device id"
+    )
+    return parser.parse_args()
+
+
+class TextEncoderExport(torch.nn.Module):
+    def __init__(self, textencoder_model):
+        super(TextEncoderExport, self).__init__()
+        self.textencoder_model = textencoder_model
+
+    def forward(self, input_ids):
+        return self.textencoder_model(input_ids=input_ids)
+
+class TextDecoderExport(torch.nn.Module):
+    def __init__(self, textdecoder_model):
+        super(TextDecoderExport, self).__init__()
+        self.textdecoder_model = textdecoder_model
+
+    def forward(self,
+                input_ids,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                past_key_values,
+                past_cross_key_values):
+        return self.textdecoder_model(input_ids=input_ids,
+                                      encoder_hidden_states=encoder_hidden_states,
+                                      encoder_attention_mask=encoder_attention_mask,
+                                      past_key_values=past_key_values,
+                                      past_cross_key_values=past_cross_key_values,
+                                      return_dict=True)
+
+def export_textencoder(args, model, save_dir, batch_size):
+    encoder_path = os.path.join(save_dir, "encoder")
+    if not os.path.exists(encoder_path):
+        os.makedirs(encoder_path, mode=0o750)
+    traced_path = os.path.join(encoder_path, "encoder.pt")
+    compiled_path = os.path.join(encoder_path, "encoder_compiled.pt")
+    if not os.path.exists(traced_path):
+        text_encoder = model.encoder
+        dummy_input = (
+            torch.ones([1, 128], dtype=torch.int64).npu()
+        )
+        encoder = TextEncoderExport(text_encoder)
+        encoder.eval()
+        torch.jit.trace(encoder, dummy_input, strict=False).save(traced_path)
+    if not os.path.exists(compiled_path):
+        model = torch.jit.load(traced_path).eval()
+
+        inputs0 = []
+        # inputs1 = []
+        inputs0.append(mindietorch.Input(min_shape=(1, 1), max_shape=(args.max_batchsize, args.max_input_seq_len), dtype=torch.int64))
+        print("compiling encoder")
+        compiled_model = mindietorch.compile(
+            model,
+            inputs=inputs0,
+            allow_tensor_replace_int=True,
+            require_full_compilation=False,
+            truncate_long_and_double=True,
+            precision_policy=mindietorch.PrecisionPolicy.FP16,
+            soc_version="Ascend910B4",
+            optimization_level=0
+        )
+        compiled_model.save(compiled_path)
+
+def export_textdecoder(args, model, save_dir, batch_size):
+    decoder_path = os.path.join(save_dir, "decoder")
+    if not os.path.exists(decoder_path):
+        os.makedirs(decoder_path, mode=0o750)
+    traced_path = os.path.join(decoder_path, "decoder.pt")
+    compiled_path = os.path.join(decoder_path, "decoder_compiled.pt")
+    model_path = args.model_path
+    max_length = 120
+    if not os.path.exists(traced_path):
+        text_decoder = model.decoder
+        dummy_input = (
+            torch.ones([1, 1], dtype=torch.int64).npu(),
+            torch.randn(1, 16, 512).to(torch.float16).npu(),
+            torch.ones(1, 16).npu(),
+            torch.randn(6, 2, 1, 8, 1, 64).to(torch.float16).npu(),
+            torch.randn(6, 2, 1, 8, 24, 64).to(torch.float16).npu()
+        )
+        decoder = TextDecoderExport(text_decoder).npu()
+        decoder.eval()
+        torch.jit.trace(decoder, dummy_input, strict=False).save(traced_path)
+    if not os.path.exists(compiled_path):
+        model = torch.jit.load(traced_path).eval()
+        print("compiling decoder")
+        compiled_model = mindietorch.compile(
+            model,
+            inputs=[mindietorch.Input(min_shape=(1, 1),
+                                      max_shape=(args.max_batchsize, 1),
+                                      dtype=mindietorch.dtype.INT64),
+
+                    mindietorch.Input(min_shape=(1, 1, 512),
+                                      max_shape=(args.max_batchsize, args.max_input_seq_len, 512),
+                                      dtype=mindietorch.dtype.FLOAT16),
+
+                    mindietorch.Input(min_shape=(1, 1),
+                                      max_shape=(args.max_batchsize, args.max_input_seq_len),
+                                      dtype=mindietorch.dtype.INT64),
+                    mindietorch.Input(min_shape=(6, 2, 1, 8, 0, 64),
+                                      max_shape=(6, 2, args.max_batchsize, 8, args.max_input_seq_len, 64),
+                                      dtype=mindietorch.dtype.FLOAT16),
+
+                    mindietorch.Input(min_shape=(6, 2, 1, 8, 1, 64),
+                                      max_shape=(6, 2, args.max_batchsize, 8, args.max_input_seq_len, 64),
+                                      dtype=mindietorch.dtype.FLOAT16)],
+            allow_tensor_replace_int=True,
+            require_full_compilation=False,
+            truncate_long_and_double=True,
+            precision_policy=mindietorch.PrecisionPolicy.FP16,
+            soc_version="Ascend910B4",
+            optimization_level=0
+        )
+        compiled_model.save(compiled_path)
+
+def main():
+    args = parse_arguments()
+    device_id = args.device_id
+    save_dir = args.output_dir
+    torch.npu.set_device(device_id)
+    batch_size = 1
+    model = T5ForConditionalGeneration.from_pretrained(args.model_path, torch_dtype=torch.float).npu()
+    encoder_path = os.path.join(save_dir, "encoder")
+    compiled_path = os.path.join(encoder_path, "encoder_compiled.pt")
+    if not os.path.exists(compiled_path):
+        export_textencoder(args, model, save_dir, batch_size)
+        print("export encoder_model done!")
+
+    decoder_path = os.path.join(save_dir, "decoder")
+    compiled_path = os.path.join(decoder_path, "decoder_compiled.pt")
+    if not os.path.exists(compiled_path):
+        export_textdecoder(args, model, save_dir, batch_size)
+        print("export decoder_model done!")
+
+
+
+
+if __name__ == "__main__":
+    main()
-- 
Gitee

From 978760fe5e3cc2deaaa3641b4000c5c346385d67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Thu, 29 Aug 2024 03:28:31 +0000
Subject: [PATCH 003/110] transformers patch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/modeling_outputs.patch | 0 .../MindIE-Torch/built-in/modeling_t5.patch | 819 ++++++++++++++++++ .../built-in/modeling_utils.patch | 0 MindIE/MindIE-Torch/built-in/utils.patch | 0 4 files changed, 819 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/modeling_outputs.patch create mode 100644 MindIE/MindIE-Torch/built-in/modeling_t5.patch create mode 100644 MindIE/MindIE-Torch/built-in/modeling_utils.patch create mode 100644 MindIE/MindIE-Torch/built-in/utils.patch diff --git a/MindIE/MindIE-Torch/built-in/modeling_outputs.patch b/MindIE/MindIE-Torch/built-in/modeling_outputs.patch new file mode 100644 index 0000000000..e69de29bb2 diff --git a/MindIE/MindIE-Torch/built-in/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/modeling_t5.patch new file mode 100644 index 0000000000..4a376cf5eb --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/modeling_t5.patch @@ -0,0 +1,819 @@ +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py 2024-08-29 11:11:23.852000000 +0800 ++++ modeling_t5.py 2024-08-29 11:19:34.572000000 +0800 +@@ -23,8 +23,6 @@ from typing import List, Optional, Tuple + import torch + from torch import nn + from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +-import torch_npu +-import mindietorch + + from ...activations import ACT2FN + from ...modeling_outputs import ( +@@ -246,7 +244,7 @@ class T5LayerNorm(nn.Module): + + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) +- # print("self.weight.dtype=",self.weight.dtype) ++ + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) +@@ -451,7 +449,6 @@ class T5Attention(nn.Module): + key_value_states=None, + position_bias=None, + past_key_value=None, +- past_cross_key_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, +@@ -468,8 +465,7 @@ class T5Attention(nn.Module): + real_seq_length = seq_length + + if past_key_value is not None: +- if past_key_value.shape[0] != 2: +- # if len(past_key_value) != 2: ++ if len(past_key_value) != 2: + raise ValueError( + f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" + ) +@@ -497,7 +493,6 @@ class T5Attention(nn.Module): + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: +- past_key_value = shape(past_key_value) + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) +@@ -571,261 +566,7 @@ class T5Attention(nn.Module): + + present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- # print("output_attentions=",output_attentions) +- if output_attentions: +- outputs = outputs + (attn_weights,) +- return outputs +- +- +-class T5SelfAttention(T5Attention): +- def __init__(self, config: T5Config, has_relative_attention_bias=False): +- super().__init__(config, has_relative_attention_bias) +- +- def forward( +- self, +- hidden_states, +- mask=None, +- position_bias=None, +- past_key_value=None, +- layer_head_mask=None, +- use_cache=False, +- output_attentions=False, +- ): +- """ +- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). +- """ +- # Input is (batch_size, seq_length, dim) +- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) +- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) +- batch_size, seq_length = hidden_states.shape[:2] +- +- real_seq_length = seq_length +- +- if past_key_value is not None: +- if past_key_value.shape[0] != 2: +- # if len(past_key_value) != 2: +- raise ValueError( +- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" +- ) +- real_seq_length += past_key_value[0].shape[2] +- # print("key_value_states=",real_seq_length) +- key_length = real_seq_length +- +- def shape(states): +- """projection""" +- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) +- +- def unshape(states): +- """reshape""" +- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) +- +- def project(hidden_states, proj_layer, past_key_value): +- """projects hidden states correctly to key/query states""" +- if past_key_value is None: +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(hidden_states)) +- +- if past_key_value is not None: +- hidden_states = shape(proj_layer(hidden_states)) +- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) +- return hidden_states +- +- # get query states +- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) +- +- # get key/value states +- key_states = project( +- hidden_states, self.k, past_key_value[0] if past_key_value is not None else None +- ) +- value_states = project( +- hidden_states, self.v, past_key_value[1] if past_key_value is not None else None +- ) +- # print("key_states=",hidden_states.dtype,key_states.dtype) +- # compute scores +- scores = torch.matmul( +- query_states, key_states.transpose(3, 2) +- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 +- # print("scores=",scores.dtype) +- if position_bias is None: +- if not self.has_relative_attention_bias: +- position_bias = torch.zeros( +- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype +- ) +- if self.gradient_checkpointing and self.training: +- position_bias.requires_grad = True +- else: +- position_bias 
= self.compute_bias(real_seq_length, key_length, device=scores.device) +- +- # if key and values are already calculated +- # we want only the last query position bias +- if past_key_value is not None: +- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] +- +- if mask is not None: +- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) +- +- if self.pruned_heads: +- mask = torch.ones(position_bias.shape[1]) +- mask[list(self.pruned_heads)] = 0 +- position_bias_masked = position_bias[:, mask.bool()] +- else: +- position_bias_masked = position_bias +- +- scores += position_bias_masked +- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( +- scores +- ) # (batch_size, n_heads, seq_length, key_length) +- attn_weights = nn.functional.dropout( +- attn_weights, p=self.dropout, training=self.training +- ) # (batch_size, n_heads, seq_length, key_length) +- +- # Mask heads if we want to +- if layer_head_mask is not None: +- attn_weights = attn_weights * layer_head_mask +- +- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) +- attn_output = self.o(attn_output) + +- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None +- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- # print("output_attentions=",output_attentions) +- if output_attentions: +- outputs = outputs + (attn_weights,) +- return outputs +- +- +-class T5CrossAttention(T5Attention): +- def __init__(self, config: T5Config, has_relative_attention_bias=False): +- super().__init__(config, has_relative_attention_bias) +- +- def forward( +- self, +- hidden_states, +- mask=None, +- key_value_states=None, +- position_bias=None, +- past_cross_key_value=None, +- layer_head_mask=None, +- query_length=None, +- use_cache=False, +- output_attentions=False, +- ): +- """ +- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). +- """ +- # Input is (batch_size, seq_length, dim) +- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) +- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) +- batch_size, seq_length = hidden_states.shape[:2] +- +- real_seq_length = seq_length +- +- if past_key_value is not None: +- if past_key_value.shape[0] != 2: +- # if len(past_key_value) != 2: +- raise ValueError( +- f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" +- ) +- real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length +- # print("key_value_states=",key_value_states, real_seq_length) +- key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] +- +- def shape(states): +- """projection""" +- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) +- +- def unshape(states): +- """reshape""" +- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) +- +- def project(hidden_states, proj_layer, key_value_states, past_key_value): +- """projects hidden states correctly to key/query states""" +- if key_value_states is None: +- # self-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(hidden_states)) +- elif past_key_value is None: +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(key_value_states)) +- +- if past_key_value is not None: +- if key_value_states is None: +- # self-attn +- # (batch_size, n_heads, key_length, dim_per_head) +- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) +- # print("hidden_states=",hidden_states.shape) +- elif past_key_value.shape[2] != key_value_states.shape[1]: +- # checking that the `sequence_length` of the `past_key_value` is the same as +- # the provided `key_value_states` to support prefix tuning +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(key_value_states)) +- else: +- # cross-attn +- hidden_states = past_key_value +- return hidden_states +- +- # get query states +- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) +- +- # get key/value states +- key_states = project( +- hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None +- ) +- value_states = project( +- hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None +- ) +- +- # compute scores +- scores = torch.matmul( +- query_states, key_states.transpose(3, 2) +- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 +- +- if position_bias is None: +- if not self.has_relative_attention_bias: +- position_bias = torch.zeros( +- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype +- ) +- if self.gradient_checkpointing and self.training: +- position_bias.requires_grad = True +- else: +- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) +- +- # if key and values are already calculated +- # we want only the last query position bias +- if past_key_value is not None: +- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] +- +- if mask is not None: +- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) +- +- if self.pruned_heads: +- mask = torch.ones(position_bias.shape[1]) +- mask[list(self.pruned_heads)] = 0 +- position_bias_masked = position_bias[:, mask.bool()] +- else: +- position_bias_masked = position_bias +- +- scores += position_bias_masked +- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( +- scores +- ) # (batch_size, n_heads, seq_length, key_length) +- attn_weights = nn.functional.dropout( +- attn_weights, p=self.dropout, training=self.training +- ) # (batch_size, n_heads, seq_length, key_length) +- +- # Mask heads 
if we want to +- if layer_head_mask is not None: +- attn_weights = attn_weights * layer_head_mask +- +- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) +- attn_output = self.o(attn_output) +- +- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None +- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- # print("output_attentions=",output_attentions) + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs +@@ -834,7 +575,7 @@ class T5CrossAttention(T5Attention): + class T5LayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() +- self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) ++ self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + +@@ -921,7 +662,6 @@ class T5Block(nn.Module): + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, +- past_cross_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, +@@ -931,17 +671,15 @@ class T5Block(nn.Module): + logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + +- # if len(past_key_value) != expected_num_past_key_values: +- # raise ValueError( +- # f"There should be {expected_num_past_key_values} past states. " +- # f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" +- # f"Got {len(past_key_value)} past key / value states" +- # ) +- +- self_attn_past_key_value = past_key_value +- # print("self_attn_past_key_value=",self_attn_past_key_value.dtype) +- cross_attn_past_key_value = past_cross_key_value +- # cross_attn_past_key_value = past_key_value[2:] ++ if len(past_key_value) != expected_num_past_key_values: ++ raise ValueError( ++ f"There should be {expected_num_past_key_values} past states. " ++ f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" ++ f"Got {len(past_key_value)} past key / value states" ++ ) ++ ++ self_attn_past_key_value = past_key_value[:2] ++ cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + +@@ -955,8 +693,6 @@ class T5Block(nn.Module): + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] +- # if self.is_decoder: +- # print("present_key_value_state=",present_key_value_state[0].dtype) + attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training +@@ -967,7 +703,7 @@ class T5Block(nn.Module): + torch.finfo(hidden_states.dtype).max, + ) + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) +- present_cross_key_value_state = () ++ + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: + # the actual query length is unknown for cross attention +@@ -1000,10 +736,9 @@ class T5Block(nn.Module): + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states +- # if present_key_value_state is not None: +- # present_key_value_state = present_key_value_state + cross_attention_outputs[1] +- cross_attn_past_key_values = cross_attention_outputs[1] +- # print("cross_attn_past_key_values=",cross_attn_past_key_values) ++ if present_key_value_state is not None: ++ present_key_value_state = present_key_value_state + cross_attention_outputs[1] ++ + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + +@@ -1022,7 +757,7 @@ class T5Block(nn.Module): + outputs = (hidden_states,) + + if use_cache: +- outputs = outputs + (present_key_value_state,) +(cross_attn_past_key_values,)+ attention_outputs ++ outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + +@@ -1162,14 +897,11 @@ class T5PreTrainedModel(PreTrainedModel) + + + class T5Stack(T5PreTrainedModel): +- def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskeyvalue=None): ++ def __init__(self, config, embed_tokens=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder +- self.lm_head=lm_head +- self.encodecrosskeyvalue = encodecrosskeyvalue +- self.model_dim = config.d_model + + self.block = nn.ModuleList( + [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] +@@ -1237,21 +969,19 @@ class T5Stack(T5PreTrainedModel): + def forward( + self, + input_ids=None, ++ attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, +- past_key_values=None, +- past_cross_key_values=None, +- attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, ++ past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + # Model parallel +- # print("aaaaaaaaaaaaaaaaa") + if self.model_parallel: + torch.cuda.set_device(self.first_device) + self.embed_tokens = self.embed_tokens.to(self.first_device) +@@ -1291,13 +1021,9 @@ class T5Stack(T5PreTrainedModel): + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") + + # initialize past_key_values with `None` if past does not exist +- #modified +- # if 
past_key_values is None: +- # past_key_values = [None] * len(self.block) +- #added +- if not self.is_decoder: ++ if past_key_values is None: + past_key_values = [None] * len(self.block) +- past_cross_key_values = [None] * len(self.block) ++ + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + +@@ -1328,10 +1054,7 @@ class T5Stack(T5PreTrainedModel): + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) +- # present_key_value_states = () if use_cache else None +- # present_cross_key_value_states = () if use_cache else None +- present_key_value_states = [] if use_cache else None +- # present_cross_key_value_states = [] if use_cache else None ++ present_key_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None +@@ -1339,10 +1062,8 @@ class T5Stack(T5PreTrainedModel): + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) +- for i, layer_module in enumerate(self.block): +- # for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): +- past_key_value = past_key_values[i] +- past_cross_key_value = past_cross_key_values[i] ++ ++ for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel +@@ -1392,7 +1113,6 @@ class T5Stack(T5PreTrainedModel): + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, + past_key_value=past_key_value, +- past_cross_key_value=past_cross_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -1400,22 +1120,19 @@ class T5Stack(T5PreTrainedModel): + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: +- layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] ++ layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] + +- hidden_states, present_key_value_state, present_cross_key_value_state = layer_outputs[:3] ++ hidden_states, present_key_value_state = layer_outputs[:2] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) +- position_bias = layer_outputs[3] ++ position_bias = layer_outputs[2] + if self.is_decoder and encoder_hidden_states is not None: +- encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] ++ encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] + # append next layer key value states + if use_cache: +- present_key_value_states.extend(present_key_value_state) +- # present_cross_key_value_states.extend(present_cross_key_value_state) +- # present_key_value_states = present_key_value_states + (present_key_value_state,) +- # present_cross_key_value_states = present_cross_key_value_states + (present_cross_key_value_state,) ++ present_key_value_states = present_key_value_states + (present_key_value_state,) + + 
if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) +@@ -1429,52 +1146,31 @@ class T5Stack(T5PreTrainedModel): + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) +- hidden_states = self.dropout(hidden_states).half() ++ hidden_states = self.dropout(hidden_states) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) +- # print("return_dict=",return_dict) ++ + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + present_key_value_states, +- # present_cross_key_value_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] + if v is not None + ) +- present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None +- # present_cross_key_value_states = torch.concat(present_cross_key_value_states).reshape(len(self.block), 2, +- # *present_cross_key_value_states[0].shape) if use_cache else None +- # print("dddddddddddd") +- # if use_cache: +- # print("present_key_value_states.shape=",present_key_value_states.shape,present_key_value_states.dtype) +- # return BaseModelOutputWithPastAndCrossAttentions( +- # last_hidden_state=hidden_states, +- # past_key_values=present_key_value_states, +- # past_cross_key_values=present_cross_key_value_states +- # ) +- if not self.is_decoder and self.encodecrosskeyvalue: +- res = self.encodecrosskeyvalue(hidden_states) +- return tuple((hidden_states, res)) +- # return BaseModelOutputWithPastAndCrossAttentions( +- # last_hidden_state=hidden_states, +- # past_key_values=present_key_value_states, +- # # past_cross_key_values=past_cross_key_values, +- # hidden_states=all_hidden_states, +- # attentions=all_attentions, +- # cross_attentions=all_cross_attentions, +- # ) +- if self.is_decoder: +- if self.config.tie_word_embeddings: +- hidden_states_1 = hidden_states * (self.model_dim ** -0.5) +- lm_logits = self.lm_head(hidden_states_1) +- return tuple((lm_logits, present_key_value_states)) ++ return BaseModelOutputWithPastAndCrossAttentions( ++ last_hidden_state=hidden_states, ++ past_key_values=present_key_value_states, ++ hidden_states=all_hidden_states, ++ attentions=all_attentions, ++ cross_attentions=all_cross_attentions, ++ ) + + + T5_START_DOCSTRING = r""" +@@ -1845,28 +1541,6 @@ class T5Model(T5PreTrainedModel): + ) + + +- +-class EncoderToCrossKeyValue(nn.Module): +- def __init__(self, cross_key, cross_value, num_heads, d_kv): +- super().__init__() +- self.cross_key = cross_key +- self.cross_value = cross_value +- self.num_heads = num_heads +- self.d_kv = d_kv +- +- +- def forward(self, hidden_states): +- batch_size = hidden_states.shape[0] +- encoder_hidden_states_kvs = [] +- for i in range(len(self.cross_value)): +- encoder_hidden_states_kvs.append( +- torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), +- self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) +- +- past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0) +- return past_cross_key_values +- +- + @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) + class T5ForConditionalGeneration(T5PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [ +@@ -1874,47 +1548,28 @@ class T5ForConditionalGeneration(T5PreTr + ] + _tied_weights_keys = 
["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + +- def __init__(self, config: T5Config, encoder_path=None, decoder_path=None, device_id=0): ++ def __init__(self, config: T5Config): + super().__init__(config) +- self.encoder_path = encoder_path +- self.decoder_path = decoder_path +- if not self.encoder_path or not self.decoder_path: +- self.model_dim = config.d_model +- +- self.shared = nn.Embedding(config.vocab_size, config.d_model) +- +- decoder_config = copy.deepcopy(config) +- decoder_config.is_decoder = True +- decoder_config.is_encoder_decoder = False +- decoder_config.num_layers = config.num_decoder_layers +- +- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) +- self.decoder = T5Stack(decoder_config, self.shared, self.lm_head) +- +- cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) +- cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) +- encodecrosskeyvalue = EncoderToCrossKeyValue(cross_key, cross_value, config.num_heads, config.d_kv) +- +- encoder_config = copy.deepcopy(config) +- encoder_config.is_decoder = False +- encoder_config.use_cache = False +- encoder_config.is_encoder_decoder = False +- self.encoder = T5Stack(encoder_config, self.shared, encodecrosskeyvalue=encodecrosskeyvalue) +- self.encoder_mindie = None +- self.decoder_mindie = None +- if self.encoder_path: +- self.encoder_mindie = torch.jit.load(self.encoder_path) +- if self.decoder_path: +- self.decoder_mindie = torch.jit.load(self.decoder_path) +- self.stream = torch.npu.Stream(f"npu:{device_id}") +- self.device_id = device_id +- +- +- def get_device(self): +- return f"npu:{self.device_id}" ++ self.model_dim = config.d_model ++ ++ self.shared = nn.Embedding(config.vocab_size, config.d_model) ++ ++ encoder_config = copy.deepcopy(config) ++ encoder_config.is_decoder = False ++ encoder_config.use_cache = False ++ encoder_config.is_encoder_decoder = False ++ self.encoder = T5Stack(encoder_config, self.shared) ++ ++ decoder_config = copy.deepcopy(config) ++ decoder_config.is_decoder = True ++ decoder_config.is_encoder_decoder = False ++ decoder_config.num_layers = config.num_decoder_layers ++ self.decoder = T5Stack(decoder_config, self.shared) ++ ++ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + + # Initialize weights and apply final processing +- # self.post_init() ++ self.post_init() + + # Model parallel + self.model_parallel = False +@@ -1993,7 +1648,6 @@ class T5ForConditionalGeneration(T5PreTr + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- past_cross_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, +@@ -2041,25 +1695,76 @@ class T5ForConditionalGeneration(T5PreTr + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask +- +- hidden_states = encoder_outputs["last_hidden_state"] +- # import pdb +- # pdb.set_trace() + +- # if self.model_parallel: +- # torch.cuda.set_device(self.decoder.first_device) ++ # Encode if needed (training, first prediction pass) ++ if encoder_outputs is None: ++ # Convert encoder 
inputs in embeddings if needed ++ encoder_outputs = self.encoder( ++ input_ids=input_ids, ++ attention_mask=attention_mask, ++ inputs_embeds=inputs_embeds, ++ head_mask=head_mask, ++ output_attentions=output_attentions, ++ output_hidden_states=output_hidden_states, ++ return_dict=return_dict, ++ ) ++ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): ++ encoder_outputs = BaseModelOutput( ++ last_hidden_state=encoder_outputs[0], ++ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, ++ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, ++ ) ++ ++ hidden_states = encoder_outputs[0] ++ ++ if self.model_parallel: ++ torch.cuda.set_device(self.decoder.first_device) + + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + +- import time +- start_time = time.time() +- with torch.npu.stream(self.stream): # set stream +- +- decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) +- self.stream.synchronize() # synchronize +- print("time is", time.time() - start_time) ++ # Set device for model parallelism ++ if self.model_parallel: ++ torch.cuda.set_device(self.decoder.first_device) ++ hidden_states = hidden_states.to(self.decoder.first_device) ++ if decoder_input_ids is not None: ++ decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) ++ if attention_mask is not None: ++ attention_mask = attention_mask.to(self.decoder.first_device) ++ if decoder_attention_mask is not None: ++ decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) ++ ++ # Decode ++ decoder_outputs = self.decoder( ++ input_ids=decoder_input_ids, ++ attention_mask=decoder_attention_mask, ++ inputs_embeds=decoder_inputs_embeds, ++ past_key_values=past_key_values, ++ encoder_hidden_states=hidden_states, ++ encoder_attention_mask=attention_mask, ++ head_mask=decoder_head_mask, ++ cross_attn_head_mask=cross_attn_head_mask, ++ use_cache=use_cache, ++ output_attentions=output_attentions, ++ output_hidden_states=output_hidden_states, ++ return_dict=return_dict, ++ ) ++ ++ sequence_output = decoder_outputs[0] ++ ++ # Set device for model parallelism ++ if self.model_parallel: ++ torch.cuda.set_device(self.encoder.first_device) ++ self.lm_head = self.lm_head.to(self.encoder.first_device) ++ sequence_output = sequence_output.to(self.lm_head.weight.device) ++ ++ if self.config.tie_word_embeddings: ++ # Rescale output before projecting on vocab ++ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 ++ sequence_output = sequence_output * (self.model_dim**-0.5) ++ ++ lm_logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: +@@ -2072,17 +1777,23 @@ class T5ForConditionalGeneration(T5PreTr + if not return_dict: + output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs + return ((loss,) + output) if loss is not None else output ++ + return Seq2SeqLMOutput( + loss=loss, +- logits=decoder_outputs[0], +- past_key_values=decoder_outputs[1] ++ logits=lm_logits, ++ past_key_values=decoder_outputs.past_key_values, ++ decoder_hidden_states=decoder_outputs.hidden_states, ++ decoder_attentions=decoder_outputs.attentions, ++ cross_attentions=decoder_outputs.cross_attentions, ++ 
encoder_last_hidden_state=encoder_outputs.last_hidden_state, ++ encoder_hidden_states=encoder_outputs.hidden_states, ++ encoder_attentions=encoder_outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, +- past_cross_key_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, +@@ -2108,7 +1819,6 @@ class T5ForConditionalGeneration(T5PreTr + return { + "decoder_input_ids": input_ids, + "past_key_values": past_key_values, +- "past_cross_key_values": past_cross_key_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, +@@ -2168,9 +1878,6 @@ class T5EncoderModel(T5PreTrainedModel): + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) +- self.encoder_mindie = torch.jit.load(r"/opt/naie/zhenwenqi/T5/encoder/encoder_compiled.pt") +- # self.decoder_mindie = torch.jit.load(r"/opt/naie/zhenwenqi/T5/decoder/decoder_compiled.pt") +- self.stream = torch.npu.Stream("npu:2") + + # Initialize weights and apply final processing + self.post_init() +@@ -2260,6 +1967,7 @@ class T5EncoderModel(T5PreTrainedModel): + >>> last_hidden_states = outputs.last_hidden_state + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict ++ + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, diff --git a/MindIE/MindIE-Torch/built-in/modeling_utils.patch b/MindIE/MindIE-Torch/built-in/modeling_utils.patch new file mode 100644 index 0000000000..e69de29bb2 diff --git a/MindIE/MindIE-Torch/built-in/utils.patch b/MindIE/MindIE-Torch/built-in/utils.patch new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 575815494b5d89ed682b83353806888aae398671 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 03:28:45 +0000 Subject: [PATCH 004/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/modeling=5Foutputs.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/modeling_outputs.patch | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/modeling_outputs.patch diff --git a/MindIE/MindIE-Torch/built-in/modeling_outputs.patch b/MindIE/MindIE-Torch/built-in/modeling_outputs.patch deleted file mode 100644 index e69de29bb2..0000000000 -- Gitee From a4e490eadbd112d032d6bdac5aa4942a9cf70a24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 03:28:56 +0000 Subject: [PATCH 005/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/modeling=5Ft5.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../MindIE-Torch/built-in/modeling_t5.patch | 819 ------------------ 1 file changed, 819 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/modeling_t5.patch diff --git a/MindIE/MindIE-Torch/built-in/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/modeling_t5.patch deleted file mode 100644 index 4a376cf5eb..0000000000 --- a/MindIE/MindIE-Torch/built-in/modeling_t5.patch +++ /dev/null @@ -1,819 +0,0 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py 2024-08-29 11:11:23.852000000 +0800 -+++ modeling_t5.py 2024-08-29 11:19:34.572000000 +0800 -@@ -23,8 +23,6 @@ from typing 
import List, Optional, Tuple - import torch - from torch import nn - from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss --import torch_npu --import mindietorch - - from ...activations import ACT2FN - from ...modeling_outputs import ( -@@ -246,7 +244,7 @@ class T5LayerNorm(nn.Module): - - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) -- # print("self.weight.dtype=",self.weight.dtype) -+ - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) -@@ -451,7 +449,6 @@ class T5Attention(nn.Module): - key_value_states=None, - position_bias=None, - past_key_value=None, -- past_cross_key_value=None, - layer_head_mask=None, - query_length=None, - use_cache=False, -@@ -468,8 +465,7 @@ class T5Attention(nn.Module): - real_seq_length = seq_length - - if past_key_value is not None: -- if past_key_value.shape[0] != 2: -- # if len(past_key_value) != 2: -+ if len(past_key_value) != 2: - raise ValueError( - f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" - ) -@@ -497,7 +493,6 @@ class T5Attention(nn.Module): - hidden_states = shape(proj_layer(key_value_states)) - - if past_key_value is not None: -- past_key_value = shape(past_key_value) - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) -@@ -571,261 +566,7 @@ class T5Attention(nn.Module): - - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- # print("output_attentions=",output_attentions) -- if output_attentions: -- outputs = outputs + (attn_weights,) -- return outputs -- -- --class T5SelfAttention(T5Attention): -- def __init__(self, config: T5Config, has_relative_attention_bias=False): -- super().__init__(config, has_relative_attention_bias) -- -- def forward( -- self, -- hidden_states, -- mask=None, -- position_bias=None, -- past_key_value=None, -- layer_head_mask=None, -- use_cache=False, -- output_attentions=False, -- ): -- """ -- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). -- """ -- # Input is (batch_size, seq_length, dim) -- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) -- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) -- batch_size, seq_length = hidden_states.shape[:2] -- -- real_seq_length = seq_length -- -- if past_key_value is not None: -- if past_key_value.shape[0] != 2: -- # if len(past_key_value) != 2: -- raise ValueError( -- f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" -- ) -- real_seq_length += past_key_value[0].shape[2] -- # print("key_value_states=",real_seq_length) -- key_length = real_seq_length -- -- def shape(states): -- """projection""" -- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) -- -- def unshape(states): -- """reshape""" -- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) -- -- def project(hidden_states, proj_layer, past_key_value): -- """projects hidden states correctly to key/query states""" -- if past_key_value is None: -- # cross-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(hidden_states)) -- -- if past_key_value is not None: -- hidden_states = shape(proj_layer(hidden_states)) -- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) -- return hidden_states -- -- # get query states -- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) -- -- # get key/value states -- key_states = project( -- hidden_states, self.k, past_key_value[0] if past_key_value is not None else None -- ) -- value_states = project( -- hidden_states, self.v, past_key_value[1] if past_key_value is not None else None -- ) -- # print("key_states=",hidden_states.dtype,key_states.dtype) -- # compute scores -- scores = torch.matmul( -- query_states, key_states.transpose(3, 2) -- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 -- # print("scores=",scores.dtype) -- if position_bias is None: -- if not self.has_relative_attention_bias: -- position_bias = torch.zeros( -- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype -- ) -- if self.gradient_checkpointing and self.training: -- position_bias.requires_grad = True -- else: -- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) -- -- # if key and values are already calculated -- # we want only the last query position bias -- if past_key_value is not None: -- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] -- -- if mask is not None: -- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) -- -- if self.pruned_heads: -- mask = torch.ones(position_bias.shape[1]) -- mask[list(self.pruned_heads)] = 0 -- position_bias_masked = position_bias[:, mask.bool()] -- else: -- position_bias_masked = position_bias -- -- scores += position_bias_masked -- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( -- scores -- ) # (batch_size, n_heads, seq_length, key_length) -- attn_weights = nn.functional.dropout( -- attn_weights, p=self.dropout, training=self.training -- ) # (batch_size, n_heads, seq_length, key_length) -- -- # Mask heads if we want to -- if layer_head_mask is not None: -- attn_weights = attn_weights * layer_head_mask -- -- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) -- attn_output = self.o(attn_output) - -- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None -- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- # print("output_attentions=",output_attentions) -- if output_attentions: -- outputs = outputs + (attn_weights,) -- return outputs -- -- --class T5CrossAttention(T5Attention): -- def __init__(self, config: T5Config, has_relative_attention_bias=False): -- super().__init__(config, 
has_relative_attention_bias) -- -- def forward( -- self, -- hidden_states, -- mask=None, -- key_value_states=None, -- position_bias=None, -- past_cross_key_value=None, -- layer_head_mask=None, -- query_length=None, -- use_cache=False, -- output_attentions=False, -- ): -- """ -- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). -- """ -- # Input is (batch_size, seq_length, dim) -- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) -- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) -- batch_size, seq_length = hidden_states.shape[:2] -- -- real_seq_length = seq_length -- -- if past_key_value is not None: -- if past_key_value.shape[0] != 2: -- # if len(past_key_value) != 2: -- raise ValueError( -- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" -- ) -- real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length -- # print("key_value_states=",key_value_states, real_seq_length) -- key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] -- -- def shape(states): -- """projection""" -- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) -- -- def unshape(states): -- """reshape""" -- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) -- -- def project(hidden_states, proj_layer, key_value_states, past_key_value): -- """projects hidden states correctly to key/query states""" -- if key_value_states is None: -- # self-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(hidden_states)) -- elif past_key_value is None: -- # cross-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(key_value_states)) -- -- if past_key_value is not None: -- if key_value_states is None: -- # self-attn -- # (batch_size, n_heads, key_length, dim_per_head) -- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) -- # print("hidden_states=",hidden_states.shape) -- elif past_key_value.shape[2] != key_value_states.shape[1]: -- # checking that the `sequence_length` of the `past_key_value` is the same as -- # the provided `key_value_states` to support prefix tuning -- # cross-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(key_value_states)) -- else: -- # cross-attn -- hidden_states = past_key_value -- return hidden_states -- -- # get query states -- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) -- -- # get key/value states -- key_states = project( -- hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None -- ) -- value_states = project( -- hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None -- ) -- -- # compute scores -- scores = torch.matmul( -- query_states, key_states.transpose(3, 2) -- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 -- -- if position_bias is None: -- if not self.has_relative_attention_bias: -- position_bias = torch.zeros( -- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype -- ) -- if self.gradient_checkpointing and self.training: -- position_bias.requires_grad = True -- else: -- position_bias = self.compute_bias(real_seq_length, key_length, 
device=scores.device) -- -- # if key and values are already calculated -- # we want only the last query position bias -- if past_key_value is not None: -- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] -- -- if mask is not None: -- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) -- -- if self.pruned_heads: -- mask = torch.ones(position_bias.shape[1]) -- mask[list(self.pruned_heads)] = 0 -- position_bias_masked = position_bias[:, mask.bool()] -- else: -- position_bias_masked = position_bias -- -- scores += position_bias_masked -- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( -- scores -- ) # (batch_size, n_heads, seq_length, key_length) -- attn_weights = nn.functional.dropout( -- attn_weights, p=self.dropout, training=self.training -- ) # (batch_size, n_heads, seq_length, key_length) -- -- # Mask heads if we want to -- if layer_head_mask is not None: -- attn_weights = attn_weights * layer_head_mask -- -- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) -- attn_output = self.o(attn_output) -- -- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None -- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- # print("output_attentions=",output_attentions) - if output_attentions: - outputs = outputs + (attn_weights,) - return outputs -@@ -834,7 +575,7 @@ class T5CrossAttention(T5Attention): - class T5LayerSelfAttention(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() -- self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) -+ self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - -@@ -921,7 +662,6 @@ class T5Block(nn.Module): - layer_head_mask=None, - cross_attn_layer_head_mask=None, - past_key_value=None, -- past_cross_key_value=None, - use_cache=False, - output_attentions=False, - return_dict=True, -@@ -931,17 +671,15 @@ class T5Block(nn.Module): - logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - -- # if len(past_key_value) != expected_num_past_key_values: -- # raise ValueError( -- # f"There should be {expected_num_past_key_values} past states. " -- # f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" -- # f"Got {len(past_key_value)} past key / value states" -- # ) -- -- self_attn_past_key_value = past_key_value -- # print("self_attn_past_key_value=",self_attn_past_key_value.dtype) -- cross_attn_past_key_value = past_cross_key_value -- # cross_attn_past_key_value = past_key_value[2:] -+ if len(past_key_value) != expected_num_past_key_values: -+ raise ValueError( -+ f"There should be {expected_num_past_key_values} past states. " -+ f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" -+ f"Got {len(past_key_value)} past key / value states" -+ ) -+ -+ self_attn_past_key_value = past_key_value[:2] -+ cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - -@@ -955,8 +693,6 @@ class T5Block(nn.Module): - output_attentions=output_attentions, - ) - hidden_states, present_key_value_state = self_attention_outputs[:2] -- # if self.is_decoder: -- # print("present_key_value_state=",present_key_value_state[0].dtype) - attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights - - # clamp inf values to enable fp16 training -@@ -967,7 +703,7 @@ class T5Block(nn.Module): - torch.finfo(hidden_states.dtype).max, - ) - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) -- present_cross_key_value_state = () -+ - do_cross_attention = self.is_decoder and encoder_hidden_states is not None - if do_cross_attention: - # the actual query length is unknown for cross attention -@@ -1000,10 +736,9 @@ class T5Block(nn.Module): - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - # Combine self attn and cross attn key value states -- # if present_key_value_state is not None: -- # present_key_value_state = present_key_value_state + cross_attention_outputs[1] -- cross_attn_past_key_values = cross_attention_outputs[1] -- # print("cross_attn_past_key_values=",cross_attn_past_key_values) -+ if present_key_value_state is not None: -+ present_key_value_state = present_key_value_state + cross_attention_outputs[1] -+ - # Keep cross-attention outputs and relative position weights - attention_outputs = attention_outputs + cross_attention_outputs[2:] - -@@ -1022,7 +757,7 @@ class T5Block(nn.Module): - outputs = (hidden_states,) - - if use_cache: -- outputs = outputs + (present_key_value_state,) +(cross_attn_past_key_values,)+ attention_outputs -+ outputs = outputs + (present_key_value_state,) + attention_outputs - else: - outputs = outputs + attention_outputs - -@@ -1162,14 +897,11 @@ class T5PreTrainedModel(PreTrainedModel) - - - class T5Stack(T5PreTrainedModel): -- def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskeyvalue=None): -+ def __init__(self, config, embed_tokens=None): - super().__init__(config) - - self.embed_tokens = embed_tokens - self.is_decoder = config.is_decoder -- self.lm_head=lm_head -- self.encodecrosskeyvalue = encodecrosskeyvalue -- self.model_dim = config.d_model - - self.block = nn.ModuleList( - [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -1237,21 +969,19 @@ class T5Stack(T5PreTrainedModel): - def forward( - self, - input_ids=None, -+ attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, -- past_key_values=None, -- past_cross_key_values=None, -- attention_mask=None, - inputs_embeds=None, - head_mask=None, - cross_attn_head_mask=None, -+ past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - # Model parallel -- # print("aaaaaaaaaaaaaaaaa") - if self.model_parallel: - torch.cuda.set_device(self.first_device) - self.embed_tokens = self.embed_tokens.to(self.first_device) -@@ -1291,13 +1021,9 @@ class T5Stack(T5PreTrainedModel): - raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") - - # initialize past_key_values with `None` if past does not exist -- #modified -- # if 
past_key_values is None: -- # past_key_values = [None] * len(self.block) -- #added -- if not self.is_decoder: -+ if past_key_values is None: - past_key_values = [None] * len(self.block) -- past_cross_key_values = [None] * len(self.block) -+ - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - -@@ -1328,10 +1054,7 @@ class T5Stack(T5PreTrainedModel): - # Prepare head mask if needed - head_mask = self.get_head_mask(head_mask, self.config.num_layers) - cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) -- # present_key_value_states = () if use_cache else None -- # present_cross_key_value_states = () if use_cache else None -- present_key_value_states = [] if use_cache else None -- # present_cross_key_value_states = [] if use_cache else None -+ present_key_value_states = () if use_cache else None - all_hidden_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1339,10 +1062,8 @@ class T5Stack(T5PreTrainedModel): - encoder_decoder_position_bias = None - - hidden_states = self.dropout(inputs_embeds) -- for i, layer_module in enumerate(self.block): -- # for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): -- past_key_value = past_key_values[i] -- past_cross_key_value = past_cross_key_values[i] -+ -+ for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): - layer_head_mask = head_mask[i] - cross_attn_layer_head_mask = cross_attn_head_mask[i] - # Model parallel -@@ -1392,7 +1113,6 @@ class T5Stack(T5PreTrainedModel): - layer_head_mask=layer_head_mask, - cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key_value=past_key_value, -- past_cross_key_value=past_cross_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) -@@ -1400,22 +1120,19 @@ class T5Stack(T5PreTrainedModel): - # layer_outputs is a tuple with: - # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - if use_cache is False: -- layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] -+ layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] - -- hidden_states, present_key_value_state, present_cross_key_value_state = layer_outputs[:3] -+ hidden_states, present_key_value_state = layer_outputs[:2] - - # We share the position biases between the layers - the first layer store them - # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), - # (cross-attention position bias), (cross-attention weights) -- position_bias = layer_outputs[3] -+ position_bias = layer_outputs[2] - if self.is_decoder and encoder_hidden_states is not None: -- encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] -+ encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] - # append next layer key value states - if use_cache: -- present_key_value_states.extend(present_key_value_state) -- # present_cross_key_value_states.extend(present_cross_key_value_state) -- # present_key_value_states = present_key_value_states + (present_key_value_state,) -- # present_cross_key_value_states = present_cross_key_value_states + (present_cross_key_value_state,) -+ present_key_value_states = present_key_value_states + (present_key_value_state,) - - 
if output_attentions: - all_attentions = all_attentions + (layer_outputs[3],) -@@ -1429,52 +1146,31 @@ class T5Stack(T5PreTrainedModel): - hidden_states = hidden_states.to("cuda:" + str(k + 1)) - - hidden_states = self.final_layer_norm(hidden_states) -- hidden_states = self.dropout(hidden_states).half() -+ hidden_states = self.dropout(hidden_states) - - # Add last layer - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) -- # print("return_dict=",return_dict) -+ - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - present_key_value_states, -- # present_cross_key_value_states, - all_hidden_states, - all_attentions, - all_cross_attentions, - ] - if v is not None - ) -- present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None -- # present_cross_key_value_states = torch.concat(present_cross_key_value_states).reshape(len(self.block), 2, -- # *present_cross_key_value_states[0].shape) if use_cache else None -- # print("dddddddddddd") -- # if use_cache: -- # print("present_key_value_states.shape=",present_key_value_states.shape,present_key_value_states.dtype) -- # return BaseModelOutputWithPastAndCrossAttentions( -- # last_hidden_state=hidden_states, -- # past_key_values=present_key_value_states, -- # past_cross_key_values=present_cross_key_value_states -- # ) -- if not self.is_decoder and self.encodecrosskeyvalue: -- res = self.encodecrosskeyvalue(hidden_states) -- return tuple((hidden_states, res)) -- # return BaseModelOutputWithPastAndCrossAttentions( -- # last_hidden_state=hidden_states, -- # past_key_values=present_key_value_states, -- # # past_cross_key_values=past_cross_key_values, -- # hidden_states=all_hidden_states, -- # attentions=all_attentions, -- # cross_attentions=all_cross_attentions, -- # ) -- if self.is_decoder: -- if self.config.tie_word_embeddings: -- hidden_states_1 = hidden_states * (self.model_dim ** -0.5) -- lm_logits = self.lm_head(hidden_states_1) -- return tuple((lm_logits, present_key_value_states)) -+ return BaseModelOutputWithPastAndCrossAttentions( -+ last_hidden_state=hidden_states, -+ past_key_values=present_key_value_states, -+ hidden_states=all_hidden_states, -+ attentions=all_attentions, -+ cross_attentions=all_cross_attentions, -+ ) - - - T5_START_DOCSTRING = r""" -@@ -1845,28 +1541,6 @@ class T5Model(T5PreTrainedModel): - ) - - -- --class EncoderToCrossKeyValue(nn.Module): -- def __init__(self, cross_key, cross_value, num_heads, d_kv): -- super().__init__() -- self.cross_key = cross_key -- self.cross_value = cross_value -- self.num_heads = num_heads -- self.d_kv = d_kv -- -- -- def forward(self, hidden_states): -- batch_size = hidden_states.shape[0] -- encoder_hidden_states_kvs = [] -- for i in range(len(self.cross_value)): -- encoder_hidden_states_kvs.append( -- torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), -- self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) -- -- past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0) -- return past_cross_key_values -- -- - @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) - class T5ForConditionalGeneration(T5PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [ -@@ -1874,47 +1548,28 @@ class T5ForConditionalGeneration(T5PreTr - ] - _tied_weights_keys = 
["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] - -- def __init__(self, config: T5Config, encoder_path=None, decoder_path=None, device_id=0): -+ def __init__(self, config: T5Config): - super().__init__(config) -- self.encoder_path = encoder_path -- self.decoder_path = decoder_path -- if not self.encoder_path or not self.decoder_path: -- self.model_dim = config.d_model -- -- self.shared = nn.Embedding(config.vocab_size, config.d_model) -- -- decoder_config = copy.deepcopy(config) -- decoder_config.is_decoder = True -- decoder_config.is_encoder_decoder = False -- decoder_config.num_layers = config.num_decoder_layers -- -- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) -- self.decoder = T5Stack(decoder_config, self.shared, self.lm_head) -- -- cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) -- cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) -- encodecrosskeyvalue = EncoderToCrossKeyValue(cross_key, cross_value, config.num_heads, config.d_kv) -- -- encoder_config = copy.deepcopy(config) -- encoder_config.is_decoder = False -- encoder_config.use_cache = False -- encoder_config.is_encoder_decoder = False -- self.encoder = T5Stack(encoder_config, self.shared, encodecrosskeyvalue=encodecrosskeyvalue) -- self.encoder_mindie = None -- self.decoder_mindie = None -- if self.encoder_path: -- self.encoder_mindie = torch.jit.load(self.encoder_path) -- if self.decoder_path: -- self.decoder_mindie = torch.jit.load(self.decoder_path) -- self.stream = torch.npu.Stream(f"npu:{device_id}") -- self.device_id = device_id -- -- -- def get_device(self): -- return f"npu:{self.device_id}" -+ self.model_dim = config.d_model -+ -+ self.shared = nn.Embedding(config.vocab_size, config.d_model) -+ -+ encoder_config = copy.deepcopy(config) -+ encoder_config.is_decoder = False -+ encoder_config.use_cache = False -+ encoder_config.is_encoder_decoder = False -+ self.encoder = T5Stack(encoder_config, self.shared) -+ -+ decoder_config = copy.deepcopy(config) -+ decoder_config.is_decoder = True -+ decoder_config.is_encoder_decoder = False -+ decoder_config.num_layers = config.num_decoder_layers -+ self.decoder = T5Stack(decoder_config, self.shared) -+ -+ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) - - # Initialize weights and apply final processing -- # self.post_init() -+ self.post_init() - - # Model parallel - self.model_parallel = False -@@ -1993,7 +1648,6 @@ class T5ForConditionalGeneration(T5PreTr - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, -- past_cross_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, -@@ -2041,25 +1695,76 @@ class T5ForConditionalGeneration(T5PreTr - if self.config.num_layers == self.config.num_decoder_layers: - warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) - decoder_head_mask = head_mask -- -- hidden_states = encoder_outputs["last_hidden_state"] -- # import pdb -- # pdb.set_trace() - -- # if self.model_parallel: -- # torch.cuda.set_device(self.decoder.first_device) -+ # Encode if needed (training, first prediction pass) -+ if encoder_outputs is None: -+ # Convert encoder 
inputs in embeddings if needed -+ encoder_outputs = self.encoder( -+ input_ids=input_ids, -+ attention_mask=attention_mask, -+ inputs_embeds=inputs_embeds, -+ head_mask=head_mask, -+ output_attentions=output_attentions, -+ output_hidden_states=output_hidden_states, -+ return_dict=return_dict, -+ ) -+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): -+ encoder_outputs = BaseModelOutput( -+ last_hidden_state=encoder_outputs[0], -+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, -+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, -+ ) -+ -+ hidden_states = encoder_outputs[0] -+ -+ if self.model_parallel: -+ torch.cuda.set_device(self.decoder.first_device) - - if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: - # get decoder inputs from shifting lm labels to the right - decoder_input_ids = self._shift_right(labels) - -- import time -- start_time = time.time() -- with torch.npu.stream(self.stream): # set stream -- -- decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) -- self.stream.synchronize() # synchronize -- print("time is", time.time() - start_time) -+ # Set device for model parallelism -+ if self.model_parallel: -+ torch.cuda.set_device(self.decoder.first_device) -+ hidden_states = hidden_states.to(self.decoder.first_device) -+ if decoder_input_ids is not None: -+ decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) -+ if attention_mask is not None: -+ attention_mask = attention_mask.to(self.decoder.first_device) -+ if decoder_attention_mask is not None: -+ decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) -+ -+ # Decode -+ decoder_outputs = self.decoder( -+ input_ids=decoder_input_ids, -+ attention_mask=decoder_attention_mask, -+ inputs_embeds=decoder_inputs_embeds, -+ past_key_values=past_key_values, -+ encoder_hidden_states=hidden_states, -+ encoder_attention_mask=attention_mask, -+ head_mask=decoder_head_mask, -+ cross_attn_head_mask=cross_attn_head_mask, -+ use_cache=use_cache, -+ output_attentions=output_attentions, -+ output_hidden_states=output_hidden_states, -+ return_dict=return_dict, -+ ) -+ -+ sequence_output = decoder_outputs[0] -+ -+ # Set device for model parallelism -+ if self.model_parallel: -+ torch.cuda.set_device(self.encoder.first_device) -+ self.lm_head = self.lm_head.to(self.encoder.first_device) -+ sequence_output = sequence_output.to(self.lm_head.weight.device) -+ -+ if self.config.tie_word_embeddings: -+ # Rescale output before projecting on vocab -+ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 -+ sequence_output = sequence_output * (self.model_dim**-0.5) -+ -+ lm_logits = self.lm_head(sequence_output) - - loss = None - if labels is not None: -@@ -2072,17 +1777,23 @@ class T5ForConditionalGeneration(T5PreTr - if not return_dict: - output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs - return ((loss,) + output) if loss is not None else output -+ - return Seq2SeqLMOutput( - loss=loss, -- logits=decoder_outputs[0], -- past_key_values=decoder_outputs[1] -+ logits=lm_logits, -+ past_key_values=decoder_outputs.past_key_values, -+ decoder_hidden_states=decoder_outputs.hidden_states, -+ decoder_attentions=decoder_outputs.attentions, -+ cross_attentions=decoder_outputs.cross_attentions, -+ 
encoder_last_hidden_state=encoder_outputs.last_hidden_state, -+ encoder_hidden_states=encoder_outputs.hidden_states, -+ encoder_attentions=encoder_outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, -- past_cross_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, -@@ -2108,7 +1819,6 @@ class T5ForConditionalGeneration(T5PreTr - return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, -- "past_cross_key_values": past_cross_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, -@@ -2168,9 +1878,6 @@ class T5EncoderModel(T5PreTrainedModel): - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False - self.encoder = T5Stack(encoder_config, self.shared) -- self.encoder_mindie = torch.jit.load(r"/opt/naie/zhenwenqi/T5/encoder/encoder_compiled.pt") -- # self.decoder_mindie = torch.jit.load(r"/opt/naie/zhenwenqi/T5/decoder/decoder_compiled.pt") -- self.stream = torch.npu.Stream("npu:2") - - # Initialize weights and apply final processing - self.post_init() -@@ -2260,6 +1967,7 @@ class T5EncoderModel(T5PreTrainedModel): - >>> last_hidden_states = outputs.last_hidden_state - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict -+ - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, -- Gitee From ba6268d1d3922bea5f0c0e443aaf6251f442816a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 03:29:04 +0000 Subject: [PATCH 006/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/modeling=5Futils.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/modeling_utils.patch | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/modeling_utils.patch diff --git a/MindIE/MindIE-Torch/built-in/modeling_utils.patch b/MindIE/MindIE-Torch/built-in/modeling_utils.patch deleted file mode 100644 index e69de29bb2..0000000000 -- Gitee From 93ef152147c62f305265ec275f12c358315970be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 03:29:12 +0000 Subject: [PATCH 007/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/utils.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/utils.patch | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/utils.patch diff --git a/MindIE/MindIE-Torch/built-in/utils.patch b/MindIE/MindIE-Torch/built-in/utils.patch deleted file mode 100644 index e69de29bb2..0000000000 -- Gitee From 0c06288203ab056ddcb8442914db41a5cec65d64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 03:29:32 +0000 Subject: [PATCH 008/110] transformers patch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_outputs.patch | 0 .../built-in/T5/modeling_t5.patch | 819 ++++++++++++++++++ .../built-in/T5/modeling_utils.patch | 0 MindIE/MindIE-Torch/built-in/T5/utils.patch | 0 4 files changed, 819 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch create mode 100644 
MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch create mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch create mode 100644 MindIE/MindIE-Torch/built-in/T5/utils.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch new file mode 100644 index 0000000000..e69de29bb2 diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch new file mode 100644 index 0000000000..4a376cf5eb --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch @@ -0,0 +1,819 @@ +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py 2024-08-29 11:11:23.852000000 +0800 ++++ modeling_t5.py 2024-08-29 11:19:34.572000000 +0800 +@@ -23,8 +23,6 @@ from typing import List, Optional, Tuple + import torch + from torch import nn + from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +-import torch_npu +-import mindietorch + + from ...activations import ACT2FN + from ...modeling_outputs import ( +@@ -246,7 +244,7 @@ class T5LayerNorm(nn.Module): + + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) +- # print("self.weight.dtype=",self.weight.dtype) ++ + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) +@@ -451,7 +449,6 @@ class T5Attention(nn.Module): + key_value_states=None, + position_bias=None, + past_key_value=None, +- past_cross_key_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, +@@ -468,8 +465,7 @@ class T5Attention(nn.Module): + real_seq_length = seq_length + + if past_key_value is not None: +- if past_key_value.shape[0] != 2: +- # if len(past_key_value) != 2: ++ if len(past_key_value) != 2: + raise ValueError( + f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" + ) +@@ -497,7 +493,6 @@ class T5Attention(nn.Module): + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: +- past_key_value = shape(past_key_value) + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) +@@ -571,261 +566,7 @@ class T5Attention(nn.Module): + + present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- # print("output_attentions=",output_attentions) +- if output_attentions: +- outputs = outputs + (attn_weights,) +- return outputs +- +- +-class T5SelfAttention(T5Attention): +- def __init__(self, config: T5Config, has_relative_attention_bias=False): +- super().__init__(config, has_relative_attention_bias) +- +- def forward( +- self, +- hidden_states, +- mask=None, +- position_bias=None, +- past_key_value=None, +- layer_head_mask=None, +- use_cache=False, +- output_attentions=False, +- ): +- """ +- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). 
+- """ +- # Input is (batch_size, seq_length, dim) +- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) +- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) +- batch_size, seq_length = hidden_states.shape[:2] +- +- real_seq_length = seq_length +- +- if past_key_value is not None: +- if past_key_value.shape[0] != 2: +- # if len(past_key_value) != 2: +- raise ValueError( +- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" +- ) +- real_seq_length += past_key_value[0].shape[2] +- # print("key_value_states=",real_seq_length) +- key_length = real_seq_length +- +- def shape(states): +- """projection""" +- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) +- +- def unshape(states): +- """reshape""" +- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) +- +- def project(hidden_states, proj_layer, past_key_value): +- """projects hidden states correctly to key/query states""" +- if past_key_value is None: +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(hidden_states)) +- +- if past_key_value is not None: +- hidden_states = shape(proj_layer(hidden_states)) +- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) +- return hidden_states +- +- # get query states +- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) +- +- # get key/value states +- key_states = project( +- hidden_states, self.k, past_key_value[0] if past_key_value is not None else None +- ) +- value_states = project( +- hidden_states, self.v, past_key_value[1] if past_key_value is not None else None +- ) +- # print("key_states=",hidden_states.dtype,key_states.dtype) +- # compute scores +- scores = torch.matmul( +- query_states, key_states.transpose(3, 2) +- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 +- # print("scores=",scores.dtype) +- if position_bias is None: +- if not self.has_relative_attention_bias: +- position_bias = torch.zeros( +- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype +- ) +- if self.gradient_checkpointing and self.training: +- position_bias.requires_grad = True +- else: +- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) +- +- # if key and values are already calculated +- # we want only the last query position bias +- if past_key_value is not None: +- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] +- +- if mask is not None: +- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) +- +- if self.pruned_heads: +- mask = torch.ones(position_bias.shape[1]) +- mask[list(self.pruned_heads)] = 0 +- position_bias_masked = position_bias[:, mask.bool()] +- else: +- position_bias_masked = position_bias +- +- scores += position_bias_masked +- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( +- scores +- ) # (batch_size, n_heads, seq_length, key_length) +- attn_weights = nn.functional.dropout( +- attn_weights, p=self.dropout, training=self.training +- ) # (batch_size, n_heads, seq_length, key_length) +- +- # Mask heads if we want to +- if layer_head_mask is not None: +- attn_weights = attn_weights * layer_head_mask +- +- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) +- attn_output = 
self.o(attn_output) + +- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None +- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- # print("output_attentions=",output_attentions) +- if output_attentions: +- outputs = outputs + (attn_weights,) +- return outputs +- +- +-class T5CrossAttention(T5Attention): +- def __init__(self, config: T5Config, has_relative_attention_bias=False): +- super().__init__(config, has_relative_attention_bias) +- +- def forward( +- self, +- hidden_states, +- mask=None, +- key_value_states=None, +- position_bias=None, +- past_cross_key_value=None, +- layer_head_mask=None, +- query_length=None, +- use_cache=False, +- output_attentions=False, +- ): +- """ +- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). +- """ +- # Input is (batch_size, seq_length, dim) +- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) +- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) +- batch_size, seq_length = hidden_states.shape[:2] +- +- real_seq_length = seq_length +- +- if past_key_value is not None: +- if past_key_value.shape[0] != 2: +- # if len(past_key_value) != 2: +- raise ValueError( +- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" +- ) +- real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length +- # print("key_value_states=",key_value_states, real_seq_length) +- key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] +- +- def shape(states): +- """projection""" +- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) +- +- def unshape(states): +- """reshape""" +- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) +- +- def project(hidden_states, proj_layer, key_value_states, past_key_value): +- """projects hidden states correctly to key/query states""" +- if key_value_states is None: +- # self-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(hidden_states)) +- elif past_key_value is None: +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(key_value_states)) +- +- if past_key_value is not None: +- if key_value_states is None: +- # self-attn +- # (batch_size, n_heads, key_length, dim_per_head) +- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) +- # print("hidden_states=",hidden_states.shape) +- elif past_key_value.shape[2] != key_value_states.shape[1]: +- # checking that the `sequence_length` of the `past_key_value` is the same as +- # the provided `key_value_states` to support prefix tuning +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(key_value_states)) +- else: +- # cross-attn +- hidden_states = past_key_value +- return hidden_states +- +- # get query states +- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) +- +- # get key/value states +- key_states = project( +- hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None +- ) +- value_states = project( +- hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None +- ) +- +- # compute scores +- scores = torch.matmul( +- query_states, key_states.transpose(3, 2) +- ) 
# equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 +- +- if position_bias is None: +- if not self.has_relative_attention_bias: +- position_bias = torch.zeros( +- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype +- ) +- if self.gradient_checkpointing and self.training: +- position_bias.requires_grad = True +- else: +- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) +- +- # if key and values are already calculated +- # we want only the last query position bias +- if past_key_value is not None: +- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] +- +- if mask is not None: +- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) +- +- if self.pruned_heads: +- mask = torch.ones(position_bias.shape[1]) +- mask[list(self.pruned_heads)] = 0 +- position_bias_masked = position_bias[:, mask.bool()] +- else: +- position_bias_masked = position_bias +- +- scores += position_bias_masked +- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( +- scores +- ) # (batch_size, n_heads, seq_length, key_length) +- attn_weights = nn.functional.dropout( +- attn_weights, p=self.dropout, training=self.training +- ) # (batch_size, n_heads, seq_length, key_length) +- +- # Mask heads if we want to +- if layer_head_mask is not None: +- attn_weights = attn_weights * layer_head_mask +- +- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) +- attn_output = self.o(attn_output) +- +- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None +- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- # print("output_attentions=",output_attentions) + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs +@@ -834,7 +575,7 @@ class T5CrossAttention(T5Attention): + class T5LayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() +- self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) ++ self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + +@@ -921,7 +662,6 @@ class T5Block(nn.Module): + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, +- past_cross_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, +@@ -931,17 +671,15 @@ class T5Block(nn.Module): + logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + +- # if len(past_key_value) != expected_num_past_key_values: +- # raise ValueError( +- # f"There should be {expected_num_past_key_values} past states. " +- # f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" +- # f"Got {len(past_key_value)} past key / value states" +- # ) +- +- self_attn_past_key_value = past_key_value +- # print("self_attn_past_key_value=",self_attn_past_key_value.dtype) +- cross_attn_past_key_value = past_cross_key_value +- # cross_attn_past_key_value = past_key_value[2:] ++ if len(past_key_value) != expected_num_past_key_values: ++ raise ValueError( ++ f"There should be {expected_num_past_key_values} past states. " ++ f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" ++ f"Got {len(past_key_value)} past key / value states" ++ ) ++ ++ self_attn_past_key_value = past_key_value[:2] ++ cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + +@@ -955,8 +693,6 @@ class T5Block(nn.Module): + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] +- # if self.is_decoder: +- # print("present_key_value_state=",present_key_value_state[0].dtype) + attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training +@@ -967,7 +703,7 @@ class T5Block(nn.Module): + torch.finfo(hidden_states.dtype).max, + ) + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) +- present_cross_key_value_state = () ++ + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: + # the actual query length is unknown for cross attention +@@ -1000,10 +736,9 @@ class T5Block(nn.Module): + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states +- # if present_key_value_state is not None: +- # present_key_value_state = present_key_value_state + cross_attention_outputs[1] +- cross_attn_past_key_values = cross_attention_outputs[1] +- # print("cross_attn_past_key_values=",cross_attn_past_key_values) ++ if present_key_value_state is not None: ++ present_key_value_state = present_key_value_state + cross_attention_outputs[1] ++ + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + +@@ -1022,7 +757,7 @@ class T5Block(nn.Module): + outputs = (hidden_states,) + + if use_cache: +- outputs = outputs + (present_key_value_state,) +(cross_attn_past_key_values,)+ attention_outputs ++ outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + +@@ -1162,14 +897,11 @@ class T5PreTrainedModel(PreTrainedModel) + + + class T5Stack(T5PreTrainedModel): +- def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskeyvalue=None): ++ def __init__(self, config, embed_tokens=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder +- self.lm_head=lm_head +- self.encodecrosskeyvalue = encodecrosskeyvalue +- self.model_dim = config.d_model + + self.block = nn.ModuleList( + [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] +@@ -1237,21 +969,19 @@ class T5Stack(T5PreTrainedModel): + def forward( + self, + input_ids=None, ++ attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, +- past_key_values=None, +- past_cross_key_values=None, +- attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, ++ 
past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + # Model parallel +- # print("aaaaaaaaaaaaaaaaa") + if self.model_parallel: + torch.cuda.set_device(self.first_device) + self.embed_tokens = self.embed_tokens.to(self.first_device) +@@ -1291,13 +1021,9 @@ class T5Stack(T5PreTrainedModel): + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") + + # initialize past_key_values with `None` if past does not exist +- #modified +- # if past_key_values is None: +- # past_key_values = [None] * len(self.block) +- #added +- if not self.is_decoder: ++ if past_key_values is None: + past_key_values = [None] * len(self.block) +- past_cross_key_values = [None] * len(self.block) ++ + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + +@@ -1328,10 +1054,7 @@ class T5Stack(T5PreTrainedModel): + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) +- # present_key_value_states = () if use_cache else None +- # present_cross_key_value_states = () if use_cache else None +- present_key_value_states = [] if use_cache else None +- # present_cross_key_value_states = [] if use_cache else None ++ present_key_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None +@@ -1339,10 +1062,8 @@ class T5Stack(T5PreTrainedModel): + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) +- for i, layer_module in enumerate(self.block): +- # for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): +- past_key_value = past_key_values[i] +- past_cross_key_value = past_cross_key_values[i] ++ ++ for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel +@@ -1392,7 +1113,6 @@ class T5Stack(T5PreTrainedModel): + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, + past_key_value=past_key_value, +- past_cross_key_value=past_cross_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -1400,22 +1120,19 @@ class T5Stack(T5PreTrainedModel): + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: +- layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] ++ layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] + +- hidden_states, present_key_value_state, present_cross_key_value_state = layer_outputs[:3] ++ hidden_states, present_key_value_state = layer_outputs[:2] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) +- position_bias = layer_outputs[3] ++ position_bias = layer_outputs[2] + if self.is_decoder and encoder_hidden_states is not None: +- encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] ++ 
encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] + # append next layer key value states + if use_cache: +- present_key_value_states.extend(present_key_value_state) +- # present_cross_key_value_states.extend(present_cross_key_value_state) +- # present_key_value_states = present_key_value_states + (present_key_value_state,) +- # present_cross_key_value_states = present_cross_key_value_states + (present_cross_key_value_state,) ++ present_key_value_states = present_key_value_states + (present_key_value_state,) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) +@@ -1429,52 +1146,31 @@ class T5Stack(T5PreTrainedModel): + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) +- hidden_states = self.dropout(hidden_states).half() ++ hidden_states = self.dropout(hidden_states) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) +- # print("return_dict=",return_dict) ++ + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + present_key_value_states, +- # present_cross_key_value_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] + if v is not None + ) +- present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None +- # present_cross_key_value_states = torch.concat(present_cross_key_value_states).reshape(len(self.block), 2, +- # *present_cross_key_value_states[0].shape) if use_cache else None +- # print("dddddddddddd") +- # if use_cache: +- # print("present_key_value_states.shape=",present_key_value_states.shape,present_key_value_states.dtype) +- # return BaseModelOutputWithPastAndCrossAttentions( +- # last_hidden_state=hidden_states, +- # past_key_values=present_key_value_states, +- # past_cross_key_values=present_cross_key_value_states +- # ) +- if not self.is_decoder and self.encodecrosskeyvalue: +- res = self.encodecrosskeyvalue(hidden_states) +- return tuple((hidden_states, res)) +- # return BaseModelOutputWithPastAndCrossAttentions( +- # last_hidden_state=hidden_states, +- # past_key_values=present_key_value_states, +- # # past_cross_key_values=past_cross_key_values, +- # hidden_states=all_hidden_states, +- # attentions=all_attentions, +- # cross_attentions=all_cross_attentions, +- # ) +- if self.is_decoder: +- if self.config.tie_word_embeddings: +- hidden_states_1 = hidden_states * (self.model_dim ** -0.5) +- lm_logits = self.lm_head(hidden_states_1) +- return tuple((lm_logits, present_key_value_states)) ++ return BaseModelOutputWithPastAndCrossAttentions( ++ last_hidden_state=hidden_states, ++ past_key_values=present_key_value_states, ++ hidden_states=all_hidden_states, ++ attentions=all_attentions, ++ cross_attentions=all_cross_attentions, ++ ) + + + T5_START_DOCSTRING = r""" +@@ -1845,28 +1541,6 @@ class T5Model(T5PreTrainedModel): + ) + + +- +-class EncoderToCrossKeyValue(nn.Module): +- def __init__(self, cross_key, cross_value, num_heads, d_kv): +- super().__init__() +- self.cross_key = cross_key +- self.cross_value = cross_value +- self.num_heads = num_heads +- self.d_kv = d_kv +- +- +- def forward(self, hidden_states): +- batch_size = hidden_states.shape[0] +- encoder_hidden_states_kvs = [] +- for i in range(len(self.cross_value)): +- encoder_hidden_states_kvs.append( +- torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), 
+- self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) +- +- past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0) +- return past_cross_key_values +- +- + @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) + class T5ForConditionalGeneration(T5PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [ +@@ -1874,47 +1548,28 @@ class T5ForConditionalGeneration(T5PreTr + ] + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + +- def __init__(self, config: T5Config, encoder_path=None, decoder_path=None, device_id=0): ++ def __init__(self, config: T5Config): + super().__init__(config) +- self.encoder_path = encoder_path +- self.decoder_path = decoder_path +- if not self.encoder_path or not self.decoder_path: +- self.model_dim = config.d_model +- +- self.shared = nn.Embedding(config.vocab_size, config.d_model) +- +- decoder_config = copy.deepcopy(config) +- decoder_config.is_decoder = True +- decoder_config.is_encoder_decoder = False +- decoder_config.num_layers = config.num_decoder_layers +- +- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) +- self.decoder = T5Stack(decoder_config, self.shared, self.lm_head) +- +- cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) +- cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) +- encodecrosskeyvalue = EncoderToCrossKeyValue(cross_key, cross_value, config.num_heads, config.d_kv) +- +- encoder_config = copy.deepcopy(config) +- encoder_config.is_decoder = False +- encoder_config.use_cache = False +- encoder_config.is_encoder_decoder = False +- self.encoder = T5Stack(encoder_config, self.shared, encodecrosskeyvalue=encodecrosskeyvalue) +- self.encoder_mindie = None +- self.decoder_mindie = None +- if self.encoder_path: +- self.encoder_mindie = torch.jit.load(self.encoder_path) +- if self.decoder_path: +- self.decoder_mindie = torch.jit.load(self.decoder_path) +- self.stream = torch.npu.Stream(f"npu:{device_id}") +- self.device_id = device_id +- +- +- def get_device(self): +- return f"npu:{self.device_id}" ++ self.model_dim = config.d_model ++ ++ self.shared = nn.Embedding(config.vocab_size, config.d_model) ++ ++ encoder_config = copy.deepcopy(config) ++ encoder_config.is_decoder = False ++ encoder_config.use_cache = False ++ encoder_config.is_encoder_decoder = False ++ self.encoder = T5Stack(encoder_config, self.shared) ++ ++ decoder_config = copy.deepcopy(config) ++ decoder_config.is_decoder = True ++ decoder_config.is_encoder_decoder = False ++ decoder_config.num_layers = config.num_decoder_layers ++ self.decoder = T5Stack(decoder_config, self.shared) ++ ++ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + + # Initialize weights and apply final processing +- # self.post_init() ++ self.post_init() + + # Model parallel + self.model_parallel = False +@@ -1993,7 +1648,6 @@ class T5ForConditionalGeneration(T5PreTr + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- past_cross_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] 
= None, +@@ -2041,25 +1695,76 @@ class T5ForConditionalGeneration(T5PreTr + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask +- +- hidden_states = encoder_outputs["last_hidden_state"] +- # import pdb +- # pdb.set_trace() + +- # if self.model_parallel: +- # torch.cuda.set_device(self.decoder.first_device) ++ # Encode if needed (training, first prediction pass) ++ if encoder_outputs is None: ++ # Convert encoder inputs in embeddings if needed ++ encoder_outputs = self.encoder( ++ input_ids=input_ids, ++ attention_mask=attention_mask, ++ inputs_embeds=inputs_embeds, ++ head_mask=head_mask, ++ output_attentions=output_attentions, ++ output_hidden_states=output_hidden_states, ++ return_dict=return_dict, ++ ) ++ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): ++ encoder_outputs = BaseModelOutput( ++ last_hidden_state=encoder_outputs[0], ++ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, ++ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, ++ ) ++ ++ hidden_states = encoder_outputs[0] ++ ++ if self.model_parallel: ++ torch.cuda.set_device(self.decoder.first_device) + + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + +- import time +- start_time = time.time() +- with torch.npu.stream(self.stream): # set stream +- +- decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) +- self.stream.synchronize() # synchronize +- print("time is", time.time() - start_time) ++ # Set device for model parallelism ++ if self.model_parallel: ++ torch.cuda.set_device(self.decoder.first_device) ++ hidden_states = hidden_states.to(self.decoder.first_device) ++ if decoder_input_ids is not None: ++ decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) ++ if attention_mask is not None: ++ attention_mask = attention_mask.to(self.decoder.first_device) ++ if decoder_attention_mask is not None: ++ decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) ++ ++ # Decode ++ decoder_outputs = self.decoder( ++ input_ids=decoder_input_ids, ++ attention_mask=decoder_attention_mask, ++ inputs_embeds=decoder_inputs_embeds, ++ past_key_values=past_key_values, ++ encoder_hidden_states=hidden_states, ++ encoder_attention_mask=attention_mask, ++ head_mask=decoder_head_mask, ++ cross_attn_head_mask=cross_attn_head_mask, ++ use_cache=use_cache, ++ output_attentions=output_attentions, ++ output_hidden_states=output_hidden_states, ++ return_dict=return_dict, ++ ) ++ ++ sequence_output = decoder_outputs[0] ++ ++ # Set device for model parallelism ++ if self.model_parallel: ++ torch.cuda.set_device(self.encoder.first_device) ++ self.lm_head = self.lm_head.to(self.encoder.first_device) ++ sequence_output = sequence_output.to(self.lm_head.weight.device) ++ ++ if self.config.tie_word_embeddings: ++ # Rescale output before projecting on vocab ++ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 ++ sequence_output = sequence_output * (self.model_dim**-0.5) ++ ++ lm_logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: +@@ -2072,17 +1777,23 @@ class T5ForConditionalGeneration(T5PreTr + if not return_dict: + output = 
(lm_logits,) + decoder_outputs[1:] + encoder_outputs + return ((loss,) + output) if loss is not None else output ++ + return Seq2SeqLMOutput( + loss=loss, +- logits=decoder_outputs[0], +- past_key_values=decoder_outputs[1] ++ logits=lm_logits, ++ past_key_values=decoder_outputs.past_key_values, ++ decoder_hidden_states=decoder_outputs.hidden_states, ++ decoder_attentions=decoder_outputs.attentions, ++ cross_attentions=decoder_outputs.cross_attentions, ++ encoder_last_hidden_state=encoder_outputs.last_hidden_state, ++ encoder_hidden_states=encoder_outputs.hidden_states, ++ encoder_attentions=encoder_outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, +- past_cross_key_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, +@@ -2108,7 +1819,6 @@ class T5ForConditionalGeneration(T5PreTr + return { + "decoder_input_ids": input_ids, + "past_key_values": past_key_values, +- "past_cross_key_values": past_cross_key_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, +@@ -2168,9 +1878,6 @@ class T5EncoderModel(T5PreTrainedModel): + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) +- self.encoder_mindie = torch.jit.load(r"/opt/naie/zhenwenqi/T5/encoder/encoder_compiled.pt") +- # self.decoder_mindie = torch.jit.load(r"/opt/naie/zhenwenqi/T5/decoder/decoder_compiled.pt") +- self.stream = torch.npu.Stream("npu:2") + + # Initialize weights and apply final processing + self.post_init() +@@ -2260,6 +1967,7 @@ class T5EncoderModel(T5PreTrainedModel): + >>> last_hidden_states = outputs.last_hidden_state + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict ++ + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch new file mode 100644 index 0000000000..e69de29bb2 diff --git a/MindIE/MindIE-Torch/built-in/T5/utils.patch b/MindIE/MindIE-Torch/built-in/T5/utils.patch new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 59bbe4d5c5e66a61b9bd3065e86c01aa03e6f20f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 05:59:44 +0000 Subject: [PATCH 009/110] add MindIE/MindIE-Torch/built-in/T5. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/T5/readme.md diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From d15ef4b1938a88d76022499f95a03b9dbeec2256 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 06:02:43 +0000 Subject: [PATCH 010/110] update MindIE/MindIE-Torch/built-in/T5/export_t5.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/export_t5.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5.py b/MindIE/MindIE-Torch/built-in/T5/export_t5.py index 2b421aff68..cdb7631c82 100644 --- a/MindIE/MindIE-Torch/built-in/T5/export_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/export_t5.py @@ -17,8 +17,8 @@ def parse_arguments(): parser.add_argument( "--model_path", type=str, - default="./DeepFloyd--t5-v1_1-xxl", - help="encoder model path" + default="./T5-Small", + help="T5 model path" ) parser.add_argument( "--max_batchsize", -- Gitee From 11a6d322502e2b580ae035a5eb7dd1a31d626d2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 06:54:23 +0000 Subject: [PATCH 011/110] add MindIE/MindIE-Torch/built-in/T5. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py diff --git a/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py b/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From fd06db61299ee5ae48455a317c17b4a80962b4f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 06:54:38 +0000 Subject: [PATCH 012/110] update MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../MindIE-Torch/built-in/T5/perf_test_aie.py | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py b/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py index e69de29bb2..97c02916fe 100644 --- a/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py +++ b/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py @@ -0,0 +1,115 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
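+
+# Benchmarks the MindIE-compiled T5 encoder/decoder TorchScript modules:
+# each test loads the compiled .pt module, builds dummy NPU inputs, runs a
+# short warmup, then reports average latency and throughput over 100 timed
+# runs on a dedicated NPU stream.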
+
+
+import time
+import argparse
+import json
+
+import numpy as np
+import torch
+import torch_npu
+import mindietorch
+from tqdm import tqdm
+
+def test_encoder(aie_path, args, device_id = 0):
+    batch_size = args.batch_size
+    device_id = args.device_id
+    seq_len = args.seq_len
+    device = f'npu:{device_id}'
+    stream = torch.npu.Stream(f"npu:{device_id}")
+    print("Start loading ts module...")
+    ts = torch.jit.load(aie_path)
+    print("Ts module loaded.")
+    ts.eval()
+    dummy_input = (torch.ones([batch_size, seq_len], dtype=torch.int64).npu())
+    print("Start inferring...")
+    # warmup
+    for _ in range(10):
+        with torch.npu.stream(stream):
+            ts(dummy_input)
+        stream.synchronize()
+
+    # performance test
+    num_infer = 100
+
+    start = time.time()
+    for _ in tqdm(range(num_infer)):
+        with torch.npu.stream(stream):
+            ts(dummy_input)
+        stream.synchronize()
+    end = time.time()
+    print(f"Encoder latency: {(end - start) / num_infer * 1000:.2f} ms")
+    print(f"Encoder throughput: {num_infer * batch_size / (end - start):.2f} fps")
+
+
+def test_decoder(aie_path, args):
+    batch_size = args.batch_size
+    device_id = args.device_id
+    seq_len = args.seq_len
+    device = f'npu:{device_id}'
+    stream = torch.npu.Stream(f"npu:{device_id}")
+    print("Start loading ts module...")
+    ts = torch.jit.load(aie_path)
+    print("Ts module loaded.")
+    ts.eval()
+    dummy_input = (
+        torch.ones([batch_size, 1], dtype=torch.int64).npu(),
+        torch.randn(batch_size,seq_len,512).to(torch.float16).npu(),
+        torch.ones(batch_size,seq_len, dtype=torch.int64).npu(),
+        torch.randn(6,2,batch_size,8,1,64).to(torch.float16).npu(),
+        torch.randn(6,2,batch_size,8,24,64).to(torch.float16).npu()
+    )
+
+    # warmup
+    for _ in range(10):
+        with torch.npu.stream(stream):
+            ts.forward(dummy_input[0],dummy_input[1],dummy_input[2],dummy_input[3],dummy_input[4])
+        stream.synchronize()
+
+    # performance test
+    num_infer = 100
+    start = time.time()
+    for _ in tqdm(range(num_infer)):
+        with torch.npu.stream(stream):
+            ts.forward(dummy_input[0],dummy_input[1],dummy_input[2],dummy_input[3],dummy_input[4])
+        stream.synchronize()
+    end = time.time()
+
+    print(f"Decoder latency: {(end - start) / num_infer * 1000:.2f} ms")
+    print(f"Decoder throughput: {num_infer * batch_size / (end - start):.2f} fps")
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--encoder_aie_path", type=str, required=True)
+    parser.add_argument("--decoder_aie_path", type=str, required=True)
+    parser.add_argument("--batch_size", type=int, help="batch size", default=1)
+    parser.add_argument("--seq_len", type=int, help="max input sequence length", default=128)
+
+    parser.add_argument("--device_id", type=int, help="NPU device id", default=0)
+
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    torch.npu.set_device(args.device_id)
+    test_encoder(args.encoder_aie_path, args)
+    test_decoder(args.decoder_aie_path, args)
+
+
+if __name__ == "__main__":
+    main()
--
Gitee

From f974576d601c3bdc1820aef67834fd274e3ce2ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Thu, 29 Aug 2024 07:04:47 +0000
Subject: [PATCH 013/110] add MindIE/MindIE-Torch/built-in/T5/main.py.
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/main.py | 43 +++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/main.py diff --git a/MindIE/MindIE-Torch/built-in/T5/main.py b/MindIE/MindIE-Torch/built-in/T5/main.py new file mode 100644 index 0000000000..e1ec51d66a --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/main.py @@ -0,0 +1,43 @@ +import torch +import time +import argparse +import torch_npu +from transformers import T5ForConditionalGeneration, AutoTokenizer, T5Config + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--hf_model_path", type=str, required=True) + + parser.add_argument("--encoder_aie_path", type=str, required=True) + parser.add_argument("--decoder_aie_path", type=str, required=True) + + parser.add_argument("--device_id", type=int, help="NPU device id", default=0) + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + torch.npu.set_device(args.device_id) + tokenizer = AutoTokenizer.from_pretrained(args.hf_model_path) + text = [ + "translate English to German: The house is wonderful.", + "summarize: I am a high-performance inference optimizer and runtime.", + "During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world", + ] + t5_config = T5Config.from_pretrained(args.hf_model_path) + model = T5ForConditionalGeneration(config=t5_config, + encoder_path=args.encoder_aie_path, + decoder_path=args.decoder_aie_path, + device_id=args.device_id).half().npu() + input_ids = tokenizer(text, return_tensors = "pt", padding=True).input_ids + outputs = model.generate(input_ids.npu(),max_new_tokens=24) + start_time = time.time() + outputs = model.generate(input_ids.npu(),max_new_tokens=24) + print("time_cost=", time.time()-start_time) + print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) + + +if __name__ == "__main__": + main() + -- Gitee From 135e51f350773ea57bdce3c5f22b5211589f64de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 07:40:39 +0000 Subject: [PATCH 014/110] add MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/T5_modeling_t5_patch.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py new file mode 100644 index 0000000000..3922ae56a8 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -0,0 +1,28 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import transformers + + +def main(): + transformers_path = transformers.__path__ + transformers_version = transformers.__version__ + + assert transformers_version is not '4.42.0', "expectation transformers==4.42.0" + os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') + + +if __name__ == '__main__': + main() -- Gitee From d691a8c3f840018a71f0fdca845efb1dda030a6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 07:42:31 +0000 Subject: [PATCH 015/110] add MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/T5_modeling_outputs_patch.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py new file mode 100644 index 0000000000..7569722529 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py @@ -0,0 +1,28 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import transformers + + +def main(): + transformers_path = transformers.__path__ + transformers_version = transformers.__version__ + + assert transformers_version is not '4.42.0', "expectation transformers==4.42.0" + os.system(f'patch -p0 {transformers_path[0]}/modeling_outputs.py modeling_outputs.patch') + + +if __name__ == '__main__': + main() -- Gitee From abb3fe7763bd8b36523281114778321dab5ab2a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 07:43:40 +0000 Subject: [PATCH 016/110] add MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/T5_modeling_utils_patch.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py new file mode 100644 index 0000000000..743c7a1f00 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py @@ -0,0 +1,28 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import transformers + + +def main(): + transformers_path = transformers.__path__ + transformers_version = transformers.__version__ + + assert transformers_version is not '4.42.0', "expectation transformers==4.42.0" + os.system(f'patch -p0 {transformers_path[0]}/modeling_utils.py modeling_utils.patch') + + +if __name__ == '__main__': + main() -- Gitee From 43c14e121e44859d51093a03d5340f01ef4c00bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 07:44:32 +0000 Subject: [PATCH 017/110] add MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/T5_utils_patch.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py new file mode 100644 index 0000000000..993a4b6789 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py @@ -0,0 +1,28 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import transformers + + +def main(): + transformers_path = transformers.__path__ + transformers_version = transformers.__version__ + + assert transformers_version is not '4.42.0', "expectation transformers==4.42.0" + os.system(f'patch -p0 {transformers_path[0]}/generation/utils.py utils.patch') + + +if __name__ == '__main__': + main() -- Gitee From 1b7477a08815d7aa77447efc44e2f41b77ba379f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:06:55 +0000 Subject: [PATCH 018/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. 
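One note on the version guard shared by the helper scripts above (T5_modeling_t5_patch.py, T5_modeling_outputs_patch.py, T5_modeling_utils_patch.py, T5_utils_patch.py): `assert transformers_version is not '4.42.0'` tests object identity rather than string content, so it passes for essentially any installed version (recent CPython even emits a SyntaxWarning for `is` against a literal), and it does not check what its message claims. The follow-up commits below first swap the operator and then settle on the equality check the message describes. A minimal illustration of the difference, assuming nothing beyond standard CPython behaviour:

```python
# Identity (`is`) versus equality (`==`) for strings: why the original assert is unreliable.
required = "4.42.0"
installed = "".join(["4.42", ".0"])   # same text, but built at runtime as a distinct object

print(installed == required)          # True: compares the characters
print(installed is required)          # False here: compares object identity, not content

# The check the patch helpers actually intend (matching the later commits):
assert installed == required, "expectation transformers==4.42.0"
```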
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py index 3922ae56a8..6a64343800 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -20,7 +20,7 @@ def main(): transformers_path = transformers.__path__ transformers_version = transformers.__version__ - assert transformers_version is not '4.42.0', "expectation transformers==4.42.0" + assert transformers_version !='4.42.0', "expectation transformers==4.42.0" os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') -- Gitee From b214ebc3fccf5fa553c3b5b81eada31058aadc6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:07:15 +0000 Subject: [PATCH 019/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py index 7569722529..1cbad93665 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py @@ -20,7 +20,7 @@ def main(): transformers_path = transformers.__path__ transformers_version = transformers.__version__ - assert transformers_version is not '4.42.0', "expectation transformers==4.42.0" + assert transformers_version != '4.42.0', "expectation transformers==4.42.0" os.system(f'patch -p0 {transformers_path[0]}/modeling_outputs.py modeling_outputs.patch') -- Gitee From 0b60438572729ee8ed45a9d881ba379534ecaedc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:07:39 +0000 Subject: [PATCH 020/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py index 743c7a1f00..80cba46f9c 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py @@ -20,7 +20,7 @@ def main(): transformers_path = transformers.__path__ transformers_version = transformers.__version__ - assert transformers_version is not '4.42.0', "expectation transformers==4.42.0" + assert transformers_version != '4.42.0', "expectation transformers==4.42.0" os.system(f'patch -p0 {transformers_path[0]}/modeling_utils.py modeling_utils.patch') -- Gitee From be1acb7d45cbbf68bc26f724f6de6a50a967d6ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:08:01 +0000 Subject: [PATCH 021/110] update MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py index 993a4b6789..18b0475a14 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py @@ -20,7 +20,7 @@ def main(): transformers_path = transformers.__path__ transformers_version = transformers.__version__ - assert transformers_version is not '4.42.0', "expectation transformers==4.42.0" + assert transformers_version != '4.42.0', "expectation transformers==4.42.0" os.system(f'patch -p0 {transformers_path[0]}/generation/utils.py utils.patch') -- Gitee From 249cb2430953c3284fc718ef5c8584c87850f307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:12:00 +0000 Subject: [PATCH 022/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py index 1cbad93665..21cd251b95 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py @@ -20,7 +20,7 @@ def main(): transformers_path = transformers.__path__ transformers_version = transformers.__version__ - assert transformers_version != '4.42.0', "expectation transformers==4.42.0" + assert transformers_version == '4.42.0', "expectation transformers==4.42.0" os.system(f'patch -p0 {transformers_path[0]}/modeling_outputs.py modeling_outputs.patch') -- Gitee From 67ba1eac70c2ac44bf8bf4626c78857922a14dfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:12:18 +0000 Subject: [PATCH 023/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py index 6a64343800..e304f4f9f2 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -20,7 +20,7 @@ def main(): transformers_path = transformers.__path__ transformers_version = transformers.__version__ - assert transformers_version !='4.42.0', "expectation transformers==4.42.0" + assert transformers_version =='4.42.0', "expectation transformers==4.42.0" os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') -- Gitee From e6e0fd40a9de20e813cd3b7728b4664436f374b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:12:33 +0000 Subject: [PATCH 024/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py index 80cba46f9c..b3ad7bc20b 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py @@ -20,7 +20,7 @@ def main(): transformers_path = transformers.__path__ transformers_version = transformers.__version__ - assert transformers_version != '4.42.0', "expectation transformers==4.42.0" + assert transformers_version == '4.42.0', "expectation transformers==4.42.0" os.system(f'patch -p0 {transformers_path[0]}/modeling_utils.py modeling_utils.patch') -- Gitee From 13ead6a9d36c959d87c8ce113c29439685dafea6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:12:56 +0000 Subject: [PATCH 025/110] update MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py index 18b0475a14..046b6e6b85 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py @@ -20,7 +20,7 @@ def main(): transformers_path = transformers.__path__ transformers_version = transformers.__version__ - assert transformers_version != '4.42.0', "expectation transformers==4.42.0" + assert transformers_version == '4.42.0', "expectation transformers==4.42.0" os.system(f'patch -p0 {transformers_path[0]}/generation/utils.py utils.patch') -- Gitee From 37a9a62070eb6df465715e4bf19c726f60b922b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:31:08 +0000 Subject: [PATCH 026/110] update MindIE/MindIE-Torch/built-in/T5/utils.patch. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/utils.patch | 103 ++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/utils.patch b/MindIE/MindIE-Torch/built-in/T5/utils.patch index e69de29bb2..811327bbc6 100644 --- a/MindIE/MindIE-Torch/built-in/T5/utils.patch +++ b/MindIE/MindIE-Torch/built-in/T5/utils.patch @@ -0,0 +1,103 @@ +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/generation/utils.py 2024-08-29 11:22:09.280000000 +0800 ++++ utils.py 2024-08-29 16:28:18.360000000 +0800 +@@ -507,7 +507,7 @@ class GenerationMixin: + generation_config: GenerationConfig, + ) -> Dict[str, Any]: + # 1. get encoder +- encoder = self.encoder_mindie ++ encoder = self.get_encoder() + # Compatibility with Accelerate big model inference: we need the encoder to outputs stuff on the same device + # as the inputs. 
+ if hasattr(self, "hf_device_map"): +@@ -523,12 +523,12 @@ class GenerationMixin: + for argument, value in model_kwargs.items() + if not any(argument.startswith(p) for p in irrelevant_prefix) + } +- # encoder_signature = set(inspect.signature(encoder.forward).parameters) +- # encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature +- # if not encoder_accepts_wildcard: +- # encoder_kwargs = { +- # argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature +- # } ++ encoder_signature = set(inspect.signature(encoder.forward).parameters) ++ encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature ++ if not encoder_accepts_wildcard: ++ encoder_kwargs = { ++ argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature ++ } + encoder_kwargs["output_attentions"] = generation_config.output_attentions + encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states + +@@ -536,13 +536,8 @@ class GenerationMixin: + model_input_name = model_input_name if model_input_name is not None else self.main_input_name + encoder_kwargs["return_dict"] = True + encoder_kwargs[model_input_name] = inputs_tensor +- with torch.npu.stream(self.stream): # set stream +- encoder_outputs=encoder.forward(encoder_kwargs["input_ids"]) +- self.stream.synchronize() # synchronize +- model_kwargs["encoder_outputs"]: ModelOutput = {"last_hidden_state":encoder_outputs[0], "past_cross_key_values":encoder_outputs[1]} +- # import pdb +- # pdb.set_trace() +- # print("encoder_finished") ++ model_kwargs["encoder_outputs"]: ModelOutput = encoder(**encoder_kwargs) ++ + return model_kwargs + + def _prepare_decoder_input_ids_for_generation( +@@ -667,9 +662,6 @@ class GenerationMixin: + outputs, standardize_cache_format=standardize_cache_format + ) + model_kwargs[cache_name] = cache +- if "past_cross_key_values" in outputs: +- past_cross_key_values = outputs.past_cross_key_values +- model_kwargs["past_cross_key_values"] = past_cross_key_values + if getattr(outputs, "state", None) is not None: + model_kwargs["state"] = outputs.state + +@@ -1801,16 +1793,16 @@ class GenerationMixin: + "`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1." + ) + +- # if self.device.type != input_ids.device.type: +- # warnings.warn( +- # "You are calling .generate() with the `input_ids` being on a device type different" +- # f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model" +- # f" is on {self.device.type}. You may experience unexpected behaviors or slower generation." +- # " Please make sure that you have put `input_ids` to the" +- # f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before" +- # " running `.generate()`.", +- # UserWarning, +- # ) ++ if self.device.type != input_ids.device.type: ++ warnings.warn( ++ "You are calling .generate() with the `input_ids` being on a device type different" ++ f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model" ++ f" is on {self.device.type}. You may experience unexpected behaviors or slower generation." ++ " Please make sure that you have put `input_ids` to the" ++ f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before" ++ " running `.generate()`.", ++ UserWarning, ++ ) + + # 8. 
prepare distribution pre_processing samplers + prepared_logits_processor = self._get_logits_processor( +@@ -2650,10 +2642,7 @@ class GenerationMixin: + this_peer_finished = False + unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) + model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) +- num_layers = self.config.num_layers +- num_heads = self.config.num_heads +- d_kv = self.config.d_kv +- model_kwargs["past_key_values"] = torch.randn(num_layers, 2, batch_size, num_heads, 0, d_kv).half().npu() ++ + while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) +@@ -2711,7 +2700,6 @@ class GenerationMixin: + + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) +- # print("aaaa",input_ids) + if streamer is not None: + streamer.put(next_tokens.cpu()) + model_kwargs = self._update_model_kwargs_for_generation( -- Gitee From 91efc78b45a04cd938bef53754b96c96eec5fa50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 08:56:07 +0000 Subject: [PATCH 027/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/.keep?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/T5/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/.keep diff --git a/MindIE/MindIE-Torch/built-in/T5/.keep b/MindIE/MindIE-Torch/built-in/T5/.keep deleted file mode 100644 index e69de29bb2..0000000000 -- Gitee From 0442dd26898430ca897579a363ccacef0220313a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 11:38:36 +0000 Subject: [PATCH 028/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. 
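For reference, the *.patch files in this directory are applied by the helper scripts shown earlier, which shell out to patch(1) against the installed transformers package; T5_utils_patch.py, for example, patches transformers/generation/utils.py with the utils.patch above. Done by hand, the equivalent command would be roughly the following, with the site-packages location resolved at run time rather than hard-coded:

```bash
# Rough manual equivalent of T5_utils_patch.py; run from the T5 directory that holds utils.patch.
TRANSFORMERS_DIR="$(python3 -c 'import os, transformers; print(os.path.dirname(transformers.__file__))')"
patch -p0 "${TRANSFORMERS_DIR}/generation/utils.py" utils.patch
```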
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 111 ++++++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index e69de29bb2..e9bf20bf51 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -0,0 +1,111 @@ +# T5模型-推理指导 + + +- [概述](#ZH-CN_TOPIC_0000001172161501) + + - [输入输出数据](#section540883920406) + +- [推理环境准备](#ZH-CN_TOPIC_0000001126281702) + +- [快速上手](#ZH-CN_TOPIC_0000001126281700) + + - [获取源码](#section4622531142816) + - [准备数据集](#section183221994411) + - [模型推理](#section741711594517) + +- [模型推理性能](#ZH-CN_TOPIC_0000001172201573) + + +# 概述 + + T5的全称为Text to Text Transfer Transformer,是谷歌提出的预训练语言模型领域的通用模型,该模型将所有自然语言问题都转化成文本到文本的形式,并用一个统一的模型解决.T5最核心的理念是:使用前缀任务声明及文本答案生成,统一所有自然语言处理任务的输入和输出。在此之前的几乎所有预训练语言模型,在下游任务微调过程中都需要添加非线性层,将模型的输出转化为任务指定的输出格式。T5不需要对模型做任何改动,只需要提供下游任务的微调数据;不需要添加任何非线性层,唯一需要做的就是在输入数据前加上任务声明前缀.T5将自然语言处理任务都转化成几乎一致的格式,即输入是带有任务前缀声明的文本序列,输出的文本序列是相应任务的结果 + + +## 输入输出数据 + +- 输入数据 + + | 输入数据 | 大小 | 数据类型 | 数据排布格式 | + | -------- | -------- | -------- | ------------ | + | input | batchsize x input_seq_len | FLOAT16 | NHWC | + + +- 输出数据 + + | 输出数据 | 大小 | 数据类型 | 数据排布格式 | + | -------- | -------- | -------- | ------------ | + | output | batchsize x input_seq_len | INT32 | NTHWC | + + +# 推理环境准备 + +- 该模型需要以下插件与驱动 + + **表 1** 版本配套表 +- + | 配套 | 版本 | 备注 | + | ------------------------------------------------------------ |--------| ------------------------------------------------------------ | + | Python | 3.10.2 | - | + | torch | 2.1.0 | 导出pt模型所需版本 | + | torch_npu | 2.1.0 | 模型编译和推理所需版本 | + + +# 快速上手 + +## 获取源码 + +1. 安装transformers4.42.0版本。 + ```bash + pip3 install transformers==4.42.0 + ``` + +2. 安装mindie包,需要与torch_npu配合使用,请参考mindietorch配套torch_npu配置环境 + + ```bash + # 安装mindie + chmod +x ./Ascend-mindie_xxx.run + ./Ascend-mindie_xxx.run --install + source /usr/local/Ascend/mindie/set_env.sh + ``` + +3. 代码修改,在T5目录下 + + 执行命令: + + ```bash + python T5_modeling_outputs_patch.py + ``` + + ```bash + python T5_modeling_t5_patch.py + ``` + + ```bash + python T5_modeling_utils_patch.py + ``` + ```bash + python T5_utils_patch.py + ``` +4.导出mindietorch模型 + ```bash + python export_t5.py --output_dir {output_path} --model_path {model_path} --max_batchsize {max_batchsize} --max_input_seq_len {max_input_seq_len} --device_id {device_id} + ``` +参数说明: +{output_path}是输出的目录 +{model_path}模型所在目录 +{max_batchsize}推理过程中最大的batchsize +{max_input_seq_len}推理过程中最大输入长度 +{device_id} 用哪个npu device + +运行该命令后会自动生成encoder和decoder优化后的模型 + +5.运行 + ```bash +python main.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path} --decoder_aie_path {decoder_aie_path} --device_id 2 +``` + +参数说明: +{model_path}模型所在目录 +{encoder_aie_path}优化后的encoder的模型路径,要具体到.pt文件 +{decoder_aie_path}优化后的decoder的模型路径,要具体到.pt文件 +{device_id} 用哪个npu device \ No newline at end of file -- Gitee From dc2a507c211359dabdff9b8b91c91881caf236fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 29 Aug 2024 11:45:23 +0000 Subject: [PATCH 029/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. 
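Condensed from the readme above, an end-to-end run comes down to the following sequence; the model directory, output directory, batch size and device id are illustrative placeholders rather than values fixed by this patch set:

```bash
# Illustrative values only; substitute your own model path, output dir and NPU device id.
pip3 install transformers==4.42.0

# Patch the installed transformers sources (run from the T5 directory).
python T5_modeling_outputs_patch.py
python T5_modeling_t5_patch.py
python T5_modeling_utils_patch.py
python T5_utils_patch.py

# Trace and compile the encoder/decoder with MindIE-Torch.
python export_t5.py --output_dir ./models --model_path ./T5-Small \
    --max_batchsize 8 --max_input_seq_len 256 --device_id 0

# Run generation through the compiled modules.
python main.py --hf_model_path ./T5-Small \
    --encoder_aie_path ./models/encoder/encoder_compiled.pt \
    --decoder_aie_path ./models/decoder/decoder_compiled.pt \
    --device_id 0
```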
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index e9bf20bf51..f518880708 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -9,11 +9,8 @@ - [快速上手](#ZH-CN_TOPIC_0000001126281700) - - [获取源码](#section4622531142816) - - [准备数据集](#section183221994411) - [模型推理](#section741711594517) -- [模型推理性能](#ZH-CN_TOPIC_0000001172201573) # 概述 @@ -52,7 +49,6 @@ # 快速上手 -## 获取源码 1. 安装transformers4.42.0版本。 ```bash -- Gitee From 47fcbb80564196028f32657e053d2974077aae03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 2 Sep 2024 10:33:32 +0000 Subject: [PATCH 030/110] update MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_utils.patch | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch index e69de29bb2..1b9fef8cd2 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch @@ -0,0 +1,41 @@ +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/modeling_utils.py 2024-08-28 20:15:38.524000000 +0800 ++++ modeling_utils.py 2024-09-02 17:29:43.700000000 +0800 +@@ -975,7 +975,7 @@ class ModuleUtilsMixin: + `torch.device`: The device on which the module is (assuming that all the module parameters are on the same + device). + """ +- return self.get_device() ++ return get_parameter_device(self) + + @property + def dtype(self) -> torch.dtype: +@@ -1004,8 +1004,7 @@ class ModuleUtilsMixin: + # encoder_extended_attention_mask = (encoder_extended_attention_mask == + # encoder_extended_attention_mask.transpose(-1, -2)) + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility +- #encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min +- encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 ++ encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min + + return encoder_extended_attention_mask + +@@ -1019,9 +1018,7 @@ class ModuleUtilsMixin: + device = attention_mask.device + batch_size, seq_length = input_shape + seq_ids = torch.arange(seq_length, device=device) +- # print("seq_ids=",seq_ids) + causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] +- # print("causal_mask=",causal_mask) + # in case past_key_values are used we need to add a prefix ones mask to the causal mask + # causal and attention masks must have same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) +@@ -1088,8 +1085,7 @@ class ModuleUtilsMixin: + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
+ extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility +- #extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min +- extended_attention_mask = (1.0 - extended_attention_mask) * -1000 ++ extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min + return extended_attention_mask + + def get_head_mask( -- Gitee From a4e9fb7f5a7ac2dfc51d8ff0a5fdb0b7defc5601 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 2 Sep 2024 10:35:18 +0000 Subject: [PATCH 031/110] update MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch index e69de29bb2..6c99414a69 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch @@ -0,0 +1,10 @@ +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/modeling_outputs.py 2024-08-28 19:20:22.112000000 +0800 ++++ modeling_outputs.py 2024-09-02 18:32:37.720000000 +0800 +@@ -282,7 +282,6 @@ class BaseModelOutputWithPastAndCrossAtt + + last_hidden_state: torch.FloatTensor = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None +- past_cross_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None -- Gitee From 139447e492458113aad3e3d96bd415af96a31c55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 4 Sep 2024 11:14:12 +0000 Subject: [PATCH 032/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch. 
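For context on the modeling_utils.patch hunks above: ModuleUtilsMixin turns the 0/1 attention mask into an additive bias that is summed with the raw attention scores before the softmax, so masked positions need a very large negative fill value; the two variants visible in the hunk differ only in that fill value (torch.finfo(dtype).min versus a flat -1000). A small self-contained sketch of the mechanism, with made-up scores:

```python
import torch

# 1 = attend, 0 = mask out; broadcast (batch, seq) -> (batch, 1, 1, seq) as an additive bias.
attention_mask = torch.tensor([[1, 1, 0]])
dtype = torch.float16

extended = attention_mask[:, None, None, :].to(dtype)
extended = (1.0 - extended) * torch.finfo(dtype).min   # 0 where kept, -65504 where masked

scores = torch.zeros(1, 1, 1, 3)                       # placeholder attention scores
probs = torch.softmax(scores + extended.float(), dim=-1)
print(probs)   # the masked position receives (near) zero attention weight
```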
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_t5.patch | 315 +++--------------- 1 file changed, 46 insertions(+), 269 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch index 4a376cf5eb..40920ac007 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch @@ -1,5 +1,5 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py 2024-08-29 11:11:23.852000000 +0800 -+++ modeling_t5.py 2024-08-29 11:19:34.572000000 +0800 +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py 2024-09-04 19:03:55.080000000 +0800 ++++ modling_t5.py 2024-09-04 19:04:47.048000000 +0800 @@ -23,8 +23,6 @@ from typing import List, Optional, Tuple import torch from torch import nn @@ -9,15 +9,6 @@ from ...activations import ACT2FN from ...modeling_outputs import ( -@@ -246,7 +244,7 @@ class T5LayerNorm(nn.Module): - - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) -- # print("self.weight.dtype=",self.weight.dtype) -+ - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) @@ -451,7 +449,6 @@ class T5Attention(nn.Module): key_value_states=None, position_bias=None, @@ -44,11 +35,11 @@ if key_value_states is None: # self-attn # (batch_size, n_heads, key_length, dim_per_head) -@@ -571,261 +566,7 @@ class T5Attention(nn.Module): +@@ -571,133 +566,16 @@ class T5Attention(nn.Module): present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- # print("output_attentions=",output_attentions) +- - if output_attentions: - outputs = outputs + (attn_weights,) - return outputs @@ -85,7 +76,6 @@ - f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" - ) - real_seq_length += past_key_value[0].shape[2] -- # print("key_value_states=",real_seq_length) - key_length = real_seq_length - - def shape(states): @@ -118,7 +108,6 @@ - value_states = project( - hidden_states, self.v, past_key_value[1] if past_key_value is not None else None - ) -- # print("key_states=",hidden_states.dtype,key_states.dtype) - # compute scores - scores = torch.matmul( - query_states, key_states.transpose(3, 2) @@ -141,143 +130,7 @@ - - if mask is not None: - position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) -- -- if self.pruned_heads: -- mask = torch.ones(position_bias.shape[1]) -- mask[list(self.pruned_heads)] = 0 -- position_bias_masked = position_bias[:, mask.bool()] -- else: -- position_bias_masked = position_bias -- -- scores += position_bias_masked -- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( -- scores -- ) # (batch_size, n_heads, seq_length, key_length) -- attn_weights = nn.functional.dropout( -- attn_weights, p=self.dropout, training=self.training -- ) # (batch_size, n_heads, seq_length, key_length) -- -- # Mask heads if we want to -- if layer_head_mask is not None: -- attn_weights = attn_weights * layer_head_mask -- -- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) -- attn_output = self.o(attn_output) -- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None -- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- # print("output_attentions=",output_attentions) -- if output_attentions: -- outputs = outputs + (attn_weights,) -- return outputs -- -- --class T5CrossAttention(T5Attention): -- def __init__(self, config: T5Config, has_relative_attention_bias=False): -- super().__init__(config, has_relative_attention_bias) -- -- def forward( -- self, -- hidden_states, -- mask=None, -- key_value_states=None, -- position_bias=None, -- past_cross_key_value=None, -- layer_head_mask=None, -- query_length=None, -- use_cache=False, -- output_attentions=False, -- ): -- """ -- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). -- """ -- # Input is (batch_size, seq_length, dim) -- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) -- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) -- batch_size, seq_length = hidden_states.shape[:2] -- -- real_seq_length = seq_length -- -- if past_key_value is not None: -- if past_key_value.shape[0] != 2: -- # if len(past_key_value) != 2: -- raise ValueError( -- f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" -- ) -- real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length -- # print("key_value_states=",key_value_states, real_seq_length) -- key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] -- -- def shape(states): -- """projection""" -- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) -- -- def unshape(states): -- """reshape""" -- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) -- -- def project(hidden_states, proj_layer, key_value_states, past_key_value): -- """projects hidden states correctly to key/query states""" -- if key_value_states is None: -- # self-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(hidden_states)) -- elif past_key_value is None: -- # cross-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(key_value_states)) -- -- if past_key_value is not None: -- if key_value_states is None: -- # self-attn -- # (batch_size, n_heads, key_length, dim_per_head) -- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) -- # print("hidden_states=",hidden_states.shape) -- elif past_key_value.shape[2] != key_value_states.shape[1]: -- # checking that the `sequence_length` of the `past_key_value` is the same as -- # the provided `key_value_states` to support prefix tuning -- # cross-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(key_value_states)) -- else: -- # cross-attn -- hidden_states = past_key_value -- return hidden_states -- -- # get query states -- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) -- -- # get key/value states -- key_states = project( -- hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None -- ) -- value_states = project( -- hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None -- ) -- -- # compute scores -- scores = torch.matmul( -- query_states, key_states.transpose(3, 2) -- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 -- -- if position_bias is None: -- if not self.has_relative_attention_bias: -- position_bias = torch.zeros( -- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype -- ) -- if self.gradient_checkpointing and self.training: -- position_bias.requires_grad = True -- else: -- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) -- -- # if key and values are already calculated -- # we want only the last query position bias -- if past_key_value is not None: -- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] -- -- if mask is not None: -- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) -- - if self.pruned_heads: - mask = torch.ones(position_bias.shape[1]) - mask[list(self.pruned_heads)] = 0 @@ -302,11 +155,13 @@ - - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- # print("output_attentions=",output_attentions) if output_attentions: outputs = outputs + (attn_weights,) return outputs -@@ -834,7 +575,7 @@ class T5CrossAttention(T5Attention): + + +- +- class T5LayerSelfAttention(nn.Module): def __init__(self, 
config, has_relative_attention_bias=False): super().__init__() @@ -315,7 +170,7 @@ self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) -@@ -921,7 +662,6 @@ class T5Block(nn.Module): +@@ -784,7 +662,6 @@ class T5Block(nn.Module): layer_head_mask=None, cross_attn_layer_head_mask=None, past_key_value=None, @@ -323,7 +178,7 @@ use_cache=False, output_attentions=False, return_dict=True, -@@ -931,17 +671,15 @@ class T5Block(nn.Module): +@@ -794,15 +671,15 @@ class T5Block(nn.Module): logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 @@ -333,56 +188,32 @@ - # f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" - # f"Got {len(past_key_value)} past key / value states" - # ) -- -- self_attn_past_key_value = past_key_value -- # print("self_attn_past_key_value=",self_attn_past_key_value.dtype) -- cross_attn_past_key_value = past_cross_key_value -- # cross_attn_past_key_value = past_key_value[2:] + if len(past_key_value) != expected_num_past_key_values: + raise ValueError( + f"There should be {expected_num_past_key_values} past states. " + f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" + f"Got {len(past_key_value)} past key / value states" + ) -+ + +- self_attn_past_key_value = past_key_value +- cross_attn_past_key_value = past_cross_key_value + self_attn_past_key_value = past_key_value[:2] + cross_attn_past_key_value = past_key_value[2:] else: self_attn_past_key_value, cross_attn_past_key_value = None, None -@@ -955,8 +693,6 @@ class T5Block(nn.Module): - output_attentions=output_attentions, - ) - hidden_states, present_key_value_state = self_attention_outputs[:2] -- # if self.is_decoder: -- # print("present_key_value_state=",present_key_value_state[0].dtype) - attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights - - # clamp inf values to enable fp16 training -@@ -967,7 +703,7 @@ class T5Block(nn.Module): - torch.finfo(hidden_states.dtype).max, - ) - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) -- present_cross_key_value_state = () -+ - do_cross_attention = self.is_decoder and encoder_hidden_states is not None - if do_cross_attention: - # the actual query length is unknown for cross attention -@@ -1000,10 +736,9 @@ class T5Block(nn.Module): +@@ -859,7 +736,9 @@ class T5Block(nn.Module): hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) # Combine self attn and cross attn key value states -- # if present_key_value_state is not None: -- # present_key_value_state = present_key_value_state + cross_attention_outputs[1] - cross_attn_past_key_values = cross_attention_outputs[1] -- # print("cross_attn_past_key_values=",cross_attn_past_key_values) + if present_key_value_state is not None: + present_key_value_state = present_key_value_state + cross_attention_outputs[1] + # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] -@@ -1022,7 +757,7 @@ class T5Block(nn.Module): +@@ -878,7 +757,7 @@ class T5Block(nn.Module): outputs = (hidden_states,) if use_cache: @@ -391,7 +222,7 @@ else: outputs = outputs + attention_outputs -@@ -1162,14 +897,11 @@ class T5PreTrainedModel(PreTrainedModel) +@@ -1018,14 +897,11 @@ class T5PreTrainedModel(PreTrainedModel) class 
T5Stack(T5PreTrainedModel): @@ -407,7 +238,7 @@ self.block = nn.ModuleList( [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -1237,21 +969,19 @@ class T5Stack(T5PreTrainedModel): +@@ -1093,14 +969,13 @@ class T5Stack(T5PreTrainedModel): def forward( self, input_ids=None, @@ -424,21 +255,10 @@ use_cache=None, output_attentions=None, output_hidden_states=None, - return_dict=None, - ): - # Model parallel -- # print("aaaaaaaaaaaaaaaaa") - if self.model_parallel: - torch.cuda.set_device(self.first_device) - self.embed_tokens = self.embed_tokens.to(self.first_device) -@@ -1291,13 +1021,9 @@ class T5Stack(T5PreTrainedModel): +@@ -1146,9 +1021,9 @@ class T5Stack(T5PreTrainedModel): raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") # initialize past_key_values with `None` if past does not exist -- #modified -- # if past_key_values is None: -- # past_key_values = [None] * len(self.block) -- #added - if not self.is_decoder: + if past_key_values is None: past_key_values = [None] * len(self.block) @@ -447,19 +267,16 @@ if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) -@@ -1328,10 +1054,7 @@ class T5Stack(T5PreTrainedModel): +@@ -1179,7 +1054,7 @@ class T5Stack(T5PreTrainedModel): # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) -- # present_key_value_states = () if use_cache else None -- # present_cross_key_value_states = () if use_cache else None - present_key_value_states = [] if use_cache else None -- # present_cross_key_value_states = [] if use_cache else None + present_key_value_states = () if use_cache else None all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1339,10 +1062,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1187,10 +1062,8 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) @@ -472,7 +289,7 @@ layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] # Model parallel -@@ -1392,7 +1113,6 @@ class T5Stack(T5PreTrainedModel): +@@ -1240,7 +1113,6 @@ class T5Stack(T5PreTrainedModel): layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, past_key_value=past_key_value, @@ -480,7 +297,7 @@ use_cache=use_cache, output_attentions=output_attentions, ) -@@ -1400,22 +1120,19 @@ class T5Stack(T5PreTrainedModel): +@@ -1248,19 +1120,19 @@ class T5Stack(T5PreTrainedModel): # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: @@ -501,14 +318,11 @@ # append next layer key value states if use_cache: - present_key_value_states.extend(present_key_value_state) -- # present_cross_key_value_states.extend(present_cross_key_value_state) -- # present_key_value_states = present_key_value_states + (present_key_value_state,) -- # present_cross_key_value_states = present_cross_key_value_states + (present_cross_key_value_state,) + present_key_value_states = present_key_value_states + (present_key_value_state,) if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) -@@ -1429,52 +1146,31 @@ class 
T5Stack(T5PreTrainedModel): +@@ -1274,7 +1146,7 @@ class T5Stack(T5PreTrainedModel): hidden_states = hidden_states.to("cuda:" + str(k + 1)) hidden_states = self.final_layer_norm(hidden_states) @@ -517,48 +331,20 @@ # Add last layer if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) -- # print("return_dict=",return_dict) -+ - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - present_key_value_states, -- # present_cross_key_value_states, - all_hidden_states, - all_attentions, - all_cross_attentions, +@@ -1292,17 +1164,13 @@ class T5Stack(T5PreTrainedModel): ] if v is not None ) - present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None -- # present_cross_key_value_states = torch.concat(present_cross_key_value_states).reshape(len(self.block), 2, -- # *present_cross_key_value_states[0].shape) if use_cache else None -- # print("dddddddddddd") -- # if use_cache: -- # print("present_key_value_states.shape=",present_key_value_states.shape,present_key_value_states.dtype) -- # return BaseModelOutputWithPastAndCrossAttentions( -- # last_hidden_state=hidden_states, -- # past_key_values=present_key_value_states, -- # past_cross_key_values=present_cross_key_value_states -- # ) - if not self.is_decoder and self.encodecrosskeyvalue: - res = self.encodecrosskeyvalue(hidden_states) - return tuple((hidden_states, res)) -- # return BaseModelOutputWithPastAndCrossAttentions( -- # last_hidden_state=hidden_states, -- # past_key_values=present_key_value_states, -- # # past_cross_key_values=past_cross_key_values, -- # hidden_states=all_hidden_states, -- # attentions=all_attentions, -- # cross_attentions=all_cross_attentions, -- # ) +- lm_logits = None - if self.is_decoder: +- #logits = None - if self.config.tie_word_embeddings: -- hidden_states_1 = hidden_states * (self.model_dim ** -0.5) -- lm_logits = self.lm_head(hidden_states_1) +- hidden_states = hidden_states * (self.model_dim ** -0.5) +- lm_logits = self.lm_head(hidden_states) - return tuple((lm_logits, present_key_value_states)) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, @@ -570,7 +356,7 @@ T5_START_DOCSTRING = r""" -@@ -1845,28 +1541,6 @@ class T5Model(T5PreTrainedModel): +@@ -1673,31 +1541,6 @@ class T5Model(T5PreTrainedModel): ) @@ -587,11 +373,14 @@ - def forward(self, hidden_states): - batch_size = hidden_states.shape[0] - encoder_hidden_states_kvs = [] +- # for i in range(len(self.cross_value)): +- # encoder_hidden_states_kvs.append( +- # torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), +- # self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) - for i in range(len(self.cross_value)): - encoder_hidden_states_kvs.append( -- torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), -- self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) -- +- torch.stack((self.cross_key[i](hidden_states), +- self.cross_value[i](hidden_states)), dim=0)) - past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0) - return past_cross_key_values - @@ -599,7 +388,7 @@ @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) class T5ForConditionalGeneration(T5PreTrainedModel): 
_keys_to_ignore_on_load_unexpected = [ -@@ -1874,47 +1548,28 @@ class T5ForConditionalGeneration(T5PreTr +@@ -1705,47 +1548,28 @@ class T5ForConditionalGeneration(T5PreTr ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] @@ -666,7 +455,7 @@ # Model parallel self.model_parallel = False -@@ -1993,7 +1648,6 @@ class T5ForConditionalGeneration(T5PreTr +@@ -1824,7 +1648,6 @@ class T5ForConditionalGeneration(T5PreTr cross_attn_head_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, @@ -674,14 +463,13 @@ inputs_embeds: Optional[torch.FloatTensor] = None, decoder_inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, -@@ -2041,25 +1695,76 @@ class T5ForConditionalGeneration(T5PreTr +@@ -1872,23 +1695,76 @@ class T5ForConditionalGeneration(T5PreTr if self.config.num_layers == self.config.num_decoder_layers: warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) decoder_head_mask = head_mask - - hidden_states = encoder_outputs["last_hidden_state"] -- # import pdb -- # pdb.set_trace() +- past_cross_key_values = encoder_outputs["past_cross_key_values"] - # if self.model_parallel: - # torch.cuda.set_device(self.decoder.first_device) @@ -713,13 +501,12 @@ # get decoder inputs from shifting lm labels to the right decoder_input_ids = self._shift_right(labels) -- import time -- start_time = time.time() - with torch.npu.stream(self.stream): # set stream -- -- decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) +- # import pdb +- # pdb.set_trace() +- decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) - self.stream.synchronize() # synchronize -- print("time is", time.time() - start_time) +- # decoder_outputs = self.decoder(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) @@ -764,7 +551,7 @@ loss = None if labels is not None: -@@ -2072,17 +1777,23 @@ class T5ForConditionalGeneration(T5PreTr +@@ -1901,17 +1777,23 @@ class T5ForConditionalGeneration(T5PreTr if not return_dict: output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs return ((loss,) + output) if loss is not None else output @@ -791,7 +578,7 @@ attention_mask=None, head_mask=None, decoder_head_mask=None, -@@ -2108,7 +1819,6 @@ class T5ForConditionalGeneration(T5PreTr +@@ -1937,7 +1819,6 @@ class T5ForConditionalGeneration(T5PreTr return { "decoder_input_ids": input_ids, "past_key_values": past_key_values, @@ -799,17 +586,7 @@ "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "head_mask": head_mask, -@@ -2168,9 +1878,6 @@ class T5EncoderModel(T5PreTrainedModel): - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False - self.encoder = T5Stack(encoder_config, self.shared) -- self.encoder_mindie = torch.jit.load(r"/opt/naie/zhenwenqi/T5/encoder/encoder_compiled.pt") -- # self.decoder_mindie = torch.jit.load(r"/opt/naie/zhenwenqi/T5/decoder/decoder_compiled.pt") -- self.stream = torch.npu.Stream("npu:2") - - # Initialize weights and apply final processing - self.post_init() -@@ -2260,6 +1967,7 @@ class T5EncoderModel(T5PreTrainedModel): +@@ -2086,6 +1967,7 @@ class 
T5EncoderModel(T5PreTrainedModel): >>> last_hidden_states = outputs.last_hidden_state ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict -- Gitee From 09425109e1d96ca2aaef3f9c2ff50799c4818da2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 4 Sep 2024 11:15:02 +0000 Subject: [PATCH 033/110] update MindIE/MindIE-Torch/built-in/T5/utils.patch. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/utils.patch | 49 ++++++++++++--------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/utils.patch b/MindIE/MindIE-Torch/built-in/T5/utils.patch index 811327bbc6..4968e30c2b 100644 --- a/MindIE/MindIE-Torch/built-in/T5/utils.patch +++ b/MindIE/MindIE-Torch/built-in/T5/utils.patch @@ -1,15 +1,18 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/generation/utils.py 2024-08-29 11:22:09.280000000 +0800 -+++ utils.py 2024-08-29 16:28:18.360000000 +0800 -@@ -507,7 +507,7 @@ class GenerationMixin: +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/generation/utils.py 2024-09-04 17:07:15.776000000 +0800 ++++ utils.py 2024-09-04 19:05:05.300000000 +0800 +@@ -507,10 +507,7 @@ class GenerationMixin: generation_config: GenerationConfig, ) -> Dict[str, Any]: # 1. get encoder -- encoder = self.encoder_mindie +- if self.encoder_mindie: +- encoder = self.encoder_mindie +- else: +- encoder = self.get_encoder() + encoder = self.get_encoder() # Compatibility with Accelerate big model inference: we need the encoder to outputs stuff on the same device # as the inputs. if hasattr(self, "hf_device_map"): -@@ -523,12 +523,12 @@ class GenerationMixin: +@@ -526,12 +523,12 @@ class GenerationMixin: for argument, value in model_kwargs.items() if not any(argument.startswith(p) for p in irrelevant_prefix) } @@ -28,23 +31,23 @@ encoder_kwargs["output_attentions"] = generation_config.output_attentions encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states -@@ -536,13 +536,8 @@ class GenerationMixin: +@@ -539,13 +536,8 @@ class GenerationMixin: model_input_name = model_input_name if model_input_name is not None else self.main_input_name encoder_kwargs["return_dict"] = True encoder_kwargs[model_input_name] = inputs_tensor -- with torch.npu.stream(self.stream): # set stream -- encoder_outputs=encoder.forward(encoder_kwargs["input_ids"]) -- self.stream.synchronize() # synchronize +- if self.encoder_mindie: +- with torch.npu.stream(self.stream): # set stream +- encoder_outputs=encoder.forward(encoder_kwargs["input_ids"]) +- self.stream.synchronize() # synchronize +- else: +- encoder_outputs = encoder(**encoder_kwargs) - model_kwargs["encoder_outputs"]: ModelOutput = {"last_hidden_state":encoder_outputs[0], "past_cross_key_values":encoder_outputs[1]} -- # import pdb -- # pdb.set_trace() -- # print("encoder_finished") + model_kwargs["encoder_outputs"]: ModelOutput = encoder(**encoder_kwargs) + return model_kwargs def _prepare_decoder_input_ids_for_generation( -@@ -667,9 +662,6 @@ class GenerationMixin: +@@ -670,9 +662,6 @@ class GenerationMixin: outputs, standardize_cache_format=standardize_cache_format ) model_kwargs[cache_name] = cache @@ -54,7 +57,7 @@ if getattr(outputs, "state", None) is not None: model_kwargs["state"] = outputs.state -@@ -1801,16 +1793,16 @@ class GenerationMixin: +@@ -1804,16 +1793,16 @@ class GenerationMixin: "`streamer` cannot be used 
with beam search (yet!). Make sure that `num_beams` is set to 1." ) @@ -81,7 +84,14 @@ # 8. prepare distribution pre_processing samplers prepared_logits_processor = self._get_logits_processor( -@@ -2650,10 +2642,7 @@ class GenerationMixin: +@@ -2647,20 +2636,15 @@ class GenerationMixin: + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) +- + + # keep track of which sequences are already finished + batch_size = input_ids.shape[0] this_peer_finished = False unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) @@ -92,12 +102,7 @@ + while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): # prepare model inputs +- model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) -@@ -2711,7 +2700,6 @@ class GenerationMixin: - # update generated ids, model inputs, and length for next step - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) -- # print("aaaa",input_ids) - if streamer is not None: - streamer.put(next_tokens.cpu()) - model_kwargs = self._update_model_kwargs_for_generation( + # forward pass to get next token -- Gitee From c84ed5a2f3a7585343d0f60c72fc973a1428617c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 4 Sep 2024 11:31:17 +0000 Subject: [PATCH 034/110] =?UTF-8?q?modleing=5Fmt5=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_mt5.patch | 568 ++++++++++++++++++ 1 file changed, 568 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch new file mode 100644 index 0000000000..38eb59c192 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch @@ -0,0 +1,568 @@ +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py 2024-09-03 19:47:27.392000000 +0800 ++++ modeling_mt5.py 2024-09-04 19:29:28.348000000 +0800 +@@ -324,6 +324,7 @@ class MT5Attention(nn.Module): + key_value_states=None, + position_bias=None, + past_key_value=None, ++ past_cross_key_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, +@@ -340,7 +341,8 @@ class MT5Attention(nn.Module): + real_seq_length = seq_length + + if past_key_value is not None: +- if len(past_key_value) != 2: ++ if past_key_value.shape[0] != 2: ++ # if len(past_key_value) != 2: + raise ValueError( + f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" + ) +@@ -368,6 +370,7 @@ class MT5Attention(nn.Module): + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: ++ past_key_value = shape(past_key_value) + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) +@@ -446,12 +449,125 @@ class MT5Attention(nn.Module): + outputs = outputs + (attn_weights,) + return outputs + ++class MT5SelfAttention(MT5Attention): ++ def __init__(self, config: T5Config, has_relative_attention_bias=False): ++ super().__init__(config, has_relative_attention_bias) ++ ++ def forward( ++ self, ++ hidden_states, ++ mask=None, ++ position_bias=None, ++ past_key_value=None, ++ layer_head_mask=None, ++ use_cache=False, ++ output_attentions=False, ++ ): ++ """ ++ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). ++ """ ++ # Input is (batch_size, seq_length, dim) ++ # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) ++ # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) ++ batch_size, seq_length = hidden_states.shape[:2] ++ ++ real_seq_length = seq_length ++ ++ if past_key_value is not None: ++ if past_key_value.shape[0] != 2: ++ # if len(past_key_value) != 2: ++ raise ValueError( ++ f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" ++ ) ++ real_seq_length += past_key_value[0].shape[2] ++ key_length = real_seq_length ++ ++ def shape(states): ++ """projection""" ++ return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) ++ ++ def unshape(states): ++ """reshape""" ++ return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) ++ ++ def project(hidden_states, proj_layer, past_key_value): ++ """projects hidden states correctly to key/query states""" ++ if past_key_value is None: ++ # cross-attn ++ # (batch_size, n_heads, seq_length, dim_per_head) ++ hidden_states = shape(proj_layer(hidden_states)) ++ ++ if past_key_value is not None: ++ hidden_states = shape(proj_layer(hidden_states)) ++ hidden_states = torch.cat([past_key_value, hidden_states], dim=2) ++ return hidden_states ++ ++ # get query states ++ query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) ++ ++ # get key/value states ++ key_states = project( ++ hidden_states, self.k, past_key_value[0] if past_key_value is not None else None ++ ) ++ value_states = project( ++ hidden_states, self.v, past_key_value[1] if past_key_value is not None else None ++ ) ++ # compute scores ++ scores = torch.matmul( ++ query_states, key_states.transpose(3, 2) ++ ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 ++ # print("scores=",scores.dtype) ++ if position_bias is None: ++ if not self.has_relative_attention_bias: ++ position_bias = torch.zeros( ++ (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype ++ ) ++ if self.gradient_checkpointing and self.training: ++ position_bias.requires_grad = True ++ else: ++ position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) ++ ++ # if key and values are already calculated ++ # we want only the last query position bias ++ if past_key_value is not None: ++ position_bias = position_bias[:, :, -hidden_states.size(1) :, :] ++ ++ if mask is not None: ++ position_bias = position_bias + mask # (batch_size, n_heads, 
seq_length, key_length) ++ ++ if self.pruned_heads: ++ mask = torch.ones(position_bias.shape[1]) ++ mask[list(self.pruned_heads)] = 0 ++ position_bias_masked = position_bias[:, mask.bool()] ++ else: ++ position_bias_masked = position_bias ++ ++ scores += position_bias_masked ++ attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( ++ scores ++ ) # (batch_size, n_heads, seq_length, key_length) ++ attn_weights = nn.functional.dropout( ++ attn_weights, p=self.dropout, training=self.training ++ ) # (batch_size, n_heads, seq_length, key_length) ++ ++ # Mask heads if we want to ++ if layer_head_mask is not None: ++ attn_weights = attn_weights * layer_head_mask ++ ++ attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) ++ attn_output = self.o(attn_output) ++ ++ present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None ++ outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) ++ if output_attentions: ++ outputs = outputs + (attn_weights,) ++ return outputs + + # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->MT5 + class MT5LayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() +- self.SelfAttention = MT5Attention(config, has_relative_attention_bias=has_relative_attention_bias) ++ self.SelfAttention = MT5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + +@@ -540,6 +656,7 @@ class MT5Block(nn.Module): + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, ++ past_cross_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, +@@ -549,15 +666,15 @@ class MT5Block(nn.Module): + logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + +- if len(past_key_value) != expected_num_past_key_values: +- raise ValueError( +- f"There should be {expected_num_past_key_values} past states. " +- f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" +- f"Got {len(past_key_value)} past key / value states" +- ) ++ # if len(past_key_value) != expected_num_past_key_values: ++ # raise ValueError( ++ # f"There should be {expected_num_past_key_values} past states. " ++ # f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" ++ # f"Got {len(past_key_value)} past key / value states" ++ # ) + +- self_attn_past_key_value = past_key_value[:2] +- cross_attn_past_key_value = past_key_value[2:] ++ self_attn_past_key_value = past_key_value ++ cross_attn_past_key_value = past_cross_key_value + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + +@@ -614,9 +731,7 @@ class MT5Block(nn.Module): + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states +- if present_key_value_state is not None: +- present_key_value_state = present_key_value_state + cross_attention_outputs[1] +- ++ cross_attn_past_key_values = cross_attention_outputs[1] + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + +@@ -635,7 +750,7 @@ class MT5Block(nn.Module): + outputs = (hidden_states,) + + if use_cache: +- outputs = outputs + (present_key_value_state,) + attention_outputs ++ outputs = outputs + (present_key_value_state,) +(cross_attn_past_key_values,)+ attention_outputs + else: + outputs = outputs + attention_outputs + +@@ -884,11 +999,14 @@ class MT5PreTrainedModel(PreTrainedModel + + # Copied from transformers.models.t5.modeling_t5.T5Stack with T5->MT5 + class MT5Stack(MT5PreTrainedModel): +- def __init__(self, config, embed_tokens=None): ++ def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskeyvalue=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder ++ self.lm_head=lm_head ++ self.encodecrosskeyvalue = encodecrosskeyvalue ++ self.model_dim = config.d_model + + self.block = nn.ModuleList( + [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] +@@ -956,13 +1074,14 @@ class MT5Stack(MT5PreTrainedModel): + def forward( + self, + input_ids=None, +- attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, ++ past_key_values=None, ++ past_cross_key_values=None, ++ attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, +- past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, +@@ -1008,9 +1127,9 @@ class MT5Stack(MT5PreTrainedModel): + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") + + # initialize past_key_values with `None` if past does not exist +- if past_key_values is None: ++ if not self.is_decoder: + past_key_values = [None] * len(self.block) +- ++ past_cross_key_values = [None] * len(self.block) + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + +@@ -1041,7 +1160,7 @@ class MT5Stack(MT5PreTrainedModel): + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) +- present_key_value_states = () if use_cache else None ++ present_key_value_states = [] if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None +@@ -1049,8 +1168,10 @@ class MT5Stack(MT5PreTrainedModel): + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) +- +- for i, (layer_module, past_key_value) in 
enumerate(zip(self.block, past_key_values)): ++ for i, layer_module in enumerate(self.block): ++ # for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): ++ past_key_value = past_key_values[i] ++ past_cross_key_value = past_cross_key_values[i] + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel +@@ -1100,6 +1221,7 @@ class MT5Stack(MT5PreTrainedModel): + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, + past_key_value=past_key_value, ++ past_cross_key_value=past_cross_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -1107,19 +1229,19 @@ class MT5Stack(MT5PreTrainedModel): + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: +- layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] ++ layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] + +- hidden_states, present_key_value_state = layer_outputs[:2] ++ hidden_states, present_key_value_state, present_cross_key_value_state = layer_outputs[:3] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) +- position_bias = layer_outputs[2] ++ position_bias = layer_outputs[3] + if self.is_decoder and encoder_hidden_states is not None: +- encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] ++ encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] + # append next layer key value states + if use_cache: +- present_key_value_states = present_key_value_states + (present_key_value_state,) ++ present_key_value_states.extend(present_key_value_state) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) +@@ -1133,7 +1255,7 @@ class MT5Stack(MT5PreTrainedModel): + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) +- hidden_states = self.dropout(hidden_states) ++ hidden_states = self.dropout(hidden_states).half() + + # Add last layer + if output_hidden_states: +@@ -1151,13 +1273,17 @@ class MT5Stack(MT5PreTrainedModel): + ] + if v is not None + ) +- return BaseModelOutputWithPastAndCrossAttentions( +- last_hidden_state=hidden_states, +- past_key_values=present_key_value_states, +- hidden_states=all_hidden_states, +- attentions=all_attentions, +- cross_attentions=all_cross_attentions, +- ) ++ present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None ++ if not self.is_decoder and self.encodecrosskeyvalue: ++ res = self.encodecrosskeyvalue(hidden_states) ++ return tuple((hidden_states, res)) ++ lm_logits = None ++ if self.is_decoder: ++ #logits = None ++ if self.config.tie_word_embeddings: ++ hidden_states = hidden_states * (self.model_dim ** -0.5) ++ lm_logits = self.lm_head(hidden_states) ++ return tuple((lm_logits, present_key_value_states)) + + + MT5_START_DOCSTRING = r""" +@@ -1549,6 +1675,29 @@ class MT5Model(MT5PreTrainedModel): + ) + + ++class EncoderToCrossKeyValue(nn.Module): ++ def __init__(self, cross_key, cross_value, num_heads, d_kv): ++ super().__init__() ++ self.cross_key = 
cross_key
++        self.cross_value = cross_value
++        self.num_heads = num_heads
++        self.d_kv = d_kv
++
++
++    def forward(self, hidden_states):
++        batch_size = hidden_states.shape[0]
++        encoder_hidden_states_kvs = []
++        # for i in range(len(self.cross_value)):
++        #     encoder_hidden_states_kvs.append(
++        #         torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2),
++        #                      self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0))
++        for i in range(len(self.cross_value)):
++            encoder_hidden_states_kvs.append(
++                torch.stack((self.cross_key[i](hidden_states),
++                             self.cross_value[i](hidden_states)), dim=0))
++        past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0)
++        return past_cross_key_values
++
+ @add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING)
+ class MT5ForConditionalGeneration(MT5PreTrainedModel):
+     r"""
+@@ -1573,28 +1722,45 @@ class MT5ForConditionalGeneration(MT5Pre
+     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
+
+     # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5
+-    def __init__(self, config: MT5Config):
++    def __init__(self, config: MT5Config, encoder_path=None, decoder_path=None, device_id=0):
+         super().__init__(config)
+-        self.model_dim = config.d_model
+-
+-        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+-
+-        encoder_config = copy.deepcopy(config)
+-        encoder_config.is_decoder = False
+-        encoder_config.use_cache = False
+-        encoder_config.is_encoder_decoder = False
+-        self.encoder = MT5Stack(encoder_config, self.shared)
+-
+-        decoder_config = copy.deepcopy(config)
+-        decoder_config.is_decoder = True
+-        decoder_config.is_encoder_decoder = False
+-        decoder_config.num_layers = config.num_decoder_layers
+-        self.decoder = MT5Stack(decoder_config, self.shared)
+-
+-        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
++
++        self.encoder_path = encoder_path
++        self.decoder_path = decoder_path
++        if not self.encoder_path or not self.decoder_path:
++            self.model_dim = config.d_model
++            self.shared = nn.Embedding(config.vocab_size, config.d_model)
++            decoder_config = copy.deepcopy(config)
++            decoder_config.is_decoder = True
++            decoder_config.is_encoder_decoder = False
++            decoder_config.num_layers = config.num_decoder_layers
++
++
++            self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
++            self.decoder = MT5Stack(decoder_config, self.shared, self.lm_head)
++            cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers))
++            cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers))
++            encodecrosskeyvalue = EncoderToCrossKeyValue(cross_key, cross_value, config.num_heads, config.d_kv)
++            encoder_config = copy.deepcopy(config)
++            encoder_config.is_decoder = False
++            encoder_config.use_cache = False
++            encoder_config.is_encoder_decoder = False
++            self.encoder = MT5Stack(encoder_config, self.shared, encodecrosskeyvalue=encodecrosskeyvalue)
++        self.encoder_mindie = None
++        self.decoder_mindie = None
++        if self.encoder_path:
++            self.encoder_mindie = torch.jit.load(self.encoder_path)
++        if self.decoder_path:
++            self.decoder_mindie = torch.jit.load(self.decoder_path)
++        self.stream = torch.npu.Stream(f"npu:{device_id}")
++        self.device_id = device_id
++
++
++    def get_device(self):
++ return f"npu:{self.device_id}" + + # Initialize weights and apply final processing +- self.post_init() ++ # self.post_init() + + # Model parallel + self.model_parallel = False +@@ -1677,6 +1843,7 @@ class MT5ForConditionalGeneration(MT5Pre + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, ++ past_cross_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, +@@ -1724,76 +1891,23 @@ class MT5ForConditionalGeneration(MT5Pre + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask ++ ++ hidden_states = encoder_outputs["last_hidden_state"] ++ past_cross_key_values = encoder_outputs["past_cross_key_values"] + +- # Encode if needed (training, first prediction pass) +- if encoder_outputs is None: +- # Convert encoder inputs in embeddings if needed +- encoder_outputs = self.encoder( +- input_ids=input_ids, +- attention_mask=attention_mask, +- inputs_embeds=inputs_embeds, +- head_mask=head_mask, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): +- encoder_outputs = BaseModelOutput( +- last_hidden_state=encoder_outputs[0], +- hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, +- attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, +- ) +- +- hidden_states = encoder_outputs[0] +- +- if self.model_parallel: +- torch.cuda.set_device(self.decoder.first_device) ++ # if self.model_parallel: ++ # torch.cuda.set_device(self.decoder.first_device) + + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + +- # Set device for model parallelism +- if self.model_parallel: +- torch.cuda.set_device(self.decoder.first_device) +- hidden_states = hidden_states.to(self.decoder.first_device) +- if decoder_input_ids is not None: +- decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) +- if attention_mask is not None: +- attention_mask = attention_mask.to(self.decoder.first_device) +- if decoder_attention_mask is not None: +- decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) +- +- # Decode +- decoder_outputs = self.decoder( +- input_ids=decoder_input_ids, +- attention_mask=decoder_attention_mask, +- inputs_embeds=decoder_inputs_embeds, +- past_key_values=past_key_values, +- encoder_hidden_states=hidden_states, +- encoder_attention_mask=attention_mask, +- head_mask=decoder_head_mask, +- cross_attn_head_mask=cross_attn_head_mask, +- use_cache=use_cache, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- +- sequence_output = decoder_outputs[0] +- +- # Set device for model parallelism +- if self.model_parallel: +- torch.cuda.set_device(self.encoder.first_device) +- self.lm_head = self.lm_head.to(self.encoder.first_device) +- sequence_output = sequence_output.to(self.lm_head.weight.device) +- +- if self.config.tie_word_embeddings: +- # Rescale output before projecting on vocab +- # See 
https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 +- sequence_output = sequence_output * (self.model_dim**-0.5) +- +- lm_logits = self.lm_head(sequence_output) ++ with torch.npu.stream(self.stream): # set stream ++ # import pdb ++ # pdb.set_trace() ++ decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) ++ self.stream.synchronize() # synchronize ++ # decoder_outputs = self.decoder(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) + + loss = None + if labels is not None: +@@ -1806,17 +1920,10 @@ class MT5ForConditionalGeneration(MT5Pre + if not return_dict: + output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs + return ((loss,) + output) if loss is not None else output +- + return Seq2SeqLMOutput( + loss=loss, +- logits=lm_logits, +- past_key_values=decoder_outputs.past_key_values, +- decoder_hidden_states=decoder_outputs.hidden_states, +- decoder_attentions=decoder_outputs.attentions, +- cross_attentions=decoder_outputs.cross_attentions, +- encoder_last_hidden_state=encoder_outputs.last_hidden_state, +- encoder_hidden_states=encoder_outputs.hidden_states, +- encoder_attentions=encoder_outputs.attentions, ++ logits=decoder_outputs[0], ++ past_key_values=decoder_outputs[1] + ) + + # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation +@@ -1824,6 +1931,7 @@ class MT5ForConditionalGeneration(MT5Pre + self, + input_ids, + past_key_values=None, ++ past_cross_key_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, +@@ -1849,6 +1957,7 @@ class MT5ForConditionalGeneration(MT5Pre + return { + "decoder_input_ids": input_ids, + "past_key_values": past_key_values, ++ "past_cross_key_values": past_cross_key_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, -- Gitee From a6a3836f988646bf890a1f7b13d4eba8a62c44ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 4 Sep 2024 12:34:29 +0000 Subject: [PATCH 035/110] update MindIE/MindIE-Torch/built-in/T5/export_t5.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/export_t5.py | 24 ++++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5.py b/MindIE/MindIE-Torch/built-in/T5/export_t5.py index cdb7631c82..5fa13d3c0a 100644 --- a/MindIE/MindIE-Torch/built-in/T5/export_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/export_t5.py @@ -85,14 +85,14 @@ def export_textencoder(args, model, save_dir, batch_size): encoder.eval() torch.jit.trace(encoder, dummy_input, strict=False).save(traced_path) if not os.path.exists(compiled_path): - model = torch.jit.load(traced_path).eval() + traced_model = torch.jit.load(traced_path).eval() inputs0 = [] # inputs1 = [] inputs0.append(mindietorch.Input(min_shape = (1,1), max_shape= (args.max_batchsize, args.max_input_seq_len), dtype=torch.int64)) print("compiling encoder") compiled_model = mindietorch.compile( - model, + traced_model, inputs=inputs0, allow_tensor_replace_int=True, require_full_compilation=False, @@ -115,16 +115,16 @@ def export_textdecoder(args, model, save_dir, batch_size): text_decoder = model.decoder dummy_input = ( torch.ones([1, 1], dtype=torch.int64).npu(), - torch.randn(1,16,512).to(torch.float16).npu(), + torch.randn(1, 16, model.config.d_model).to(torch.float16).npu(), torch.ones(1,16).npu(), - torch.randn(6,2,1,8,1,64).to(torch.float16).npu(), - torch.randn(6,2,1,8,24,64).to(torch.float16).npu() + torch.randn(model.config.num_layers, 2, 1, model.config.num_heads, 1, model.config.d_kv).to(torch.float16).npu(), + torch.randn(model.config.num_layers, 2, 1, 16, model.config.d_model).to(torch.float16).npu() ) decoder = TextDecoderExport(text_decoder).npu() decoder.eval() torch.jit.trace(decoder, dummy_input,strict=False).save(traced_path) if not os.path.exists(compiled_path): - model = torch.jit.load(traced_path).eval() + traced_model = torch.jit.load(traced_path).eval() print("compiling decoder") compiled_model = mindietorch.compile( model, @@ -132,19 +132,19 @@ def export_textdecoder(args, model, save_dir, batch_size): max_shape = (args.max_batchsize,1), dtype=mindietorch.dtype.INT64), - mindietorch.Input(min_shape =(1, 1, 512), - max_shape=(args.max_batchsize, args.max_input_seq_len, 512), + mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), dtype=mindietorch.dtype.FLOAT16), mindietorch.Input(min_shape = (1,1), max_shape =(args.max_batchsize,args.max_input_seq_len), dtype=mindietorch.dtype.INT64), - mindietorch.Input(min_shape = (6,2,1,8,0,64), - max_shape = (6,2,args.max_batchsize,8,args.max_input_seq_len,64), + mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, model.config.num_heads, 0, model.config.d_kv), + max_shape = (model.config.num_layers, 2, args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), dtype=mindietorch.dtype.FLOAT16), - mindietorch.Input(min_shape = (6,2,1,8,1,64), - max_shape = (6,2,args.max_batchsize,8,args.max_input_seq_len,64), + mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, 1, model.config.d_model), + max_shape = (model.config.num_layers, 2, args.max_batchsize, args.max_input_seq_len, model.config.d_model), dtype=mindietorch.dtype.FLOAT16)], allow_tensor_replace_int=True, require_full_compilation=False, -- Gitee From 037d7ad1b64a1073ed8293fd888cc65b790bfabe Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 4 Sep 2024 13:45:52 +0000 Subject: [PATCH 036/110] update MindIE/MindIE-Torch/built-in/T5/export_t5.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/export_t5.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5.py b/MindIE/MindIE-Torch/built-in/T5/export_t5.py index 5fa13d3c0a..b2f1b06157 100644 --- a/MindIE/MindIE-Torch/built-in/T5/export_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/export_t5.py @@ -127,7 +127,7 @@ def export_textdecoder(args, model, save_dir, batch_size): traced_model = torch.jit.load(traced_path).eval() print("compiling decoder") compiled_model = mindietorch.compile( - model, + traced_model, inputs=[mindietorch.Input(min_shape =(1, 1), max_shape = (args.max_batchsize,1), dtype=mindietorch.dtype.INT64), -- Gitee From fcb345f4f6b37639a90e20400c4e88db996787c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 5 Sep 2024 11:37:02 +0000 Subject: [PATCH 037/110] update MindIE/MindIE-Torch/built-in/T5/export_t5.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/export_t5.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5.py b/MindIE/MindIE-Torch/built-in/T5/export_t5.py index b2f1b06157..af67451d69 100644 --- a/MindIE/MindIE-Torch/built-in/T5/export_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/export_t5.py @@ -118,7 +118,7 @@ def export_textdecoder(args, model, save_dir, batch_size): torch.randn(1, 16, model.config.d_model).to(torch.float16).npu(), torch.ones(1,16).npu(), torch.randn(model.config.num_layers, 2, 1, model.config.num_heads, 1, model.config.d_kv).to(torch.float16).npu(), - torch.randn(model.config.num_layers, 2, 1, 16, model.config.d_model).to(torch.float16).npu() + torch.randn(model.config.num_layers, 2, 1, 16, model.config.d_kv*model.config.num_heads).to(torch.float16).npu() ) decoder = TextDecoderExport(text_decoder).npu() decoder.eval() @@ -143,8 +143,8 @@ def export_textdecoder(args, model, save_dir, batch_size): max_shape = (model.config.num_layers, 2, args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), dtype=mindietorch.dtype.FLOAT16), - mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, 1, model.config.d_model), - max_shape = (model.config.num_layers, 2, args.max_batchsize, args.max_input_seq_len, model.config.d_model), + mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, 1, model.config.d_kv*model.config.num_heads), + max_shape = (model.config.num_layers, 2, args.max_batchsize, args.max_input_seq_len, model.config.d_kv*model.config.num_heads), dtype=mindietorch.dtype.FLOAT16)], allow_tensor_replace_int=True, require_full_compilation=False, -- Gitee From 4c918331b1afea3b44672d60e949918c6b717608 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 5 Sep 2024 12:37:04 +0000 Subject: [PATCH 038/110] update MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_mt5.patch | 803 +++++++++--------- 1 file changed, 416 insertions(+), 387 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch index 38eb59c192..0fdb93043a 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch @@ -1,568 +1,597 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py 2024-09-03 19:47:27.392000000 +0800 -+++ modeling_mt5.py 2024-09-04 19:29:28.348000000 +0800 -@@ -324,6 +324,7 @@ class MT5Attention(nn.Module): +--- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py 2024-09-05 20:29:34.752000000 +0800 ++++ modeling_mt5.py 2024-09-05 20:33:39.712000000 +0800 +@@ -21,8 +21,6 @@ import warnings + from typing import List, Optional, Tuple, Union + + import torch +-import torch_npu +-import mindietorch + from torch import nn + from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +@@ -326,7 +324,6 @@ class MT5Attention(nn.Module): key_value_states=None, position_bias=None, past_key_value=None, -+ past_cross_key_value=None, +- past_cross_key_value=None, layer_head_mask=None, query_length=None, use_cache=False, -@@ -340,7 +341,8 @@ class MT5Attention(nn.Module): +@@ -343,8 +340,7 @@ class MT5Attention(nn.Module): real_seq_length = seq_length if past_key_value is not None: -- if len(past_key_value) != 2: -+ if past_key_value.shape[0] != 2: -+ # if len(past_key_value) != 2: +- if past_key_value.shape[0] != 2: +- # if len(past_key_value) != 2: ++ if len(past_key_value) != 2: raise ValueError( f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" ) -@@ -368,6 +370,7 @@ class MT5Attention(nn.Module): +@@ -369,10 +365,10 @@ class MT5Attention(nn.Module): + elif past_key_value is None: + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) ++ hidden_states = shape(proj_layer(key_value_states)) if past_key_value is not None: -+ past_key_value = shape(past_key_value) +- past_key_value = shape(past_key_value) if key_value_states is None: # self-attn # (batch_size, n_heads, key_length, dim_per_head) -@@ -446,12 +449,125 @@ class MT5Attention(nn.Module): +@@ -451,125 +447,12 @@ class MT5Attention(nn.Module): outputs = outputs + (attn_weights,) return outputs -+class MT5SelfAttention(MT5Attention): -+ def __init__(self, config: T5Config, has_relative_attention_bias=False): -+ super().__init__(config, has_relative_attention_bias) -+ -+ def forward( -+ self, -+ hidden_states, -+ mask=None, -+ position_bias=None, -+ past_key_value=None, -+ layer_head_mask=None, -+ use_cache=False, -+ output_attentions=False, -+ ): -+ """ -+ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). -+ """ -+ # Input is (batch_size, seq_length, dim) -+ # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) -+ # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) -+ batch_size, seq_length = hidden_states.shape[:2] -+ -+ real_seq_length = seq_length -+ -+ if past_key_value is not None: -+ if past_key_value.shape[0] != 2: -+ # if len(past_key_value) != 2: -+ raise ValueError( -+ f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" -+ ) -+ real_seq_length += past_key_value[0].shape[2] -+ key_length = real_seq_length -+ -+ def shape(states): -+ """projection""" -+ return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) -+ -+ def unshape(states): -+ """reshape""" -+ return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) -+ -+ def project(hidden_states, proj_layer, past_key_value): -+ """projects hidden states correctly to key/query states""" -+ if past_key_value is None: -+ # cross-attn -+ # (batch_size, n_heads, seq_length, dim_per_head) -+ hidden_states = shape(proj_layer(hidden_states)) -+ -+ if past_key_value is not None: -+ hidden_states = shape(proj_layer(hidden_states)) -+ hidden_states = torch.cat([past_key_value, hidden_states], dim=2) -+ return hidden_states -+ -+ # get query states -+ query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) -+ -+ # get key/value states -+ key_states = project( -+ hidden_states, self.k, past_key_value[0] if past_key_value is not None else None -+ ) -+ value_states = project( -+ hidden_states, self.v, past_key_value[1] if past_key_value is not None else None -+ ) -+ # compute scores -+ scores = torch.matmul( -+ query_states, key_states.transpose(3, 2) -+ ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 -+ # print("scores=",scores.dtype) -+ if position_bias is None: -+ if not self.has_relative_attention_bias: -+ position_bias = torch.zeros( -+ (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype -+ ) -+ if self.gradient_checkpointing and self.training: -+ position_bias.requires_grad = True -+ else: -+ position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) -+ -+ # if key and values are already calculated -+ # we want only the last query position bias -+ if past_key_value is not None: -+ position_bias = position_bias[:, :, -hidden_states.size(1) :, :] -+ -+ if mask is not None: -+ position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) -+ -+ if self.pruned_heads: -+ mask = torch.ones(position_bias.shape[1]) -+ mask[list(self.pruned_heads)] = 0 -+ position_bias_masked = position_bias[:, mask.bool()] -+ else: -+ position_bias_masked = position_bias -+ -+ scores += position_bias_masked -+ attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( -+ scores -+ ) # (batch_size, n_heads, seq_length, key_length) -+ attn_weights = nn.functional.dropout( -+ attn_weights, p=self.dropout, training=self.training -+ ) # (batch_size, n_heads, seq_length, key_length) -+ -+ # Mask heads if we want to -+ if layer_head_mask is not None: -+ attn_weights = attn_weights * layer_head_mask -+ -+ attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) -+ attn_output = self.o(attn_output) -+ -+ present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None -+ outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -+ if output_attentions: -+ outputs = outputs + (attn_weights,) -+ return outputs +-class MT5SelfAttention(MT5Attention): +- def __init__(self, config: MT5Config, has_relative_attention_bias=False): +- super().__init__(config, has_relative_attention_bias) +- +- def forward( +- self, +- hidden_states, +- mask=None, +- position_bias=None, +- past_key_value=None, +- layer_head_mask=None, +- use_cache=False, +- 
output_attentions=False, +- ): +- """ +- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). +- """ +- # Input is (batch_size, seq_length, dim) +- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) +- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) +- batch_size, seq_length = hidden_states.shape[:2] +- +- real_seq_length = seq_length +- +- if past_key_value is not None: +- if past_key_value.shape[0] != 2: +- # if len(past_key_value) != 2: +- raise ValueError( +- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" +- ) +- real_seq_length += past_key_value[0].shape[2] +- key_length = real_seq_length +- +- def shape(states): +- """projection""" +- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) +- +- def unshape(states): +- """reshape""" +- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) +- +- def project(hidden_states, proj_layer, past_key_value): +- """projects hidden states correctly to key/query states""" +- if past_key_value is None: +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(hidden_states)) +- +- if past_key_value is not None: +- hidden_states = shape(proj_layer(hidden_states)) +- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) +- return hidden_states +- +- # get query states +- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) +- +- # get key/value states +- key_states = project( +- hidden_states, self.k, past_key_value[0] if past_key_value is not None else None +- ) +- value_states = project( +- hidden_states, self.v, past_key_value[1] if past_key_value is not None else None +- ) +- # compute scores +- scores = torch.matmul( +- query_states, key_states.transpose(3, 2) +- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 +- # print("scores=",scores.dtype) +- if position_bias is None: +- if not self.has_relative_attention_bias: +- position_bias = torch.zeros( +- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype +- ) +- if self.gradient_checkpointing and self.training: +- position_bias.requires_grad = True +- else: +- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) +- +- # if key and values are already calculated +- # we want only the last query position bias +- if past_key_value is not None: +- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] +- +- if mask is not None: +- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) +- +- if self.pruned_heads: +- mask = torch.ones(position_bias.shape[1]) +- mask[list(self.pruned_heads)] = 0 +- position_bias_masked = position_bias[:, mask.bool()] +- else: +- position_bias_masked = position_bias +- +- scores += position_bias_masked +- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( +- scores +- ) # (batch_size, n_heads, seq_length, key_length) +- attn_weights = nn.functional.dropout( +- attn_weights, p=self.dropout, training=self.training +- ) # (batch_size, n_heads, seq_length, key_length) +- +- # Mask heads if we want to +- if layer_head_mask is not None: +- attn_weights = attn_weights * layer_head_mask +- +- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, 
seq_length, dim) +- attn_output = self.o(attn_output) +- +- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None +- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- if output_attentions: +- outputs = outputs + (attn_weights,) +- return outputs # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->MT5 class MT5LayerSelfAttention(nn.Module): def __init__(self, config, has_relative_attention_bias=False): super().__init__() -- self.SelfAttention = MT5Attention(config, has_relative_attention_bias=has_relative_attention_bias) -+ self.SelfAttention = MT5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) +- self.SelfAttention = MT5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) ++ self.SelfAttention = MT5Attention(config, has_relative_attention_bias=has_relative_attention_bias) self.layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) -@@ -540,6 +656,7 @@ class MT5Block(nn.Module): +@@ -658,7 +541,6 @@ class MT5Block(nn.Module): layer_head_mask=None, cross_attn_layer_head_mask=None, past_key_value=None, -+ past_cross_key_value=None, +- past_cross_key_value=None, use_cache=False, output_attentions=False, return_dict=True, -@@ -549,15 +666,15 @@ class MT5Block(nn.Module): +@@ -668,15 +550,15 @@ class MT5Block(nn.Module): logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 -- if len(past_key_value) != expected_num_past_key_values: -- raise ValueError( -- f"There should be {expected_num_past_key_values} past states. " -- f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" -- f"Got {len(past_key_value)} past key / value states" -- ) -+ # if len(past_key_value) != expected_num_past_key_values: -+ # raise ValueError( -+ # f"There should be {expected_num_past_key_values} past states. " -+ # f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" -+ # f"Got {len(past_key_value)} past key / value states" -+ # ) - -- self_attn_past_key_value = past_key_value[:2] -- cross_attn_past_key_value = past_key_value[2:] -+ self_attn_past_key_value = past_key_value -+ cross_attn_past_key_value = past_cross_key_value +- # if len(past_key_value) != expected_num_past_key_values: +- # raise ValueError( +- # f"There should be {expected_num_past_key_values} past states. " +- # f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" +- # f"Got {len(past_key_value)} past key / value states" +- # ) ++ if len(past_key_value) != expected_num_past_key_values: ++ raise ValueError( ++ f"There should be {expected_num_past_key_values} past states. " ++ f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" ++ f"Got {len(past_key_value)} past key / value states" ++ ) + +- self_attn_past_key_value = past_key_value +- cross_attn_past_key_value = past_cross_key_value ++ self_attn_past_key_value = past_key_value[:2] ++ cross_attn_past_key_value = past_key_value[2:] else: self_attn_past_key_value, cross_attn_past_key_value = None, None -@@ -614,9 +731,7 @@ class MT5Block(nn.Module): +@@ -709,7 +591,8 @@ class MT5Block(nn.Module): + query_length = present_key_value_state[0].shape[2] + else: + query_length = None +- ++ import pdb ++ pdb.set_trace() + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, +@@ -733,7 +616,9 @@ class MT5Block(nn.Module): hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) # Combine self attn and cross attn key value states -- if present_key_value_state is not None: -- present_key_value_state = present_key_value_state + cross_attention_outputs[1] -- -+ cross_attn_past_key_values = cross_attention_outputs[1] +- cross_attn_past_key_values = cross_attention_outputs[1] ++ if present_key_value_state is not None: ++ present_key_value_state = present_key_value_state + cross_attention_outputs[1] ++ # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] -@@ -635,7 +750,7 @@ class MT5Block(nn.Module): +@@ -752,7 +637,7 @@ class MT5Block(nn.Module): outputs = (hidden_states,) if use_cache: -- outputs = outputs + (present_key_value_state,) + attention_outputs -+ outputs = outputs + (present_key_value_state,) +(cross_attn_past_key_values,)+ attention_outputs +- outputs = outputs + (present_key_value_state,) +(cross_attn_past_key_values,)+ attention_outputs ++ outputs = outputs + (present_key_value_state,) + attention_outputs else: outputs = outputs + attention_outputs -@@ -884,11 +999,14 @@ class MT5PreTrainedModel(PreTrainedModel +@@ -1001,14 +886,11 @@ class MT5PreTrainedModel(PreTrainedModel # Copied from transformers.models.t5.modeling_t5.T5Stack with T5->MT5 class MT5Stack(MT5PreTrainedModel): -- def __init__(self, config, embed_tokens=None): -+ def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskeyvalue=None): +- def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskeyvalue=None): ++ def __init__(self, config, embed_tokens=None): super().__init__(config) self.embed_tokens = embed_tokens self.is_decoder = config.is_decoder -+ self.lm_head=lm_head -+ self.encodecrosskeyvalue = encodecrosskeyvalue -+ self.model_dim = config.d_model +- self.lm_head=lm_head +- self.encodecrosskeyvalue = encodecrosskeyvalue +- self.model_dim = config.d_model self.block = nn.ModuleList( [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -956,13 +1074,14 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1076,14 +958,13 @@ class MT5Stack(MT5PreTrainedModel): def forward( self, input_ids=None, -- attention_mask=None, ++ attention_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, -+ past_key_values=None, -+ past_cross_key_values=None, -+ attention_mask=None, +- past_key_values=None, +- past_cross_key_values=None, +- attention_mask=None, inputs_embeds=None, head_mask=None, cross_attn_head_mask=None, -- past_key_values=None, ++ past_key_values=None, use_cache=None, output_attentions=None, output_hidden_states=None, -@@ -1008,9 +1127,9 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1129,9 +1010,9 @@ class 
MT5Stack(MT5PreTrainedModel): raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") # initialize past_key_values with `None` if past does not exist -- if past_key_values is None: -+ if not self.is_decoder: +- if not self.is_decoder: ++ if past_key_values is None: past_key_values = [None] * len(self.block) -- -+ past_cross_key_values = [None] * len(self.block) +- past_cross_key_values = [None] * len(self.block) ++ if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) -@@ -1041,7 +1160,7 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1162,7 +1043,7 @@ class MT5Stack(MT5PreTrainedModel): # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) -- present_key_value_states = () if use_cache else None -+ present_key_value_states = [] if use_cache else None +- present_key_value_states = [] if use_cache else None ++ present_key_value_states = () if use_cache else None all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1049,8 +1168,10 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1170,10 +1051,8 @@ class MT5Stack(MT5PreTrainedModel): encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) -- -- for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): -+ for i, layer_module in enumerate(self.block): -+ # for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): -+ past_key_value = past_key_values[i] -+ past_cross_key_value = past_cross_key_values[i] +- for i, layer_module in enumerate(self.block): +- # for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): +- past_key_value = past_key_values[i] +- past_cross_key_value = past_cross_key_values[i] ++ ++ for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] # Model parallel -@@ -1100,6 +1221,7 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1223,7 +1102,6 @@ class MT5Stack(MT5PreTrainedModel): layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, past_key_value=past_key_value, -+ past_cross_key_value=past_cross_key_value, +- past_cross_key_value=past_cross_key_value, use_cache=use_cache, output_attentions=output_attentions, ) -@@ -1107,19 +1229,19 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1231,19 +1109,19 @@ class MT5Stack(MT5PreTrainedModel): # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: -- layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] -+ layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] +- layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] ++ layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] -- hidden_states, present_key_value_state = layer_outputs[:2] -+ hidden_states, present_key_value_state, present_cross_key_value_state = layer_outputs[:3] +- hidden_states, present_key_value_state, present_cross_key_value_state = layer_outputs[:3] ++ hidden_states, present_key_value_state = layer_outputs[:2] # We share the 
position biases between the layers - the first layer store them # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), # (cross-attention position bias), (cross-attention weights) -- position_bias = layer_outputs[2] -+ position_bias = layer_outputs[3] +- position_bias = layer_outputs[3] ++ position_bias = layer_outputs[2] if self.is_decoder and encoder_hidden_states is not None: -- encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] -+ encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] +- encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] ++ encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] # append next layer key value states if use_cache: -- present_key_value_states = present_key_value_states + (present_key_value_state,) -+ present_key_value_states.extend(present_key_value_state) +- present_key_value_states.extend(present_key_value_state) ++ present_key_value_states = present_key_value_states + (present_key_value_state,) if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) -@@ -1133,7 +1255,7 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1257,12 +1135,12 @@ class MT5Stack(MT5PreTrainedModel): hidden_states = hidden_states.to("cuda:" + str(k + 1)) hidden_states = self.final_layer_norm(hidden_states) -- hidden_states = self.dropout(hidden_states) -+ hidden_states = self.dropout(hidden_states).half() +- hidden_states = self.dropout(hidden_states).half() ++ hidden_states = self.dropout(hidden_states) # Add last layer if output_hidden_states: -@@ -1151,13 +1273,17 @@ class MT5Stack(MT5PreTrainedModel): + all_hidden_states = all_hidden_states + (hidden_states,) +- ++ print("return_dict=",return_dict) + if not return_dict: + return tuple( + v +@@ -1275,17 +1153,13 @@ class MT5Stack(MT5PreTrainedModel): ] if v is not None ) -- return BaseModelOutputWithPastAndCrossAttentions( -- last_hidden_state=hidden_states, -- past_key_values=present_key_value_states, -- hidden_states=all_hidden_states, -- attentions=all_attentions, -- cross_attentions=all_cross_attentions, -- ) -+ present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None -+ if not self.is_decoder and self.encodecrosskeyvalue: -+ res = self.encodecrosskeyvalue(hidden_states) -+ return tuple((hidden_states, res)) -+ lm_logits = None -+ if self.is_decoder: -+ #logits = None -+ if self.config.tie_word_embeddings: -+ hidden_states = hidden_states * (self.model_dim ** -0.5) -+ lm_logits = self.lm_head(hidden_states) -+ return tuple((lm_logits, present_key_value_states)) +- present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None +- if not self.is_decoder and self.encodecrosskeyvalue: +- res = self.encodecrosskeyvalue(hidden_states) +- return tuple((hidden_states, res)) +- lm_logits = None +- if self.is_decoder: +- #logits = None +- if self.config.tie_word_embeddings: +- hidden_states = hidden_states * (self.model_dim ** -0.5) +- lm_logits = self.lm_head(hidden_states) +- return tuple((lm_logits, present_key_value_states)) ++ return BaseModelOutputWithPastAndCrossAttentions( ++ last_hidden_state=hidden_states, ++ past_key_values=present_key_value_states, ++ hidden_states=all_hidden_states, ++ attentions=all_attentions, ++ cross_attentions=all_cross_attentions, ++ ) 
MT5_START_DOCSTRING = r""" -@@ -1549,6 +1675,29 @@ class MT5Model(MT5PreTrainedModel): +@@ -1677,29 +1551,6 @@ class MT5Model(MT5PreTrainedModel): ) -+class EncoderToCrossKeyValue(nn.Module): -+ def __init__(self, cross_key, cross_value, num_heads, d_kv): -+ super().__init__() -+ self.cross_key = cross_key -+ self.cross_value = cross_value -+ self.num_heads = num_heads -+ self.d_kv = d_kv -+ -+ -+ def forward(self, hidden_states): -+ batch_size = hidden_states.shape[0] -+ encoder_hidden_states_kvs = [] -+ # for i in range(len(self.cross_value)): -+ # encoder_hidden_states_kvs.append( -+ # torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), -+ # self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) -+ for i in range(len(self.cross_value)): -+ encoder_hidden_states_kvs.append( -+ torch.stack((self.cross_key[i](hidden_states), -+ self.cross_value[i](hidden_states)), dim=0)) -+ past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0) -+ return past_cross_key_values -+ +-class EncoderToCrossKeyValue(nn.Module): +- def __init__(self, cross_key, cross_value, num_heads, d_kv): +- super().__init__() +- self.cross_key = cross_key +- self.cross_value = cross_value +- self.num_heads = num_heads +- self.d_kv = d_kv +- +- +- def forward(self, hidden_states): +- batch_size = hidden_states.shape[0] +- encoder_hidden_states_kvs = [] +- # for i in range(len(self.cross_value)): +- # encoder_hidden_states_kvs.append( +- # torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), +- # self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) +- for i in range(len(self.cross_value)): +- encoder_hidden_states_kvs.append( +- torch.stack((self.cross_key[i](hidden_states), +- self.cross_value[i](hidden_states)), dim=0)) +- past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0) +- return past_cross_key_values +- @add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING) class MT5ForConditionalGeneration(MT5PreTrainedModel): r""" -@@ -1573,28 +1722,45 @@ class MT5ForConditionalGeneration(MT5Pre +@@ -1724,45 +1575,28 @@ class MT5ForConditionalGeneration(MT5Pre _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5 -- def __init__(self, config: MT5Config): -+ def __init__(self, config: MT5Config, encoder_path=None, decoder_path=None, device_id=0): +- def __init__(self, config: MT5Config, encoder_path=None, decoder_path=None, device_id=0): ++ def __init__(self, config: MT5Config): super().__init__(config) -- self.model_dim = config.d_model -- -- self.shared = nn.Embedding(config.vocab_size, config.d_model) -- -- encoder_config = copy.deepcopy(config) -- encoder_config.is_decoder = False -- encoder_config.use_cache = False -- encoder_config.is_encoder_decoder = False -- self.encoder = MT5Stack(encoder_config, self.shared) +- +- self.encoder_path = encoder_path +- self.decoder_path = decoder_path +- if not self.encoder_path or not self.decoder_path: +- self.model_dim = config.d_model +- self.shared = nn.Embedding(config.vocab_size, config.d_model) +- decoder_config = copy.deepcopy(config) +- decoder_config.is_decoder = True +- decoder_config.is_encoder_decoder = False +- decoder_config.num_layers = 
config.num_decoder_layers +- - -- decoder_config = copy.deepcopy(config) -- decoder_config.is_decoder = True -- decoder_config.is_encoder_decoder = False -- decoder_config.num_layers = config.num_decoder_layers -- self.decoder = MT5Stack(decoder_config, self.shared) +- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) +- self.decoder = MT5Stack(decoder_config, self.shared, self.lm_head) +- cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) +- cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) +- encodecrosskeyvalue = EncoderToCrossKeyValue(cross_key, cross_value, config.num_heads, config.d_kv) +- encoder_config = copy.deepcopy(config) +- encoder_config.is_decoder = False +- encoder_config.use_cache = False +- encoder_config.is_encoder_decoder = False +- self.encoder = MT5Stack(encoder_config, self.shared, encodecrosskeyvalue=encodecrosskeyvalue) +- self.encoder_mindie = None +- self.decoder_mindie = None +- if self.encoder_path: +- self.encoder_mindie = torch.jit.load(self.encoder_path) +- if self.decoder_path: +- self.decoder_mindie = torch.jit.load(self.decoder_path) +- self.stream = torch.npu.Stream(f"npu:{device_id}") +- self.device_id = device_id - -- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) -+ -+ self.encoder_path = encoder_path -+ self.decoder_path = decoder_path -+ if not self.encoder_path or not self.decoder_path: -+ self.model_dim = config.d_model -+ self.shared = nn.Embedding(config.vocab_size, config.d_model) -+ decoder_config = copy.deepcopy(config) -+ decoder_config.is_decoder = True -+ decoder_config.is_encoder_decoder = False -+ decoder_config.num_layers = config.num_decoder_layers -+ +- +- def get_device(self): +- return f"npu:{self.device_id}" ++ self.model_dim = config.d_model ++ ++ self.shared = nn.Embedding(config.vocab_size, config.d_model) ++ ++ encoder_config = copy.deepcopy(config) ++ encoder_config.is_decoder = False ++ encoder_config.use_cache = False ++ encoder_config.is_encoder_decoder = False ++ self.encoder = MT5Stack(encoder_config, self.shared) + -+ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) -+ self.decoder = MT5Stack(decoder_config, self.shared, , self.lm_head) -+ cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) -+ cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) -+ encodecrosskeyvalue = EncoderToCrossKeyValue(cross_key, cross_value, config.num_heads, config.d_kv) -+ encoder_config = copy.deepcopy(config) -+ encoder_config.is_decoder = False -+ encoder_config.use_cache = False -+ encoder_config.is_encoder_decoder = False -+ self.encoder = T5Stack(encoder_config, self.shared, encodecrosskeyvalue=encodecrosskeyvalue) -+ self.encoder_mindie = None -+ self.decoder_mindie = None -+ if self.encoder_path: -+ self.encoder_mindie = torch.jit.load(self.encoder_path) -+ if self.decoder_path: -+ self.decoder_mindie = torch.jit.load(self.decoder_path) -+ self.stream = torch.npu.Stream(f"npu:{device_id}") -+ self.device_id = device_id ++ decoder_config = copy.deepcopy(config) ++ decoder_config.is_decoder = True ++ decoder_config.is_encoder_decoder = False ++ decoder_config.num_layers = config.num_decoder_layers ++ self.decoder = MT5Stack(decoder_config, self.shared) + -+ -+ def get_device(self): -+ return 
f"npu:{self.device_id}" ++ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) # Initialize weights and apply final processing -- self.post_init() -+ # self.post_init() +- # self.post_init() ++ self.post_init() # Model parallel self.model_parallel = False -@@ -1677,6 +1843,7 @@ class MT5ForConditionalGeneration(MT5Pre +@@ -1845,7 +1679,6 @@ class MT5ForConditionalGeneration(MT5Pre cross_attn_head_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, -+ past_cross_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- past_cross_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, decoder_inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, -@@ -1724,76 +1891,23 @@ class MT5ForConditionalGeneration(MT5Pre +@@ -1893,23 +1726,76 @@ class MT5ForConditionalGeneration(MT5Pre if self.config.num_layers == self.config.num_decoder_layers: warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) decoder_head_mask = head_mask -+ -+ hidden_states = encoder_outputs["last_hidden_state"] -+ past_cross_key_values = encoder_outputs["past_cross_key_values"] - -- # Encode if needed (training, first prediction pass) -- if encoder_outputs is None: -- # Convert encoder inputs in embeddings if needed -- encoder_outputs = self.encoder( -- input_ids=input_ids, -- attention_mask=attention_mask, -- inputs_embeds=inputs_embeds, -- head_mask=head_mask, -- output_attentions=output_attentions, -- output_hidden_states=output_hidden_states, -- return_dict=return_dict, -- ) -- elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): -- encoder_outputs = BaseModelOutput( -- last_hidden_state=encoder_outputs[0], -- hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, -- attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, -- ) -- -- hidden_states = encoder_outputs[0] -- -- if self.model_parallel: -- torch.cuda.set_device(self.decoder.first_device) -+ # if self.model_parallel: -+ # torch.cuda.set_device(self.decoder.first_device) +- +- hidden_states = encoder_outputs["last_hidden_state"] +- past_cross_key_values = encoder_outputs["past_cross_key_values"] + +- # if self.model_parallel: +- # torch.cuda.set_device(self.decoder.first_device) ++ # Encode if needed (training, first prediction pass) ++ if encoder_outputs is None: ++ # Convert encoder inputs in embeddings if needed ++ encoder_outputs = self.encoder( ++ input_ids=input_ids, ++ attention_mask=attention_mask, ++ inputs_embeds=inputs_embeds, ++ head_mask=head_mask, ++ output_attentions=output_attentions, ++ output_hidden_states=output_hidden_states, ++ return_dict=return_dict, ++ ) ++ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): ++ encoder_outputs = BaseModelOutput( ++ last_hidden_state=encoder_outputs[0], ++ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, ++ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, ++ ) ++ ++ hidden_states = encoder_outputs[0] ++ ++ if self.model_parallel: ++ torch.cuda.set_device(self.decoder.first_device) if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: # get decoder inputs from shifting lm labels to the right decoder_input_ids = self._shift_right(labels) -- # Set device for model parallelism -- if self.model_parallel: -- torch.cuda.set_device(self.decoder.first_device) 
-- hidden_states = hidden_states.to(self.decoder.first_device) -- if decoder_input_ids is not None: -- decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) -- if attention_mask is not None: -- attention_mask = attention_mask.to(self.decoder.first_device) -- if decoder_attention_mask is not None: -- decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) -- -- # Decode -- decoder_outputs = self.decoder( -- input_ids=decoder_input_ids, -- attention_mask=decoder_attention_mask, -- inputs_embeds=decoder_inputs_embeds, -- past_key_values=past_key_values, -- encoder_hidden_states=hidden_states, -- encoder_attention_mask=attention_mask, -- head_mask=decoder_head_mask, -- cross_attn_head_mask=cross_attn_head_mask, -- use_cache=use_cache, -- output_attentions=output_attentions, -- output_hidden_states=output_hidden_states, -- return_dict=return_dict, -- ) -- -- sequence_output = decoder_outputs[0] -- -- # Set device for model parallelism -- if self.model_parallel: -- torch.cuda.set_device(self.encoder.first_device) -- self.lm_head = self.lm_head.to(self.encoder.first_device) -- sequence_output = sequence_output.to(self.lm_head.weight.device) -- -- if self.config.tie_word_embeddings: -- # Rescale output before projecting on vocab -- # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 -- sequence_output = sequence_output * (self.model_dim**-0.5) -- -- lm_logits = self.lm_head(sequence_output) -+ with torch.npu.stream(self.stream): # set stream -+ # import pdb -+ # pdb.set_trace() -+ decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) -+ self.stream.synchronize() # synchronize -+ # decoder_outputs = self.decoder(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) +- with torch.npu.stream(self.stream): # set stream +- # import pdb +- # pdb.set_trace() +- decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) +- self.stream.synchronize() # synchronize +- # decoder_outputs = self.decoder(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) ++ # Set device for model parallelism ++ if self.model_parallel: ++ torch.cuda.set_device(self.decoder.first_device) ++ hidden_states = hidden_states.to(self.decoder.first_device) ++ if decoder_input_ids is not None: ++ decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) ++ if attention_mask is not None: ++ attention_mask = attention_mask.to(self.decoder.first_device) ++ if decoder_attention_mask is not None: ++ decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) ++ ++ # Decode ++ decoder_outputs = self.decoder( ++ input_ids=decoder_input_ids, ++ attention_mask=decoder_attention_mask, ++ inputs_embeds=decoder_inputs_embeds, ++ past_key_values=past_key_values, ++ encoder_hidden_states=hidden_states, ++ encoder_attention_mask=attention_mask, ++ head_mask=decoder_head_mask, ++ cross_attn_head_mask=cross_attn_head_mask, ++ use_cache=use_cache, ++ output_attentions=output_attentions, ++ output_hidden_states=output_hidden_states, ++ return_dict=return_dict, ++ ) ++ ++ sequence_output = decoder_outputs[0] ++ ++ # Set device for model parallelism ++ if self.model_parallel: ++ torch.cuda.set_device(self.encoder.first_device) ++ self.lm_head = 
self.lm_head.to(self.encoder.first_device) ++ sequence_output = sequence_output.to(self.lm_head.weight.device) ++ ++ if self.config.tie_word_embeddings: ++ # Rescale output before projecting on vocab ++ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 ++ sequence_output = sequence_output * (self.model_dim**-0.5) ++ ++ lm_logits = self.lm_head(sequence_output) loss = None if labels is not None: -@@ -1806,17 +1920,10 @@ class MT5ForConditionalGeneration(MT5Pre +@@ -1922,10 +1808,17 @@ class MT5ForConditionalGeneration(MT5Pre if not return_dict: output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs return ((loss,) + output) if loss is not None else output -- ++ return Seq2SeqLMOutput( loss=loss, -- logits=lm_logits, -- past_key_values=decoder_outputs.past_key_values, -- decoder_hidden_states=decoder_outputs.hidden_states, -- decoder_attentions=decoder_outputs.attentions, -- cross_attentions=decoder_outputs.cross_attentions, -- encoder_last_hidden_state=encoder_outputs.last_hidden_state, -- encoder_hidden_states=encoder_outputs.hidden_states, -- encoder_attentions=encoder_outputs.attentions, -+ logits=decoder_outputs[0], -+ past_key_values=decoder_outputs[1] +- logits=decoder_outputs[0], +- past_key_values=decoder_outputs[1] ++ logits=lm_logits, ++ past_key_values=decoder_outputs.past_key_values, ++ decoder_hidden_states=decoder_outputs.hidden_states, ++ decoder_attentions=decoder_outputs.attentions, ++ cross_attentions=decoder_outputs.cross_attentions, ++ encoder_last_hidden_state=encoder_outputs.last_hidden_state, ++ encoder_hidden_states=encoder_outputs.hidden_states, ++ encoder_attentions=encoder_outputs.attentions, ) # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation -@@ -1824,6 +1931,7 @@ class MT5ForConditionalGeneration(MT5Pre +@@ -1933,7 +1826,6 @@ class MT5ForConditionalGeneration(MT5Pre self, input_ids, past_key_values=None, -+ past_cross_key_values=None, +- past_cross_key_values=None, attention_mask=None, head_mask=None, decoder_head_mask=None, -@@ -1849,6 +1957,7 @@ class MT5ForConditionalGeneration(MT5Pre +@@ -1959,7 +1851,6 @@ class MT5ForConditionalGeneration(MT5Pre return { "decoder_input_ids": input_ids, "past_key_values": past_key_values, -+ "past_cross_key_values": past_cross_key_values, +- "past_cross_key_values": past_cross_key_values, "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "head_mask": head_mask, -- Gitee From 64c73cd0ae3aee00ed990b1c586af60923e4a885 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 5 Sep 2024 12:37:56 +0000 Subject: [PATCH 039/110] =?UTF-8?q?MT5=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/export_mt5.py | 181 ++++++++++++++++++ MindIE/MindIE-Torch/built-in/T5/test_mt5.py | 54 ++++++ 2 files changed, 235 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/export_mt5.py create mode 100644 MindIE/MindIE-Torch/built-in/T5/test_mt5.py diff --git a/MindIE/MindIE-Torch/built-in/T5/export_mt5.py b/MindIE/MindIE-Torch/built-in/T5/export_mt5.py new file mode 100644 index 0000000000..dc8308e362 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/export_mt5.py @@ -0,0 +1,181 @@ + +import torch +import torch_npu +import argparse +import os +import mindietorch +from transformers import MT5ForConditionalGeneration + +def 
parse_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="./models",
+        help="save dir"
+    )
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default="./T5-Small",
+        help="MT5 model path"
+    )
+    parser.add_argument(
+        "--max_batchsize",
+        type=int,
+        default=1,
+        help="max batchsize when running"
+    )
+
+    parser.add_argument(
+        "--max_input_seq_len",
+        type=int,
+        default=256,
+        help="max input_sequence length when running"
+    )
+
+
+    parser.add_argument(
+        "--device_id",
+        type=int,
+        default=0,
+        help="npu device id"
+    )
+    return parser.parse_args()
+
+
+class TextEncoderExport(torch.nn.Module):
+    def __init__(self, textencoder_model):
+        super(TextEncoderExport, self).__init__()
+        self.textencoder_model = textencoder_model
+
+    def forward(self, input_ids):
+        return self.textencoder_model(input_ids=input_ids)
+
+class TextDecoderExport(torch.nn.Module):
+    def __init__(self, textdecoder_model):
+        super(TextDecoderExport, self).__init__()
+        self.textdecoder_model = textdecoder_model
+
+    def forward(self,
+                input_ids,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                past_key_values,
+                past_cross_key_values):
+        return self.textdecoder_model(input_ids=input_ids,
+                                      encoder_hidden_states=encoder_hidden_states,
+                                      encoder_attention_mask=encoder_attention_mask,
+                                      past_key_values=past_key_values,
+                                      past_cross_key_values=past_cross_key_values,
+                                      return_dict=True)
+
+def export_textencoder(args, model, save_dir, batch_size):
+    encoder_path = os.path.join(save_dir, "encoder")
+    if not os.path.exists(encoder_path):
+        os.makedirs(encoder_path, mode=0o640)
+    traced_path = os.path.join(encoder_path, "encoder.pt")
+    compiled_path = os.path.join(encoder_path, "encoder_compiled.pt")
+    if not os.path.exists(traced_path):
+        text_encoder = model.encoder
+        dummy_input = (
+            torch.ones([1, 128], dtype=torch.int64).npu()
+        )
+        encoder = TextEncoderExport(text_encoder)
+        encoder.eval()
+        torch.jit.trace(encoder, dummy_input, strict=False).save(traced_path)
+    if not os.path.exists(compiled_path):
+        traced_model = torch.jit.load(traced_path).eval()
+
+        inputs0 = []
+        # inputs1 = []
+        inputs0.append(mindietorch.Input(min_shape = (1,1), max_shape= (args.max_batchsize, args.max_input_seq_len), dtype=torch.int64))
+        print("compiling encoder")
+        compiled_model = mindietorch.compile(
+            traced_model,
+            inputs=inputs0,
+            allow_tensor_replace_int=True,
+            require_full_compilation=False,
+            truncate_long_and_double=True,
+            precision_policy=mindietorch.PrecisionPolicy.FP16,
+            soc_version="Ascend910B4",
+            optimization_level=0
+        )
+        compiled_model.save(compiled_path)
+
+def export_textdecoder(args, model, save_dir, batch_size):
+    decoder_path = os.path.join(save_dir, "decoder")
+    if not os.path.exists(decoder_path):
+        os.makedirs(decoder_path, mode=0o640)
+    traced_path = os.path.join(decoder_path, "decoder.pt")
+    compiled_path = os.path.join(decoder_path, "decoder_compiled.pt")
+    model_path = args.model_path
+    max_length = 120
+    if not os.path.exists(traced_path):
+        text_decoder = model.decoder
+        dummy_input = (
+            torch.ones([1, 1], dtype=torch.int64).npu(),
+            torch.randn(1, 16, model.config.d_model).to(torch.float16).npu(),
+            torch.ones(1,16).npu(),
+            torch.randn(model.config.num_layers, 2, 1, model.config.num_heads, 1, model.config.d_kv).to(torch.float16).npu(),
+            torch.randn(model.config.num_layers, 2, 1, 16, model.config.d_kv*model.config.num_heads).to(torch.float16).npu()
+        )
+        decoder = TextDecoderExport(text_decoder).npu()
+        decoder.eval()
+        
torch.jit.trace(decoder, dummy_input,strict=False).save(traced_path) + if not os.path.exists(compiled_path): + traced_model = torch.jit.load(traced_path).eval() + print("compiling decoder") + compiled_model = mindietorch.compile( + traced_model, + inputs=[mindietorch.Input(min_shape =(1, 1), + max_shape = (args.max_batchsize,1), + dtype=mindietorch.dtype.INT64), + + mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), + dtype=mindietorch.dtype.FLOAT16), + + mindietorch.Input(min_shape = (1,1), + max_shape =(args.max_batchsize,args.max_input_seq_len), + dtype=mindietorch.dtype.INT64), + mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, model.config.num_heads, 0, model.config.d_kv), + max_shape = (model.config.num_layers, 2, args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), + dtype=mindietorch.dtype.FLOAT16), + + mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, 1, model.config.d_kv*model.config.num_heads), + max_shape = (model.config.num_layers, 2, args.max_batchsize, args.max_input_seq_len,model.config.d_kv*model.config.num_heads), + dtype=mindietorch.dtype.FLOAT16)], + allow_tensor_replace_int=True, + require_full_compilation=False, + truncate_long_and_double=True, + precision_policy=mindietorch.PrecisionPolicy.FP16, + soc_version="Ascend910B4", + optimization_level=0 + ) + compiled_model.save(compiled_path) + +def main(): + args = parse_arguments() + device_id = args.device_id + save_dir = args.output_dir + torch.npu.set_device(device_id) + batch_size = 1 + model = MT5ForConditionalGeneration.from_pretrained(args.model_path, torch_dtype=torch.float).npu() + encoder_path = os.path.join(save_dir, "encoder") + compiled_path = os.path.join(encoder_path, "encoder_compiled.pt") + if not os.path.exists(compiled_path): + export_textencoder(args, model, save_dir, batch_size) + print("export encoder_model done!") + + decoder_path = os.path.join(save_dir, "decoder") + compiled_path = os.path.join(decoder_path, "decoder_compiled.pt") + if not os.path.exists(compiled_path): + export_textdecoder(args, model, save_dir, batch_size) + print("export decoder_model done!") + + + + +if __name__ == "__main__": + main() diff --git a/MindIE/MindIE-Torch/built-in/T5/test_mt5.py b/MindIE/MindIE-Torch/built-in/T5/test_mt5.py new file mode 100644 index 0000000000..af441392d4 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/test_mt5.py @@ -0,0 +1,54 @@ +import torch +import time +import argparse +import torch_npu +from transformers import MT5ForConditionalGeneration, AutoTokenizer, MT5Config + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--hf_model_path", type=str, required=True) + + parser.add_argument("--encoder_aie_path", type=str, required=True) + parser.add_argument("--decoder_aie_path", type=str, required=True) + + parser.add_argument("--device_id", type=int, help="NPU device id", default=0) + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + torch.npu.set_device(args.device_id) + tokenizer = AutoTokenizer.from_pretrained(args.hf_model_path) + text = [ + "translate English to German: The house is wonderful.", + "summarize: I am a high-performance inference optimizer and runtime.", + "During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world", + ] + model = MT5ForConditionalGeneration.from_pretrained(args.hf_model_path, 
torch_dtype=torch.float16).npu() + encoder = model.encoder + decoder = model.decoder + encoder_input = torch.randint(0,2000,(8,10), dtype=torch.int64).npu() + t5_config = MT5Config.from_pretrained(args.hf_model_path) + + encoder_output = encoder(encoder_input)[0] + model = MT5ForConditionalGeneration(config=t5_config, + encoder_path=args.encoder_aie_path, + decoder_path=args.decoder_aie_path, + device_id=args.device_id).half().npu() + + encoder_mindie = model.encoder_mindie + decoder_mindie = model.decoder_mindie + mindie_stream = model.stream + with torch.npu.stream(mindie_stream): # set stream + mindie_encoder_output = encoder_mindie(encoder_input)[0] + mindie_stream.synchronize() # synchronize + if (torch.cosine_similarity(encoder_output.cpu().flatten(), mindie_encoder_output.cpu().flatten(),dim=0)) < 0.99: + print("encoder precision failed") + else: + print("test OK") + + +if __name__ == "__main__": + main() + -- Gitee From f32b967a55b14959ee64fa8c2614ea94b7be4df2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:48:57 +0000 Subject: [PATCH 040/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/T5=5Fmodeling=5Foutputs=5Fpatch?= =?UTF-8?q?.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/T5/T5_modeling_outputs_patch.py | 28 ------------------- 1 file changed, 28 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py deleted file mode 100644 index 21cd251b95..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_outputs_patch.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import transformers - - -def main(): - transformers_path = transformers.__path__ - transformers_version = transformers.__version__ - - assert transformers_version == '4.42.0', "expectation transformers==4.42.0" - os.system(f'patch -p0 {transformers_path[0]}/modeling_outputs.py modeling_outputs.patch') - - -if __name__ == '__main__': - main() -- Gitee From d569cb70edacafd1ca0bddbeefdce29fc3877d45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:49:16 +0000 Subject: [PATCH 041/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/modeling=5Fmt5.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/T5/modeling_mt5.patch | 597 ------------------ 1 file changed, 597 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch deleted file mode 100644 index 0fdb93043a..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_mt5.patch +++ /dev/null @@ -1,597 +0,0 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py 2024-09-05 20:29:34.752000000 +0800 -+++ modeling_mt5.py 2024-09-05 20:33:39.712000000 +0800 -@@ -21,8 +21,6 @@ import warnings - from typing import List, Optional, Tuple, Union - - import torch --import torch_npu --import mindietorch - from torch import nn - from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -@@ -326,7 +324,6 @@ class MT5Attention(nn.Module): - key_value_states=None, - position_bias=None, - past_key_value=None, -- past_cross_key_value=None, - layer_head_mask=None, - query_length=None, - use_cache=False, -@@ -343,8 +340,7 @@ class MT5Attention(nn.Module): - real_seq_length = seq_length - - if past_key_value is not None: -- if past_key_value.shape[0] != 2: -- # if len(past_key_value) != 2: -+ if len(past_key_value) != 2: - raise ValueError( - f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" - ) -@@ -369,10 +365,10 @@ class MT5Attention(nn.Module): - elif past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) -+ - hidden_states = shape(proj_layer(key_value_states)) - - if past_key_value is not None: -- past_key_value = shape(past_key_value) - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) -@@ -451,125 +447,12 @@ class MT5Attention(nn.Module): - outputs = outputs + (attn_weights,) - return outputs - --class MT5SelfAttention(MT5Attention): -- def __init__(self, config: MT5Config, has_relative_attention_bias=False): -- super().__init__(config, has_relative_attention_bias) -- -- def forward( -- self, -- hidden_states, -- mask=None, -- position_bias=None, -- past_key_value=None, -- layer_head_mask=None, -- use_cache=False, -- output_attentions=False, -- ): -- """ -- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). 
-- """ -- # Input is (batch_size, seq_length, dim) -- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) -- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) -- batch_size, seq_length = hidden_states.shape[:2] -- -- real_seq_length = seq_length -- -- if past_key_value is not None: -- if past_key_value.shape[0] != 2: -- # if len(past_key_value) != 2: -- raise ValueError( -- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" -- ) -- real_seq_length += past_key_value[0].shape[2] -- key_length = real_seq_length -- -- def shape(states): -- """projection""" -- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) -- -- def unshape(states): -- """reshape""" -- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) -- -- def project(hidden_states, proj_layer, past_key_value): -- """projects hidden states correctly to key/query states""" -- if past_key_value is None: -- # cross-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(hidden_states)) -- -- if past_key_value is not None: -- hidden_states = shape(proj_layer(hidden_states)) -- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) -- return hidden_states -- -- # get query states -- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) -- -- # get key/value states -- key_states = project( -- hidden_states, self.k, past_key_value[0] if past_key_value is not None else None -- ) -- value_states = project( -- hidden_states, self.v, past_key_value[1] if past_key_value is not None else None -- ) -- # compute scores -- scores = torch.matmul( -- query_states, key_states.transpose(3, 2) -- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 -- # print("scores=",scores.dtype) -- if position_bias is None: -- if not self.has_relative_attention_bias: -- position_bias = torch.zeros( -- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype -- ) -- if self.gradient_checkpointing and self.training: -- position_bias.requires_grad = True -- else: -- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) -- -- # if key and values are already calculated -- # we want only the last query position bias -- if past_key_value is not None: -- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] -- -- if mask is not None: -- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) -- -- if self.pruned_heads: -- mask = torch.ones(position_bias.shape[1]) -- mask[list(self.pruned_heads)] = 0 -- position_bias_masked = position_bias[:, mask.bool()] -- else: -- position_bias_masked = position_bias -- -- scores += position_bias_masked -- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( -- scores -- ) # (batch_size, n_heads, seq_length, key_length) -- attn_weights = nn.functional.dropout( -- attn_weights, p=self.dropout, training=self.training -- ) # (batch_size, n_heads, seq_length, key_length) -- -- # Mask heads if we want to -- if layer_head_mask is not None: -- attn_weights = attn_weights * layer_head_mask -- -- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) -- attn_output = self.o(attn_output) -- -- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None 
-- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- if output_attentions: -- outputs = outputs + (attn_weights,) -- return outputs - - # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->MT5 - class MT5LayerSelfAttention(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() -- self.SelfAttention = MT5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) -+ self.SelfAttention = MT5Attention(config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - -@@ -658,7 +541,6 @@ class MT5Block(nn.Module): - layer_head_mask=None, - cross_attn_layer_head_mask=None, - past_key_value=None, -- past_cross_key_value=None, - use_cache=False, - output_attentions=False, - return_dict=True, -@@ -668,15 +550,15 @@ class MT5Block(nn.Module): - logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - -- # if len(past_key_value) != expected_num_past_key_values: -- # raise ValueError( -- # f"There should be {expected_num_past_key_values} past states. " -- # f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" -- # f"Got {len(past_key_value)} past key / value states" -- # ) -+ if len(past_key_value) != expected_num_past_key_values: -+ raise ValueError( -+ f"There should be {expected_num_past_key_values} past states. " -+ f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" -+ f"Got {len(past_key_value)} past key / value states" -+ ) - -- self_attn_past_key_value = past_key_value -- cross_attn_past_key_value = past_cross_key_value -+ self_attn_past_key_value = past_key_value[:2] -+ cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - -@@ -709,7 +591,8 @@ class MT5Block(nn.Module): - query_length = present_key_value_state[0].shape[2] - else: - query_length = None -- -+ import pdb -+ pdb.set_trace() - cross_attention_outputs = self.layer[1]( - hidden_states, - key_value_states=encoder_hidden_states, -@@ -733,7 +616,9 @@ class MT5Block(nn.Module): - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - # Combine self attn and cross attn key value states -- cross_attn_past_key_values = cross_attention_outputs[1] -+ if present_key_value_state is not None: -+ present_key_value_state = present_key_value_state + cross_attention_outputs[1] -+ - # Keep cross-attention outputs and relative position weights - attention_outputs = attention_outputs + cross_attention_outputs[2:] - -@@ -752,7 +637,7 @@ class MT5Block(nn.Module): - outputs = (hidden_states,) - - if use_cache: -- outputs = outputs + (present_key_value_state,) +(cross_attn_past_key_values,)+ attention_outputs -+ outputs = outputs + (present_key_value_state,) + attention_outputs - else: - outputs = outputs + attention_outputs - -@@ -1001,14 +886,11 @@ class MT5PreTrainedModel(PreTrainedModel - - # Copied from transformers.models.t5.modeling_t5.T5Stack with T5->MT5 - class MT5Stack(MT5PreTrainedModel): -- def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskeyvalue=None): -+ def __init__(self, config, embed_tokens=None): - super().__init__(config) - - self.embed_tokens = embed_tokens - self.is_decoder = 
config.is_decoder -- self.lm_head=lm_head -- self.encodecrosskeyvalue = encodecrosskeyvalue -- self.model_dim = config.d_model - - self.block = nn.ModuleList( - [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -1076,14 +958,13 @@ class MT5Stack(MT5PreTrainedModel): - def forward( - self, - input_ids=None, -+ attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, -- past_key_values=None, -- past_cross_key_values=None, -- attention_mask=None, - inputs_embeds=None, - head_mask=None, - cross_attn_head_mask=None, -+ past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, -@@ -1129,9 +1010,9 @@ class MT5Stack(MT5PreTrainedModel): - raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") - - # initialize past_key_values with `None` if past does not exist -- if not self.is_decoder: -+ if past_key_values is None: - past_key_values = [None] * len(self.block) -- past_cross_key_values = [None] * len(self.block) -+ - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - -@@ -1162,7 +1043,7 @@ class MT5Stack(MT5PreTrainedModel): - # Prepare head mask if needed - head_mask = self.get_head_mask(head_mask, self.config.num_layers) - cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) -- present_key_value_states = [] if use_cache else None -+ present_key_value_states = () if use_cache else None - all_hidden_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1170,10 +1051,8 @@ class MT5Stack(MT5PreTrainedModel): - encoder_decoder_position_bias = None - - hidden_states = self.dropout(inputs_embeds) -- for i, layer_module in enumerate(self.block): -- # for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): -- past_key_value = past_key_values[i] -- past_cross_key_value = past_cross_key_values[i] -+ -+ for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): - layer_head_mask = head_mask[i] - cross_attn_layer_head_mask = cross_attn_head_mask[i] - # Model parallel -@@ -1223,7 +1102,6 @@ class MT5Stack(MT5PreTrainedModel): - layer_head_mask=layer_head_mask, - cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key_value=past_key_value, -- past_cross_key_value=past_cross_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) -@@ -1231,19 +1109,19 @@ class MT5Stack(MT5PreTrainedModel): - # layer_outputs is a tuple with: - # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - if use_cache is False: -- layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] -+ layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] - -- hidden_states, present_key_value_state, present_cross_key_value_state = layer_outputs[:3] -+ hidden_states, present_key_value_state = layer_outputs[:2] - - # We share the position biases between the layers - the first layer store them - # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), - # (cross-attention position bias), (cross-attention weights) -- position_bias = layer_outputs[3] -+ position_bias = layer_outputs[2] - if self.is_decoder and 
encoder_hidden_states is not None: -- encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] -+ encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] - # append next layer key value states - if use_cache: -- present_key_value_states.extend(present_key_value_state) -+ present_key_value_states = present_key_value_states + (present_key_value_state,) - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[3],) -@@ -1257,12 +1135,12 @@ class MT5Stack(MT5PreTrainedModel): - hidden_states = hidden_states.to("cuda:" + str(k + 1)) - - hidden_states = self.final_layer_norm(hidden_states) -- hidden_states = self.dropout(hidden_states).half() -+ hidden_states = self.dropout(hidden_states) - - # Add last layer - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) -- -+ print("return_dict=",return_dict) - if not return_dict: - return tuple( - v -@@ -1275,17 +1153,13 @@ class MT5Stack(MT5PreTrainedModel): - ] - if v is not None - ) -- present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None -- if not self.is_decoder and self.encodecrosskeyvalue: -- res = self.encodecrosskeyvalue(hidden_states) -- return tuple((hidden_states, res)) -- lm_logits = None -- if self.is_decoder: -- #logits = None -- if self.config.tie_word_embeddings: -- hidden_states = hidden_states * (self.model_dim ** -0.5) -- lm_logits = self.lm_head(hidden_states) -- return tuple((lm_logits, present_key_value_states)) -+ return BaseModelOutputWithPastAndCrossAttentions( -+ last_hidden_state=hidden_states, -+ past_key_values=present_key_value_states, -+ hidden_states=all_hidden_states, -+ attentions=all_attentions, -+ cross_attentions=all_cross_attentions, -+ ) - - - MT5_START_DOCSTRING = r""" -@@ -1677,29 +1551,6 @@ class MT5Model(MT5PreTrainedModel): - ) - - --class EncoderToCrossKeyValue(nn.Module): -- def __init__(self, cross_key, cross_value, num_heads, d_kv): -- super().__init__() -- self.cross_key = cross_key -- self.cross_value = cross_value -- self.num_heads = num_heads -- self.d_kv = d_kv -- -- -- def forward(self, hidden_states): -- batch_size = hidden_states.shape[0] -- encoder_hidden_states_kvs = [] -- # for i in range(len(self.cross_value)): -- # encoder_hidden_states_kvs.append( -- # torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), -- # self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) -- for i in range(len(self.cross_value)): -- encoder_hidden_states_kvs.append( -- torch.stack((self.cross_key[i](hidden_states), -- self.cross_value[i](hidden_states)), dim=0)) -- past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0) -- return past_cross_key_values -- - @add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING) - class MT5ForConditionalGeneration(MT5PreTrainedModel): - r""" -@@ -1724,45 +1575,28 @@ class MT5ForConditionalGeneration(MT5Pre - _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] - - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5 -- def __init__(self, config: MT5Config, encoder_path=None, decoder_path=None, device_id=0): -+ def __init__(self, config: MT5Config): - super().__init__(config) -- -- self.encoder_path = encoder_path -- 
self.decoder_path = decoder_path -- if not self.encoder_path or not self.decoder_path: -- self.model_dim = config.d_model -- self.shared = nn.Embedding(config.vocab_size, config.d_model) -- decoder_config = copy.deepcopy(config) -- decoder_config.is_decoder = True -- decoder_config.is_encoder_decoder = False -- decoder_config.num_layers = config.num_decoder_layers -- -- -- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) -- self.decoder = MT5Stack(decoder_config, self.shared, self.lm_head) -- cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) -- cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) -- encodecrosskeyvalue = EncoderToCrossKeyValue(cross_key, cross_value, config.num_heads, config.d_kv) -- encoder_config = copy.deepcopy(config) -- encoder_config.is_decoder = False -- encoder_config.use_cache = False -- encoder_config.is_encoder_decoder = False -- self.encoder = MT5Stack(encoder_config, self.shared, encodecrosskeyvalue=encodecrosskeyvalue) -- self.encoder_mindie = None -- self.decoder_mindie = None -- if self.encoder_path: -- self.encoder_mindie = torch.jit.load(self.encoder_path) -- if self.decoder_path: -- self.decoder_mindie = torch.jit.load(self.decoder_path) -- self.stream = torch.npu.Stream(f"npu:{device_id}") -- self.device_id = device_id -- -- -- def get_device(self): -- return f"npu:{self.device_id}" -+ self.model_dim = config.d_model -+ -+ self.shared = nn.Embedding(config.vocab_size, config.d_model) -+ -+ encoder_config = copy.deepcopy(config) -+ encoder_config.is_decoder = False -+ encoder_config.use_cache = False -+ encoder_config.is_encoder_decoder = False -+ self.encoder = MT5Stack(encoder_config, self.shared) -+ -+ decoder_config = copy.deepcopy(config) -+ decoder_config.is_decoder = True -+ decoder_config.is_encoder_decoder = False -+ decoder_config.num_layers = config.num_decoder_layers -+ self.decoder = MT5Stack(decoder_config, self.shared) -+ -+ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) - - # Initialize weights and apply final processing -- # self.post_init() -+ self.post_init() - - # Model parallel - self.model_parallel = False -@@ -1845,7 +1679,6 @@ class MT5ForConditionalGeneration(MT5Pre - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, -- past_cross_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, -@@ -1893,23 +1726,76 @@ class MT5ForConditionalGeneration(MT5Pre - if self.config.num_layers == self.config.num_decoder_layers: - warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) - decoder_head_mask = head_mask -- -- hidden_states = encoder_outputs["last_hidden_state"] -- past_cross_key_values = encoder_outputs["past_cross_key_values"] - -- # if self.model_parallel: -- # torch.cuda.set_device(self.decoder.first_device) -+ # Encode if needed (training, first prediction pass) -+ if encoder_outputs is None: -+ # Convert encoder inputs in embeddings if needed -+ encoder_outputs = self.encoder( -+ input_ids=input_ids, -+ attention_mask=attention_mask, -+ inputs_embeds=inputs_embeds, -+ head_mask=head_mask, -+ output_attentions=output_attentions, -+ 
output_hidden_states=output_hidden_states, -+ return_dict=return_dict, -+ ) -+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): -+ encoder_outputs = BaseModelOutput( -+ last_hidden_state=encoder_outputs[0], -+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, -+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, -+ ) -+ -+ hidden_states = encoder_outputs[0] -+ -+ if self.model_parallel: -+ torch.cuda.set_device(self.decoder.first_device) - - if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: - # get decoder inputs from shifting lm labels to the right - decoder_input_ids = self._shift_right(labels) - -- with torch.npu.stream(self.stream): # set stream -- # import pdb -- # pdb.set_trace() -- decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) -- self.stream.synchronize() # synchronize -- # decoder_outputs = self.decoder(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) -+ # Set device for model parallelism -+ if self.model_parallel: -+ torch.cuda.set_device(self.decoder.first_device) -+ hidden_states = hidden_states.to(self.decoder.first_device) -+ if decoder_input_ids is not None: -+ decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) -+ if attention_mask is not None: -+ attention_mask = attention_mask.to(self.decoder.first_device) -+ if decoder_attention_mask is not None: -+ decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) -+ -+ # Decode -+ decoder_outputs = self.decoder( -+ input_ids=decoder_input_ids, -+ attention_mask=decoder_attention_mask, -+ inputs_embeds=decoder_inputs_embeds, -+ past_key_values=past_key_values, -+ encoder_hidden_states=hidden_states, -+ encoder_attention_mask=attention_mask, -+ head_mask=decoder_head_mask, -+ cross_attn_head_mask=cross_attn_head_mask, -+ use_cache=use_cache, -+ output_attentions=output_attentions, -+ output_hidden_states=output_hidden_states, -+ return_dict=return_dict, -+ ) -+ -+ sequence_output = decoder_outputs[0] -+ -+ # Set device for model parallelism -+ if self.model_parallel: -+ torch.cuda.set_device(self.encoder.first_device) -+ self.lm_head = self.lm_head.to(self.encoder.first_device) -+ sequence_output = sequence_output.to(self.lm_head.weight.device) -+ -+ if self.config.tie_word_embeddings: -+ # Rescale output before projecting on vocab -+ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 -+ sequence_output = sequence_output * (self.model_dim**-0.5) -+ -+ lm_logits = self.lm_head(sequence_output) - - loss = None - if labels is not None: -@@ -1922,10 +1808,17 @@ class MT5ForConditionalGeneration(MT5Pre - if not return_dict: - output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs - return ((loss,) + output) if loss is not None else output -+ - return Seq2SeqLMOutput( - loss=loss, -- logits=decoder_outputs[0], -- past_key_values=decoder_outputs[1] -+ logits=lm_logits, -+ past_key_values=decoder_outputs.past_key_values, -+ decoder_hidden_states=decoder_outputs.hidden_states, -+ decoder_attentions=decoder_outputs.attentions, -+ cross_attentions=decoder_outputs.cross_attentions, -+ encoder_last_hidden_state=encoder_outputs.last_hidden_state, -+ encoder_hidden_states=encoder_outputs.hidden_states, -+ encoder_attentions=encoder_outputs.attentions, - ) - - # Copied from 
transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation -@@ -1933,7 +1826,6 @@ class MT5ForConditionalGeneration(MT5Pre - self, - input_ids, - past_key_values=None, -- past_cross_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, -@@ -1959,7 +1851,6 @@ class MT5ForConditionalGeneration(MT5Pre - return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, -- "past_cross_key_values": past_cross_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, -- Gitee From 1c98df50261ea7eba496c23e0fb978199abd95da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:53:17 +0000 Subject: [PATCH 042/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/T5=5Fmodeling=5Ft5=5Fpatch.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/T5/T5_modeling_t5_patch.py | 28 ------------------- 1 file changed, 28 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py deleted file mode 100644 index e304f4f9f2..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import transformers - - -def main(): - transformers_path = transformers.__path__ - transformers_version = transformers.__version__ - - assert transformers_version =='4.42.0', "expectation transformers==4.42.0" - os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') - - -if __name__ == '__main__': - main() -- Gitee From f53e742428dce93119e97d52e96b2bf1f6b31b69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:53:29 +0000 Subject: [PATCH 043/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/T5=5Fmodeling=5Futils=5Fpatch.p?= =?UTF-8?q?y?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/T5/T5_modeling_utils_patch.py | 28 ------------------- 1 file changed, 28 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py deleted file mode 100644 index b3ad7bc20b..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_utils_patch.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import transformers - - -def main(): - transformers_path = transformers.__path__ - transformers_version = transformers.__version__ - - assert transformers_version == '4.42.0', "expectation transformers==4.42.0" - os.system(f'patch -p0 {transformers_path[0]}/modeling_utils.py modeling_utils.patch') - - -if __name__ == '__main__': - main() -- Gitee From d44f0ade1f0f81d6be81e609a3fd6e3c386d24a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:53:41 +0000 Subject: [PATCH 044/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/T5=5Futils=5Fpatch.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/T5/T5_utils_patch.py | 28 ------------------- 1 file changed, 28 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py deleted file mode 100644 index 046b6e6b85..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/T5_utils_patch.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import transformers - - -def main(): - transformers_path = transformers.__path__ - transformers_version = transformers.__version__ - - assert transformers_version == '4.42.0', "expectation transformers==4.42.0" - os.system(f'patch -p0 {transformers_path[0]}/generation/utils.py utils.patch') - - -if __name__ == '__main__': - main() -- Gitee From eceacc242939e507a2c1a74ebea6fc463f99bebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:53:59 +0000 Subject: [PATCH 045/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/modeling=5Foutputs.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch deleted file mode 100644 index 6c99414a69..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_outputs.patch +++ /dev/null @@ -1,10 +0,0 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/modeling_outputs.py 2024-08-28 19:20:22.112000000 +0800 -+++ modeling_outputs.py 2024-09-02 18:32:37.720000000 +0800 -@@ -282,7 +282,6 @@ class BaseModelOutputWithPastAndCrossAtt - - last_hidden_state: torch.FloatTensor = None - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None -- past_cross_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None - attentions: Optional[Tuple[torch.FloatTensor, ...]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None -- Gitee From 3e2708551fdd041724b830c29a8c4474a783f884 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:54:14 +0000 Subject: [PATCH 046/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/modeling=5Ft5.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/T5/modeling_t5.patch | 596 ------------------ 1 file changed, 596 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch deleted file mode 100644 index 40920ac007..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch +++ /dev/null @@ -1,596 +0,0 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py 2024-09-04 19:03:55.080000000 +0800 -+++ modling_t5.py 2024-09-04 19:04:47.048000000 +0800 -@@ -23,8 +23,6 @@ from typing import List, Optional, Tuple - import torch - from torch import nn - from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss --import torch_npu --import mindietorch - - from ...activations import ACT2FN - from ...modeling_outputs import ( -@@ -451,7 +449,6 @@ class T5Attention(nn.Module): - key_value_states=None, - position_bias=None, - past_key_value=None, -- past_cross_key_value=None, - layer_head_mask=None, - query_length=None, - use_cache=False, -@@ -468,8 +465,7 @@ class T5Attention(nn.Module): - real_seq_length = seq_length - - if past_key_value is not None: -- if past_key_value.shape[0] != 2: -- # if len(past_key_value) != 2: -+ if len(past_key_value) != 2: - 
raise ValueError( - f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" - ) -@@ -497,7 +493,6 @@ class T5Attention(nn.Module): - hidden_states = shape(proj_layer(key_value_states)) - - if past_key_value is not None: -- past_key_value = shape(past_key_value) - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) -@@ -571,133 +566,16 @@ class T5Attention(nn.Module): - - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- -- if output_attentions: -- outputs = outputs + (attn_weights,) -- return outputs -- -- --class T5SelfAttention(T5Attention): -- def __init__(self, config: T5Config, has_relative_attention_bias=False): -- super().__init__(config, has_relative_attention_bias) -- -- def forward( -- self, -- hidden_states, -- mask=None, -- position_bias=None, -- past_key_value=None, -- layer_head_mask=None, -- use_cache=False, -- output_attentions=False, -- ): -- """ -- Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). -- """ -- # Input is (batch_size, seq_length, dim) -- # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) -- # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) -- batch_size, seq_length = hidden_states.shape[:2] -- -- real_seq_length = seq_length -- -- if past_key_value is not None: -- if past_key_value.shape[0] != 2: -- # if len(past_key_value) != 2: -- raise ValueError( -- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" -- ) -- real_seq_length += past_key_value[0].shape[2] -- key_length = real_seq_length -- -- def shape(states): -- """projection""" -- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) -- -- def unshape(states): -- """reshape""" -- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) -- -- def project(hidden_states, proj_layer, past_key_value): -- """projects hidden states correctly to key/query states""" -- if past_key_value is None: -- # cross-attn -- # (batch_size, n_heads, seq_length, dim_per_head) -- hidden_states = shape(proj_layer(hidden_states)) -- -- if past_key_value is not None: -- hidden_states = shape(proj_layer(hidden_states)) -- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) -- return hidden_states -- -- # get query states -- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) -- -- # get key/value states -- key_states = project( -- hidden_states, self.k, past_key_value[0] if past_key_value is not None else None -- ) -- value_states = project( -- hidden_states, self.v, past_key_value[1] if past_key_value is not None else None -- ) -- # compute scores -- scores = torch.matmul( -- query_states, key_states.transpose(3, 2) -- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 -- # print("scores=",scores.dtype) -- if position_bias is None: -- if not self.has_relative_attention_bias: -- position_bias = torch.zeros( -- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype -- ) -- if self.gradient_checkpointing and self.training: -- position_bias.requires_grad = True -- else: -- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) -- -- 
# if key and values are already calculated -- # we want only the last query position bias -- if past_key_value is not None: -- position_bias = position_bias[:, :, -hidden_states.size(1) :, :] -- -- if mask is not None: -- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) - -- if self.pruned_heads: -- mask = torch.ones(position_bias.shape[1]) -- mask[list(self.pruned_heads)] = 0 -- position_bias_masked = position_bias[:, mask.bool()] -- else: -- position_bias_masked = position_bias -- -- scores += position_bias_masked -- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( -- scores -- ) # (batch_size, n_heads, seq_length, key_length) -- attn_weights = nn.functional.dropout( -- attn_weights, p=self.dropout, training=self.training -- ) # (batch_size, n_heads, seq_length, key_length) -- -- # Mask heads if we want to -- if layer_head_mask is not None: -- attn_weights = attn_weights * layer_head_mask -- -- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) -- attn_output = self.o(attn_output) -- -- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None -- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) - if output_attentions: - outputs = outputs + (attn_weights,) - return outputs - - -- -- - class T5LayerSelfAttention(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() -- self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) -+ self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - -@@ -784,7 +662,6 @@ class T5Block(nn.Module): - layer_head_mask=None, - cross_attn_layer_head_mask=None, - past_key_value=None, -- past_cross_key_value=None, - use_cache=False, - output_attentions=False, - return_dict=True, -@@ -794,15 +671,15 @@ class T5Block(nn.Module): - logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - -- # if len(past_key_value) != expected_num_past_key_values: -- # raise ValueError( -- # f"There should be {expected_num_past_key_values} past states. " -- # f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" -- # f"Got {len(past_key_value)} past key / value states" -- # ) -+ if len(past_key_value) != expected_num_past_key_values: -+ raise ValueError( -+ f"There should be {expected_num_past_key_values} past states. " -+ f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" -+ f"Got {len(past_key_value)} past key / value states" -+ ) - -- self_attn_past_key_value = past_key_value -- cross_attn_past_key_value = past_cross_key_value -+ self_attn_past_key_value = past_key_value[:2] -+ cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - -@@ -859,7 +736,9 @@ class T5Block(nn.Module): - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - # Combine self attn and cross attn key value states -- cross_attn_past_key_values = cross_attention_outputs[1] -+ if present_key_value_state is not None: -+ present_key_value_state = present_key_value_state + cross_attention_outputs[1] -+ - # Keep cross-attention outputs and relative position weights - attention_outputs = attention_outputs + cross_attention_outputs[2:] - -@@ -878,7 +757,7 @@ class T5Block(nn.Module): - outputs = (hidden_states,) - - if use_cache: -- outputs = outputs + (present_key_value_state,) +(cross_attn_past_key_values,)+ attention_outputs -+ outputs = outputs + (present_key_value_state,) + attention_outputs - else: - outputs = outputs + attention_outputs - -@@ -1018,14 +897,11 @@ class T5PreTrainedModel(PreTrainedModel) - - - class T5Stack(T5PreTrainedModel): -- def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskeyvalue=None): -+ def __init__(self, config, embed_tokens=None): - super().__init__(config) - - self.embed_tokens = embed_tokens - self.is_decoder = config.is_decoder -- self.lm_head=lm_head -- self.encodecrosskeyvalue = encodecrosskeyvalue -- self.model_dim = config.d_model - - self.block = nn.ModuleList( - [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -1093,14 +969,13 @@ class T5Stack(T5PreTrainedModel): - def forward( - self, - input_ids=None, -+ attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, -- past_key_values=None, -- past_cross_key_values=None, -- attention_mask=None, - inputs_embeds=None, - head_mask=None, - cross_attn_head_mask=None, -+ past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, -@@ -1146,9 +1021,9 @@ class T5Stack(T5PreTrainedModel): - raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") - - # initialize past_key_values with `None` if past does not exist -- if not self.is_decoder: -+ if past_key_values is None: - past_key_values = [None] * len(self.block) -- past_cross_key_values = [None] * len(self.block) -+ - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - -@@ -1179,7 +1054,7 @@ class T5Stack(T5PreTrainedModel): - # Prepare head mask if needed - head_mask = self.get_head_mask(head_mask, self.config.num_layers) - cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) -- present_key_value_states = [] if use_cache else None -+ present_key_value_states = () if use_cache else None - all_hidden_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1187,10 +1062,8 @@ class T5Stack(T5PreTrainedModel): - encoder_decoder_position_bias = None - - hidden_states = self.dropout(inputs_embeds) -- for i, layer_module in enumerate(self.block): -- # for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): -- 
past_key_value = past_key_values[i] -- past_cross_key_value = past_cross_key_values[i] -+ -+ for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): - layer_head_mask = head_mask[i] - cross_attn_layer_head_mask = cross_attn_head_mask[i] - # Model parallel -@@ -1240,7 +1113,6 @@ class T5Stack(T5PreTrainedModel): - layer_head_mask=layer_head_mask, - cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key_value=past_key_value, -- past_cross_key_value=past_cross_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) -@@ -1248,19 +1120,19 @@ class T5Stack(T5PreTrainedModel): - # layer_outputs is a tuple with: - # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - if use_cache is False: -- layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] -+ layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] - -- hidden_states, present_key_value_state, present_cross_key_value_state = layer_outputs[:3] -+ hidden_states, present_key_value_state = layer_outputs[:2] - - # We share the position biases between the layers - the first layer store them - # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), - # (cross-attention position bias), (cross-attention weights) -- position_bias = layer_outputs[3] -+ position_bias = layer_outputs[2] - if self.is_decoder and encoder_hidden_states is not None: -- encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] -+ encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] - # append next layer key value states - if use_cache: -- present_key_value_states.extend(present_key_value_state) -+ present_key_value_states = present_key_value_states + (present_key_value_state,) - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[3],) -@@ -1274,7 +1146,7 @@ class T5Stack(T5PreTrainedModel): - hidden_states = hidden_states.to("cuda:" + str(k + 1)) - - hidden_states = self.final_layer_norm(hidden_states) -- hidden_states = self.dropout(hidden_states).half() -+ hidden_states = self.dropout(hidden_states) - - # Add last layer - if output_hidden_states: -@@ -1292,17 +1164,13 @@ class T5Stack(T5PreTrainedModel): - ] - if v is not None - ) -- present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None -- if not self.is_decoder and self.encodecrosskeyvalue: -- res = self.encodecrosskeyvalue(hidden_states) -- return tuple((hidden_states, res)) -- lm_logits = None -- if self.is_decoder: -- #logits = None -- if self.config.tie_word_embeddings: -- hidden_states = hidden_states * (self.model_dim ** -0.5) -- lm_logits = self.lm_head(hidden_states) -- return tuple((lm_logits, present_key_value_states)) -+ return BaseModelOutputWithPastAndCrossAttentions( -+ last_hidden_state=hidden_states, -+ past_key_values=present_key_value_states, -+ hidden_states=all_hidden_states, -+ attentions=all_attentions, -+ cross_attentions=all_cross_attentions, -+ ) - - - T5_START_DOCSTRING = r""" -@@ -1673,31 +1541,6 @@ class T5Model(T5PreTrainedModel): - ) - - -- --class EncoderToCrossKeyValue(nn.Module): -- def __init__(self, cross_key, cross_value, num_heads, d_kv): -- super().__init__() -- self.cross_key = cross_key -- self.cross_value = cross_value -- self.num_heads = num_heads -- self.d_kv = d_kv -- -- -- def 
forward(self, hidden_states): -- batch_size = hidden_states.shape[0] -- encoder_hidden_states_kvs = [] -- # for i in range(len(self.cross_value)): -- # encoder_hidden_states_kvs.append( -- # torch.stack((self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2), -- # self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1, 2)), dim=0)) -- for i in range(len(self.cross_value)): -- encoder_hidden_states_kvs.append( -- torch.stack((self.cross_key[i](hidden_states), -- self.cross_value[i](hidden_states)), dim=0)) -- past_cross_key_values = torch.stack(encoder_hidden_states_kvs, dim=0) -- return past_cross_key_values -- -- - @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) - class T5ForConditionalGeneration(T5PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [ -@@ -1705,47 +1548,28 @@ class T5ForConditionalGeneration(T5PreTr - ] - _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] - -- def __init__(self, config: T5Config, encoder_path=None, decoder_path=None, device_id=0): -+ def __init__(self, config: T5Config): - super().__init__(config) -- self.encoder_path = encoder_path -- self.decoder_path = decoder_path -- if not self.encoder_path or not self.decoder_path: -- self.model_dim = config.d_model -- -- self.shared = nn.Embedding(config.vocab_size, config.d_model) -- -- decoder_config = copy.deepcopy(config) -- decoder_config.is_decoder = True -- decoder_config.is_encoder_decoder = False -- decoder_config.num_layers = config.num_decoder_layers -- -- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) -- self.decoder = T5Stack(decoder_config, self.shared, self.lm_head) -- -- cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) -- cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) -- encodecrosskeyvalue = EncoderToCrossKeyValue(cross_key, cross_value, config.num_heads, config.d_kv) -- -- encoder_config = copy.deepcopy(config) -- encoder_config.is_decoder = False -- encoder_config.use_cache = False -- encoder_config.is_encoder_decoder = False -- self.encoder = T5Stack(encoder_config, self.shared, encodecrosskeyvalue=encodecrosskeyvalue) -- self.encoder_mindie = None -- self.decoder_mindie = None -- if self.encoder_path: -- self.encoder_mindie = torch.jit.load(self.encoder_path) -- if self.decoder_path: -- self.decoder_mindie = torch.jit.load(self.decoder_path) -- self.stream = torch.npu.Stream(f"npu:{device_id}") -- self.device_id = device_id -- -- -- def get_device(self): -- return f"npu:{self.device_id}" -+ self.model_dim = config.d_model -+ -+ self.shared = nn.Embedding(config.vocab_size, config.d_model) -+ -+ encoder_config = copy.deepcopy(config) -+ encoder_config.is_decoder = False -+ encoder_config.use_cache = False -+ encoder_config.is_encoder_decoder = False -+ self.encoder = T5Stack(encoder_config, self.shared) -+ -+ decoder_config = copy.deepcopy(config) -+ decoder_config.is_decoder = True -+ decoder_config.is_encoder_decoder = False -+ decoder_config.num_layers = config.num_decoder_layers -+ self.decoder = T5Stack(decoder_config, self.shared) -+ -+ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) - - # Initialize weights and apply final processing -- # self.post_init() -+ self.post_init() - - # Model parallel - self.model_parallel 
= False -@@ -1824,7 +1648,6 @@ class T5ForConditionalGeneration(T5PreTr - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, -- past_cross_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, -@@ -1872,23 +1695,76 @@ class T5ForConditionalGeneration(T5PreTr - if self.config.num_layers == self.config.num_decoder_layers: - warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) - decoder_head_mask = head_mask -- -- hidden_states = encoder_outputs["last_hidden_state"] -- past_cross_key_values = encoder_outputs["past_cross_key_values"] - -- # if self.model_parallel: -- # torch.cuda.set_device(self.decoder.first_device) -+ # Encode if needed (training, first prediction pass) -+ if encoder_outputs is None: -+ # Convert encoder inputs in embeddings if needed -+ encoder_outputs = self.encoder( -+ input_ids=input_ids, -+ attention_mask=attention_mask, -+ inputs_embeds=inputs_embeds, -+ head_mask=head_mask, -+ output_attentions=output_attentions, -+ output_hidden_states=output_hidden_states, -+ return_dict=return_dict, -+ ) -+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): -+ encoder_outputs = BaseModelOutput( -+ last_hidden_state=encoder_outputs[0], -+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, -+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, -+ ) -+ -+ hidden_states = encoder_outputs[0] -+ -+ if self.model_parallel: -+ torch.cuda.set_device(self.decoder.first_device) - - if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: - # get decoder inputs from shifting lm labels to the right - decoder_input_ids = self._shift_right(labels) - -- with torch.npu.stream(self.stream): # set stream -- # import pdb -- # pdb.set_trace() -- decoder_outputs = self.decoder_mindie.forward(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) -- self.stream.synchronize() # synchronize -- # decoder_outputs = self.decoder(decoder_input_ids, hidden_states, attention_mask, past_key_values, past_cross_key_values) -+ # Set device for model parallelism -+ if self.model_parallel: -+ torch.cuda.set_device(self.decoder.first_device) -+ hidden_states = hidden_states.to(self.decoder.first_device) -+ if decoder_input_ids is not None: -+ decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) -+ if attention_mask is not None: -+ attention_mask = attention_mask.to(self.decoder.first_device) -+ if decoder_attention_mask is not None: -+ decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) -+ -+ # Decode -+ decoder_outputs = self.decoder( -+ input_ids=decoder_input_ids, -+ attention_mask=decoder_attention_mask, -+ inputs_embeds=decoder_inputs_embeds, -+ past_key_values=past_key_values, -+ encoder_hidden_states=hidden_states, -+ encoder_attention_mask=attention_mask, -+ head_mask=decoder_head_mask, -+ cross_attn_head_mask=cross_attn_head_mask, -+ use_cache=use_cache, -+ output_attentions=output_attentions, -+ output_hidden_states=output_hidden_states, -+ return_dict=return_dict, -+ ) -+ -+ sequence_output = decoder_outputs[0] -+ -+ # Set device for model parallelism -+ if self.model_parallel: -+ torch.cuda.set_device(self.encoder.first_device) -+ self.lm_head = 
self.lm_head.to(self.encoder.first_device) -+ sequence_output = sequence_output.to(self.lm_head.weight.device) -+ -+ if self.config.tie_word_embeddings: -+ # Rescale output before projecting on vocab -+ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 -+ sequence_output = sequence_output * (self.model_dim**-0.5) -+ -+ lm_logits = self.lm_head(sequence_output) - - loss = None - if labels is not None: -@@ -1901,17 +1777,23 @@ class T5ForConditionalGeneration(T5PreTr - if not return_dict: - output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs - return ((loss,) + output) if loss is not None else output -+ - return Seq2SeqLMOutput( - loss=loss, -- logits=decoder_outputs[0], -- past_key_values=decoder_outputs[1] -+ logits=lm_logits, -+ past_key_values=decoder_outputs.past_key_values, -+ decoder_hidden_states=decoder_outputs.hidden_states, -+ decoder_attentions=decoder_outputs.attentions, -+ cross_attentions=decoder_outputs.cross_attentions, -+ encoder_last_hidden_state=encoder_outputs.last_hidden_state, -+ encoder_hidden_states=encoder_outputs.hidden_states, -+ encoder_attentions=encoder_outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, -- past_cross_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, -@@ -1937,7 +1819,6 @@ class T5ForConditionalGeneration(T5PreTr - return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, -- "past_cross_key_values": past_cross_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, -@@ -2086,6 +1967,7 @@ class T5EncoderModel(T5PreTrainedModel): - >>> last_hidden_states = outputs.last_hidden_state - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict -+ - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, -- Gitee From 6f87011e6e1d32ec35dc05fe082fe6193c29832d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:54:26 +0000 Subject: [PATCH 047/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/modeling=5Futils.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/T5/modeling_utils.patch | 41 ------------------- 1 file changed, 41 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch deleted file mode 100644 index 1b9fef8cd2..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_utils.patch +++ /dev/null @@ -1,41 +0,0 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/modeling_utils.py 2024-08-28 20:15:38.524000000 +0800 -+++ modeling_utils.py 2024-09-02 17:29:43.700000000 +0800 -@@ -975,7 +975,7 @@ class ModuleUtilsMixin: - `torch.device`: The device on which the module is (assuming that all the module parameters are on the same - device). 
- """ -- return self.get_device() -+ return get_parameter_device(self) - - @property - def dtype(self) -> torch.dtype: -@@ -1004,8 +1004,7 @@ class ModuleUtilsMixin: - # encoder_extended_attention_mask = (encoder_extended_attention_mask == - # encoder_extended_attention_mask.transpose(-1, -2)) - encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility -- #encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min -- encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 -+ encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min - - return encoder_extended_attention_mask - -@@ -1019,9 +1018,7 @@ class ModuleUtilsMixin: - device = attention_mask.device - batch_size, seq_length = input_shape - seq_ids = torch.arange(seq_length, device=device) -- # print("seq_ids=",seq_ids) - causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] -- # print("causal_mask=",causal_mask) - # in case past_key_values are used we need to add a prefix ones mask to the causal mask - # causal and attention masks must have same type with pytorch version < 1.3 - causal_mask = causal_mask.to(attention_mask.dtype) -@@ -1088,8 +1085,7 @@ class ModuleUtilsMixin: - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility -- #extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min -- extended_attention_mask = (1.0 - extended_attention_mask) * -1000 -+ extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min - return extended_attention_mask - - def get_head_mask( -- Gitee From a87eaa722ec0f636c0c9aa7bf1c5b43e27366f4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:57:09 +0000 Subject: [PATCH 048/110] add MindIE/MindIE-Torch/built-in/T5. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 -- Gitee From 83fcd45b764f7aafad49e04a9c9582beb7161c08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:57:33 +0000 Subject: [PATCH 049/110] add MindIE/MindIE-Torch/built-in/T5/modeling_t5.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/modeling_t5.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_t5.py diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 80d5d1980a3499cc6f9f50c7fdcedf1c4781f747 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 02:58:09 +0000 Subject: [PATCH 050/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../MindIE-Torch/built-in/T5/modeling_t5.py | 1045 +++++++++++++++++ 1 file changed, 1045 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py index e69de29bb2..c764d99e7b 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py @@ -0,0 +1,1045 @@ +# coding=utf-8 +# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch T5 model.""" + +import copy +import math +import os +import warnings +from typing import List, Optional, Tuple, Union +from dataclasses import dataclass +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +# import torch_npu +import mindietorch + + + + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + DUMMY_INPUTS, + DUMMY_MASK, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_torch_fx_proxy, + logging, + replace_return_docstrings, +) +from ...utils.model_parallel_utils import assert_device_map, get_device_map +from .configuration_t5 import T5Config +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList +from transformers.generation.configuration_utils import GenerationMode +from transformers.utils.generic import ModelOutput + + +@dataclass +class Seq2SeqLMOutput(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
+ + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + past_keys: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "T5Config" +_CHECKPOINT_FOR_DOC = "google-t5/t5-small" + +#################################################### +# This dict contains ids and associated url +# for the pretrained weights provided with the models +#################################################### + + +#################################################### +# This is a conversion method from TF 1.0 to PyTorch +# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28 +#################################################### +def load_tf_weights_in_t5(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + tf_weights = {} + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + tf_weights[name] = array + + for txt_name in names: + name = txt_name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + tf_weights.pop(txt_name, None) + continue + if "_slot_" in name[-1]: + logger.info(f"Skipping {'/'.join(name)}") + tf_weights.pop(txt_name, None) + continue + pointer = model + array = tf_weights[txt_name] + + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] in ["kernel", "scale", "embedding"]: + pointer = getattr(pointer, "weight") + elif scope_names[0] == "self_attention": + pointer = getattr(pointer, "layer") + pointer = pointer[0] + elif scope_names[0] == "enc_dec_attention": + pointer = getattr(pointer, "layer") + pointer = pointer[1] + elif scope_names[0] == "dense_relu_dense": + pointer = getattr(pointer, "layer") + pointer = pointer[2] + elif scope_names[0] == "rms_norm": + if hasattr(pointer, "layer_norm"): + pointer = getattr(pointer, "layer_norm") + elif hasattr(pointer, "final_layer_norm"): + pointer = getattr(pointer, "final_layer_norm") + 
elif scope_names[0] == "scale": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + elif scope_names[0] == "decoder" and name[1] == "logits": + continue + elif scope_names[0] == "logits": + pointer = getattr(pointer, "lm_head") + elif scope_names[0] == "wi" and len(scope_names) > 1 and scope_names[1].isdigit(): + pointer = getattr(pointer, f"wi_{scope_names[1]}") + continue + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if scope_names[0] not in ["kernel", "scale", "embedding"]: + pointer = getattr(pointer, "weight") + if scope_names[0] != "embedding": + logger.info(f"Transposing numpy weight of shape {array.shape} for {name}") + array = np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array.astype(np.float32)) + tf_weights.pop(txt_name, None) + + logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.") + return model + + +#################################################### +# PyTorch Models are constructed by sub-classing +# - torch.nn.Module for the layers and +# - PreTrainedModel for the models (it-self a sub-class of nn.Module) +#################################################### +PARALLELIZE_DOCSTRING = r""" + This is an experimental feature and is a subject to change at a moment's notice. + + Uses a device map to distribute attention modules of the model across several devices. If no device map is given, + it will evenly distribute blocks across all devices. + + Args: + device_map (`Dict[int, list]`, optional, defaults to None): + A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always + automatically mapped to the first device (for esoteric reasons). That means that the first device should + have fewer attention modules mapped to it than other devices. For reference, the t5 models have the + following number of attention modules: + + - google-t5/t5-small: 6 + - google-t5/t5-base: 12 + - google-t5/t5-large: 24 + - google-t5/t5-3b: 24 + - google-t5/t5-11b: 24 + + Example: + + ```python + # Here is an example of a device map on a machine with 4 GPUs using google-t5/t5-3b, which has a total of 24 attention modules: + model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-3b") + device_map = { + 0: [0, 1, 2], + 1: [3, 4, 5, 6, 7, 8, 9], + 2: [10, 11, 12, 13, 14, 15, 16], + 3: [17, 18, 19, 20, 21, 22, 23], + } + model.parallelize(device_map) + ``` +""" +DEPARALLELIZE_DOCSTRING = r""" + Moves the model to cpu from a model parallel state. 
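+    It reverses a previous parallelize() call, moving every block back to the CPU and freeing accelerator memory
+    via torch.cuda.empty_cache().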
+ + Example: + + ```python + # On a 4 GPU machine with google-t5/t5-3b: + model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-3b") + device_map = { + 0: [0, 1, 2], + 1: [3, 4, 5, 6, 7, 8, 9], + 2: [10, 11, 12, 13, 14, 15, 16], + 3: [17, 18, 19, 20, 21, 22, 23], + } + model.parallelize(device_map) # Splits the model across several devices + model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache() + ``` +""" + + +class T5LayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Construct a layernorm module in the T5 style. No bias and no subtraction of mean. + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated + # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for + # half-precision inputs is done in fp32 + + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states + + +try: + from apex.normalization import FusedRMSNorm + + T5LayerNorm = FusedRMSNorm # noqa + + logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm") +except ImportError: + # using the normal T5LayerNorm + pass +except Exception: + logger.warning("discovered apex but it failed to load, falling back to T5LayerNorm") + pass + +ALL_LAYERNORM_LAYERS.append(T5LayerNorm) + + +class T5DenseActDense(nn.Module): + def __init__(self, config: T5Config): + super().__init__() + self.wi = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.act = ACT2FN[config.dense_act_fn] + + def forward(self, hidden_states): + hidden_states = self.wi(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.dropout(hidden_states) + if ( + isinstance(self.wo.weight, torch.Tensor) + and hidden_states.dtype != self.wo.weight.dtype + and self.wo.weight.dtype != torch.int8 + ): + hidden_states = hidden_states.to(self.wo.weight.dtype) + hidden_states = self.wo(hidden_states) + return hidden_states + + +class T5DenseGatedActDense(nn.Module): + def __init__(self, config: T5Config): + super().__init__() + self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.act = ACT2FN[config.dense_act_fn] + + def forward(self, hidden_states): + hidden_gelu = self.act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + + # To make 8bit quantization work for google/flan-t5-xxl, self.wo is kept in float32. 
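+        # In that case the activations are cast to wo's dtype just before the projection below; int8-quantized
+        # weights are deliberately excluded from this cast.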
+ # See https://github.com/huggingface/transformers/issues/20287 + # we also make sure the weights are not in `int8` in case users will force `_keep_in_fp32_modules` to be `None`` + if ( + isinstance(self.wo.weight, torch.Tensor) + and hidden_states.dtype != self.wo.weight.dtype + and self.wo.weight.dtype != torch.int8 + ): + hidden_states = hidden_states.to(self.wo.weight.dtype) + + hidden_states = self.wo(hidden_states) + return hidden_states + + +class T5LayerFF(nn.Module): + def __init__(self, config: T5Config): + super().__init__() + if config.is_gated_act: + self.DenseReluDense = T5DenseGatedActDense(config) + else: + self.DenseReluDense = T5DenseActDense(config) + + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + forwarded_states = self.DenseReluDense(forwarded_states) + hidden_states = hidden_states + self.dropout(forwarded_states) + return hidden_states + + +class T5Attention(nn.Module): + def __init__(self, config: T5Config, has_relative_attention_bias=False): + super().__init__() + self.is_decoder = config.is_decoder + self.has_relative_attention_bias = has_relative_attention_bias + self.relative_attention_num_buckets = config.relative_attention_num_buckets + self.relative_attention_max_distance = config.relative_attention_max_distance + self.d_model = config.d_model + self.key_value_proj_dim = config.d_kv + self.n_heads = config.num_heads + self.dropout = config.dropout_rate + self.inner_dim = self.n_heads * self.key_value_proj_dim + + # Mesh TensorFlow initialization to avoid scaling before softmax + self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.k = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.v = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.o = nn.Linear(self.inner_dim, self.d_model, bias=False) + + if self.has_relative_attention_bias: + self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) + self.pruned_heads = set() + self.gradient_checkpointing = False + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads + ) + # Prune linear layers + self.q = prune_linear_layer(self.q, index) + self.k = prune_linear_layer(self.k, index) + self.v = prune_linear_layer(self.v, index) + self.o = prune_linear_layer(self.o, index, dim=1) + # Update hyper params + self.n_heads = self.n_heads - len(heads) + self.inner_dim = self.key_value_proj_dim * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @staticmethod + def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. 
All relative positions <=-max_distance map to the same bucket. + This should allow for more graceful generalization to longer sequences than the model has been trained on + + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + + Returns: + a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) + """ + relative_buckets = 0 + if bidirectional: + num_buckets //= 2 + relative_buckets += (relative_position > 0).to(torch.long) * num_buckets + relative_position = torch.abs(relative_position) + else: + relative_position = -torch.min(relative_position, torch.zeros_like(relative_position)) + # now relative_position is in the range [0, inf) + + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + relative_position_if_large = max_exact + ( + torch.log(relative_position.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).to(torch.long) + relative_position_if_large = torch.min( + relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1) + ) + + relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) + return relative_buckets + + def compute_bias(self, query_length, key_length, device=None): + """Compute binned relative position bias""" + if device is None: + device = self.relative_attention_bias.weight.device + context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] + relative_position = memory_position - context_position # shape (query_length, key_length) + relative_position_bucket = self._relative_position_bucket( + relative_position, # shape (query_length, key_length) + bidirectional=(not self.is_decoder), + num_buckets=self.relative_attention_num_buckets, + max_distance=self.relative_attention_max_distance, + ) + values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads) + values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length) + return values + + def forward( + self, + hidden_states, + mask=None, + key_value_states=None, + position_bias=None, + past_key=None, + past_value=None, + past_cross_key=None, + past_cross_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, + output_attentions=False, + ): + """ + Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). 
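+        When used in the decoder with use_cache=True, past_key/past_value supply previously computed key/value
+        projections, and the updated key/value states (cast to fp16) are returned together with the attention
+        output and the position bias.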
+ """ + # Input is (batch_size, seq_length, dim) + # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) + # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + batch_size, seq_length = hidden_states.shape[:2] + + real_seq_length = seq_length + + if past_key is not None: + real_seq_length += past_key.shape[2] if query_length is None else query_length + + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] + + def shape(states): + """projection""" + return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + def unshape(states): + """reshape""" + return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + + def project(hidden_states, proj_layer, key_value_states, past_key_value): + """projects hidden states correctly to key/query states""" + if key_value_states is None: + # self-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(hidden_states)) + elif past_key_value is None: + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: + past_key_value = shape(past_key_value) + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) + hidden_states = torch.cat([past_key_value, hidden_states], dim=2) + # elif past_key_value.shape[2] != key_value_states.shape[1]: + # # checking that the `sequence_length` of the `past_key_value` is the same as + # # the provided `key_value_states` to support prefix tuning + # # cross-attn + # # (batch_size, n_heads, seq_length, dim_per_head) + # hidden_states = shape(proj_layer(key_value_states)) + else: + # cross-attn + hidden_states = past_key_value + return hidden_states + + # get query states + query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) + + # get key/value states + key_states = project( + hidden_states, self.k, key_value_states, past_key if past_key is not None else None + ) + value_states = project( + hidden_states, self.v, key_value_states, past_value if past_value is not None else None + ) + # torch.ops.mindie.flash_attention_plugin(query_states, key_states, value_states,) + # compute scores + scores = torch.matmul( + query_states, key_states.transpose(3, 2) + ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype + ) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: + position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) + + # if key and values are already calculated + # we want only the last query position bias + if past_key is not None: + position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + + if mask is not None: + position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + + if self.pruned_heads: + mask = torch.ones(position_bias.shape[1]) + mask[list(self.pruned_heads)] = 0 + position_bias_masked = position_bias[:, mask.bool()] + else: + position_bias_masked = position_bias + scores += position_bias_masked + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( + scores + ) # (batch_size, n_heads, seq_length, 
key_length) + attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) # (batch_size, n_heads, seq_length, key_length) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = attn_weights * layer_head_mask + + attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) + + # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None + present_value_state = (value_states.half(),) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) + + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + +class T5SelfAttention(T5Attention): + def __init__(self, config: T5Config, has_relative_attention_bias=False): + super().__init__(config, has_relative_attention_bias) + + def forward( + self, + hidden_states, + mask=None, + position_bias=None, + past_key=None, + past_value=None, + layer_head_mask=None, + use_cache=False, + output_attentions=False, + ): + """ + Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). + """ + # Input is (batch_size, seq_length, dim) + # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) + # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + batch_size, seq_length = hidden_states.shape[:2] + + real_seq_length = seq_length + + if past_key is not None: + real_seq_length += past_key.shape[2] + key_length = real_seq_length + def shape(states): + """projection""" + return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + def unshape(states): + """reshape""" + return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + + def project(hidden_states, proj_layer, past_key_value): + """projects hidden states correctly to key/query states""" + if past_key_value is None: + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(hidden_states)) + + if past_key_value is not None: + hidden_states = shape(proj_layer(hidden_states)) + hidden_states = torch.cat([past_key_value, hidden_states], dim=2) + return hidden_states + + # get query states + query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) + + # get key/value states + key_states = project( + hidden_states, self.k, past_key if past_key is not None else None + ) + value_states = project( + hidden_states, self.v, past_value if past_value is not None else None + ) + # compute scores + scores = torch.matmul( + query_states, key_states.transpose(3, 2) + ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype + ) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: + position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) + + # if key and values are already calculated + # we want only the last query position bias + if past_key is not None: + position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + if mask 
is not None: + position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + + if self.pruned_heads: + mask = torch.ones(position_bias.shape[1]) + mask[list(self.pruned_heads)] = 0 + position_bias_masked = position_bias[:, mask.bool()] + else: + position_bias_masked = position_bias + scores += position_bias_masked + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( + scores + ) # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) # (batch_size, n_heads, seq_length, key_length) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = attn_weights * layer_head_mask + + attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) + + # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None + present_value_state = (value_states.half(), ) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + + + +class T5LayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key=None, + past_value=None, + use_cache=False, + output_attentions=False, + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.SelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key=past_key, + past_value=past_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them + return outputs + + +class T5LayerCrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + key_value_states, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key=None, + past_value=None, + use_cache=False, + query_length=None, + output_attentions=False, + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.EncDecAttention( + normed_hidden_states, + mask=attention_mask, + key_value_states=key_value_states, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key=past_key, + past_value=past_value, + use_cache=use_cache, + query_length=query_length, + output_attentions=output_attentions, + ) + layer_output = hidden_states + self.dropout(attention_output[0]) + outputs = (layer_output,) + attention_output[1:] # add attentions if we output them + return outputs + + +class T5Block(nn.Module): + def __init__(self, config, 
has_relative_attention_bias=False): + super().__init__() + self.is_decoder = config.is_decoder + self.layer = nn.ModuleList() + self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) + if self.is_decoder: + self.layer.append(T5LayerCrossAttention(config)) + + self.layer.append(T5LayerFF(config)) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key=None, + past_value=None, + past_cross_key=None, + past_cross_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): + if past_key is not None: + self_attn_past_key = past_key + self_attn_past_value = past_value + cross_attn_past_key = past_cross_key + cross_attn_past_value = past_cross_value + else: + self_attn_past_key, self_attn_past_value, cross_attn_past_key, cross_attn_past_value = None, None, None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key=self_attn_past_key, + past_value=self_attn_past_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_state, present_value_state = self_attention_outputs[:3] + attention_outputs = self_attention_outputs[3:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: + clamp_value = torch.where( + torch.isinf(hidden_states).any(), + torch.finfo(hidden_states.dtype).max - 1000, + torch.finfo(hidden_states.dtype).max, + ) + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: + + # the actual query length is unknown for cross attention + # if using past key value states. 
Need to inject it here + if present_key_state is not None: + query_length = present_key_state[0].shape[2] + else: + query_length = None + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, + past_key=cross_attn_past_key, + past_value=cross_attn_past_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: + clamp_value = torch.where( + torch.isinf(hidden_states).any(), + torch.finfo(hidden_states.dtype).max - 1000, + torch.finfo(hidden_states.dtype).max, + ) + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states + # cross_attn_past_key_values = cross_attention_outputs[1] + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[3:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: + clamp_value = torch.where( + torch.isinf(hidden_states).any(), + torch.finfo(hidden_states.dtype).max - 1000, + torch.finfo(hidden_states.dtype).max, + ) + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if use_cache: + outputs = outputs + (present_key_state,) +(present_value_state,)+ attention_outputs + else: + outputs = outputs + attention_outputs + + return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + + +class T5ClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config: T5Config): + super().__init__() + self.dense = nn.Linear(config.d_model, config.d_model) + self.dropout = nn.Dropout(p=config.classifier_dropout) + self.out_proj = nn.Linear(config.d_model, config.num_labels) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class T5PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = T5Config + load_tf_weights = load_tf_weights_in_t5 + base_model_prefix = "transformer" + is_parallelizable = True + supports_gradient_checkpointing = True + _no_split_modules = ["T5Block"] + _keep_in_fp32_modules = ["wo"] + + @property + def dummy_inputs(self): + input_ids = torch.tensor(DUMMY_INPUTS) + input_mask = torch.tensor(DUMMY_MASK) + dummy_inputs = { + "decoder_input_ids": input_ids, + "input_ids": input_ids, + "decoder_attention_mask": input_mask, + } + return dummy_inputs + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor # Used for testing weights initialization + if isinstance(module, T5LayerNorm): + module.weight.data.fill_(factor * 1.0) + elif isinstance( + module, + (T5Model, T5ForConditionalGeneration, T5EncoderModel, T5ForQuestionAnswering), + ): + # Mesh TensorFlow embeddings initialization + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 + module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) + if hasattr(module, "lm_head") and not self.config.tie_word_embeddings: + module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0) + if hasattr(module, "qa_outputs"): + module.qa_outputs.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + module.qa_outputs.bias.data.zero_() + elif isinstance(module, T5ForTokenClassification): + if hasattr(module, "classifier"): + module.classifier.weight.data.normal_(mean=0.0, std=factor * 1.0) + module.classifier.bias.data.zero_() + elif isinstance(module, T5ClassificationHead): + module.dense.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.dense, "bias") and module.dense.bias is not None: + module.dense.bias.data.zero_() + module.out_proj.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.out_proj, "bias") and module.out_proj.bias is not None: + module.out_proj.bias.data.zero_() + elif isinstance(module, T5DenseActDense): + # Mesh TensorFlow FF initialization + # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 + # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 + module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi, "bias") and module.wi.bias is not None: + module.wi.bias.data.zero_() + module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) + if hasattr(module.wo, "bias") and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, T5DenseGatedActDense): + module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None: + module.wi_0.bias.data.zero_() + module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None: + module.wi_1.bias.data.zero_() + module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) + if hasattr(module.wo, "bias") and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, T5Attention): + # Mesh TensorFlow attention initialization to avoid scaling before softmax + # See 
https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 + d_model = self.config.d_model + key_value_proj_dim = self.config.d_kv + n_heads = self.config.num_heads + module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5)) + module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) + module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) + module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5)) + if module.has_relative_attention_bias: + module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5)) + + def _shift_right(self, input_ids): + decoder_start_token_id = self.config.decoder_start_token_id + pad_token_id = self.config.pad_token_id + + if decoder_start_token_id is None: + raise ValueError( + "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. " + "See T5 docs for more information." + ) + + # shift inputs to the right + if is_torch_fx_proxy(input_ids): + # Item assignment is not supported natively for proxies. + shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id) + shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) + else: + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids \ No newline at end of file -- Gitee From 05699199272770fb3c8e7dd1bb38a9c1b4e7a826 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 03:00:15 +0000 Subject: [PATCH 051/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../MindIE-Torch/built-in/T5/modeling_t5.py | 470 +++++++++++++++++- 1 file changed, 469 insertions(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py index c764d99e7b..99fd48535a 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py @@ -1042,4 +1042,472 @@ class T5PreTrainedModel(PreTrainedModel): # replace possible -100 values in labels by `pad_token_id` shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) - return shifted_input_ids \ No newline at end of file + return shifted_input_ids + +class T5Stack(T5PreTrainedModel): + def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskey=None, encodecrossvalue=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder + self.lm_head=lm_head + self.encodecrosskey = encodecrosskey + self.encodecrossvalue = encodecrossvalue + self.model_dim = config.d_model + + self.block = nn.ModuleList( + [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + ) + self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + # Initialize weights and apply final processing + self.post_init() + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + warnings.warn( + "`T5Stack.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model" + " with `device_map='balanced'` in the call to `from_pretrained`. 
You can also provide your own" + " `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0," + " 'block.1': 1, ...}", + FutureWarning, + ) + # Check validity of device_map + self.device_map = ( + get_device_map(len(self.block), range(torch.cuda.device_count())) if device_map is None else device_map + ) + assert_device_map(self.device_map, len(self.block)) + self.model_parallel = True + self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys())) + self.last_device = "cuda:" + str(max(self.device_map.keys())) + # Load onto devices + for k, v in self.device_map.items(): + for layer in v: + cuda_device = "cuda:" + str(k) + self.block[layer] = self.block[layer].to(cuda_device) + + # Set embed_tokens to first layer + self.embed_tokens = self.embed_tokens.to(self.first_device) + # Set final layer norm to last device + self.final_layer_norm = self.final_layer_norm.to(self.last_device) + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) + self.model_parallel = False + self.device_map = None + self.first_device = "cpu" + self.last_device = "cpu" + for i in range(len(self.block)): + self.block[i] = self.block[i].to("cpu") + self.embed_tokens = self.embed_tokens.to("cpu") + self.final_layer_norm = self.final_layer_norm.to("cpu") + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + + def forward( + self, + input_ids=None, + encoder_hidden_states=None, + past_keys=None, + past_values=None, + past_cross_keys=None, + past_cross_values=None, + encoder_attention_mask=None, + attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **model_kwargs + ): + # Model parallel + if self.model_parallel: + torch.cuda.set_device(self.first_device) + self.embed_tokens = self.embed_tokens.to(self.first_device) + use_cache = use_cache if use_cache is not None else self.config.use_cache + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError( + f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" + ) + elif input_ids is not None: + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") + + if inputs_embeds is None: + if self.embed_tokens is None: + raise ValueError("You have to initialize the model with valid token embeddings") + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape + # required mask seq length can be calculated via length 
of past + mask_seq_length = past_keys[0].shape[2] + seq_length if past_keys is not None else seq_length + + if use_cache is True: + if not self.is_decoder: + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") + + # initialize past_key_values with `None` if past does not exist + if not self.is_decoder: + past_keys = [None] * len(self.block) + past_values = [None] * len(self.block) + past_cross_keys = [None] * len(self.block) + past_cross_values = [None] * len(self.block) + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long + ) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) + present_key_states = () if use_cache else None + present_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None + position_bias = None + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) + # for i, layer_module in enumerate(self.block): + for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel + if self.model_parallel: + torch.cuda.set_device(hidden_states.device) + # Ensure that attention_mask is always on the same device as hidden_states + if attention_mask is not None: + attention_mask = attention_mask.to(hidden_states.device) + if position_bias is not None: + position_bias = position_bias.to(hidden_states.device) + if encoder_hidden_states is not None: + encoder_hidden_states = encoder_hidden_states.to(hidden_states.device) + if encoder_extended_attention_mask is not None: + encoder_extended_attention_mask = encoder_extended_attention_mask.to(hidden_states.device) + if encoder_decoder_position_bias is not None: + encoder_decoder_position_bias = encoder_decoder_position_bias.to(hidden_states.device) + if layer_head_mask is not None: + layer_head_mask = layer_head_mask.to(hidden_states.device) + if cross_attn_layer_head_mask is not None: 
+ cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(hidden_states.device) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.forward, + hidden_states, + extended_attention_mask, + position_bias, + encoder_hidden_states, + encoder_extended_attention_mask, + encoder_decoder_position_bias, + layer_head_mask, + cross_attn_layer_head_mask, + None, # past_key_value is always None with gradient checkpointing + use_cache, + output_attentions, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, + past_key=past_key, + past_value=past_value, + past_cross_key=past_cross_key, + past_cross_value=past_cross_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: + layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] + + hidden_states, present_key_state, present_value_state = layer_outputs[:3] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) + position_bias = layer_outputs[3] + if self.is_decoder and encoder_hidden_states is not None: + encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] + # append next layer key value states + if use_cache: + present_key_states = present_key_states + present_key_state + present_value_states = present_value_states + present_value_state + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) + if self.is_decoder: + all_cross_attentions = all_cross_attentions + (layer_outputs[5],) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and "cuda:" + str(k) != self.last_device: + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states).half() + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + present_key_value_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] + if v is not None + ) + if not self.is_decoder: + cross_keys = None + cross_values = None + if self.encodecrosskey: + cross_keys = self.encodecrosskey(hidden_states) + if self.encodecrossvalue: + cross_values = self.encodecrossvalue(hidden_states) + return tuple((hidden_states, cross_keys, cross_values)) + lm_logits = None + if self.is_decoder: + if self.config.tie_word_embeddings: + hidden_states = hidden_states * (self.model_dim ** -0.5) + lm_logits = self.lm_head(hidden_states) + return tuple((lm_logits, present_key_states, present_value_states)) + 
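For orientation, the decoder-style stack above returns `(lm_logits, present_key_states, present_value_states)` each step, while the encoder-style path returns `(hidden_states, cross_keys, cross_values)`. The sketch below shows how a greedy decoding loop might drive such a pair. It is a minimal illustration only: `encoder_fn`, `decoder_fn`, the start/EOS token ids and the cache layout are hypothetical placeholders, not the exported modules' actual calling convention.

```python
# Minimal greedy-decoding sketch (illustrative only): `encoder_fn` and `decoder_fn`
# stand in for the encoder/decoder stacks; their signatures and the cache layout
# are assumptions, not the actual exported interface.
import torch

def greedy_decode(encoder_fn, decoder_fn, input_ids,
                  start_id=0, eos_id=1, max_new_tokens=20):
    # Encoder pass: hidden states plus precomputed cross-attention key/value caches.
    enc_hidden, cross_keys, cross_values = encoder_fn(input_ids)
    enc_mask = torch.ones_like(input_ids)

    batch_size = input_ids.shape[0]
    dec_input = torch.full((batch_size, 1), start_id, dtype=torch.long)
    past_keys = past_values = None  # self-attention caches grow step by step
    generated = []

    for _ in range(max_new_tokens):
        logits, past_keys, past_values = decoder_fn(
            dec_input, enc_hidden, enc_mask,
            past_keys, past_values, cross_keys, cross_values,
        )
        next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)  # greedy pick
        generated.append(next_token)
        if (next_token == eos_id).all():
            break
        dec_input = next_token  # only the newest token is fed back; the cache holds the rest

    return torch.cat(generated, dim=-1)
```

Only the newest token is fed back each step because the self-attention caches already cover earlier positions, which mirrors the `past_key`/`past_value` handling in the attention modules above.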
+ +class T5Stack_Encoder(T5PreTrainedModel): + def __init__(self, config, embed_tokens=None, encodecrosskey=None, encodecrossvalue=None): + super().__init__(config) + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder + self.encodecrosskey = encodecrosskey + self.encodecrossvalue = encodecrossvalue + self.model_dim = config.d_model + + self.block = nn.ModuleList( + [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + ) + self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + # Initialize weights and apply final processing + self.post_init() + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **model_kwargs + ): + # Model parallel + use_cache = use_cache if use_cache is not None else self.config.use_cache + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape + # required mask seq length can be calculated via length of past + mask_seq_length = seq_length + + # initialize past_key_values with `None` if past does not exist + past_keys = [None] * len(self.block) + past_values = [None] * len(self.block) + past_cross_keys = [None] * len(self.block) + past_cross_values = [None] * len(self.block) + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
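+        # Note (added): `get_extended_attention_mask`, inherited from `PreTrainedModel`, is expected to
+        # broadcast the [batch_size, seq_length] mask to [batch_size, 1, 1, seq_length] and map 1/0
+        # entries to 0.0 / large negative values, so the result can be added directly to attention scores.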
+ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + + encoder_extended_attention_mask = None + + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) + present_key_states = () if use_cache else None + present_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None + position_bias = None + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) + for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=None, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, + past_key=past_key, + past_value=past_value, + past_cross_key=past_cross_key, + past_cross_value=past_cross_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: + layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] + + hidden_states, present_key_state, present_value_state = layer_outputs[:3] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) + position_bias = layer_outputs[3] + if self.is_decoder and encoder_hidden_states is not None: + encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] + # append next layer key value states + if use_cache: + present_key_states = present_key_states + present_key_state + present_value_states = present_value_states + present_value_state + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) + if self.is_decoder: + all_cross_attentions = all_cross_attentions + (layer_outputs[5],) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and "cuda:" + str(k) != self.last_device: + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states).half() + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + present_key_value_states, + all_hidden_states, + 
all_attentions, + all_cross_attentions, + ] + if v is not None + ) + if not self.is_decoder: + cross_keys = None + cross_values = None + if self.encodecrosskey: + cross_keys = self.encodecrosskey(hidden_states) + if self.encodecrossvalue: + cross_values = self.encodecrossvalue(hidden_states) + return tuple((hidden_states, cross_keys, cross_values)) + + -- Gitee From fdb814b194ac34636783c0e39dd8936b0959479d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 03:01:13 +0000 Subject: [PATCH 052/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../MindIE-Torch/built-in/T5/modeling_t5.py | 151 ++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py index 99fd48535a..c6e5d57c8f 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py @@ -1510,4 +1510,155 @@ class T5Stack_Encoder(T5PreTrainedModel): cross_values = self.encodecrossvalue(hidden_states) return tuple((hidden_states, cross_keys, cross_values)) +T5_START_DOCSTRING = r""" + + The T5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text + Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan + Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a + text-to-text denoising generative setting. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`T5Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +T5_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you + should be able to pad the inputs on both the right and the left. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for detail. + + [What are input IDs?](../glossary#input-ids) + + To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training). + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. 
+ + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` + is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). + + To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5 + Training](./t5#training). + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the encoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in + `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, `optional`: *attentions*) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at + the output of the last layer of the encoder. Used in the cross-attention of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be + input (see `past_key_values`). This is useful if you want more control over how to convert + `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. 
+ + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. + + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +T5_ENCODER_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you + should be able to pad the inputs on both the right and the left. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for detail. + + To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training). + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + -- Gitee From 40152886b808171031d0cf5ad934672fe4291198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 03:04:42 +0000 Subject: [PATCH 053/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/modeling_t5.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py index c6e5d57c8f..039b3da657 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py @@ -1661,4 +1661,10 @@ T5_ENCODER_INPUTS_DOCSTRING = r""" Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ - +# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +__HEAD_MASK_WARNING_MSG = """ +The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently, +`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions. +If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers, +num_heads)`. +""" -- Gitee From 1a20d26c2e392e5be33f855a70e88a0607ea65c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 03:05:37 +0000 Subject: [PATCH 054/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/modeling_t5.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py index 039b3da657..d422aef611 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py @@ -1668,3 +1668,8 @@ The input argument `head_mask` was split into two arguments `head_mask` and `dec If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers, num_heads)`. """ + +@add_start_docstrings( + "The bare T5 Model transformer outputting raw hidden-states without any specific head on top.", + T5_START_DOCSTRING, +) \ No newline at end of file -- Gitee From ed7e43928f0deaeb37b67c9b2bff0983b2b05d1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 09:14:56 +0000 Subject: [PATCH 055/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/modeling=5Ft5.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../MindIE-Torch/built-in/T5/modeling_t5.py | 1675 ----------------- 1 file changed, 1675 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_t5.py diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py deleted file mode 100644 index d422aef611..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.py +++ /dev/null @@ -1,1675 +0,0 @@ -# coding=utf-8 -# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch T5 model.""" - -import copy -import math -import os -import warnings -from typing import List, Optional, Tuple, Union -from dataclasses import dataclass -import torch -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -# import torch_npu -import mindietorch - - - - -from ...activations import ACT2FN -from ...modeling_outputs import ( - BaseModelOutput, - BaseModelOutputWithPastAndCrossAttentions, - Seq2SeqModelOutput, - Seq2SeqQuestionAnsweringModelOutput, - Seq2SeqSequenceClassifierOutput, - TokenClassifierOutput, -) -from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import ( - DUMMY_INPUTS, - DUMMY_MASK, - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_torch_fx_proxy, - logging, - replace_return_docstrings, -) -from ...utils.model_parallel_utils import assert_device_map, get_device_map -from .configuration_t5 import T5Config -from transformers.generation.logits_process import LogitsProcessorList -from transformers.generation.stopping_criteria import StoppingCriteriaList -from transformers.generation.configuration_utils import GenerationMode -from transformers.utils.generic import ModelOutput - - -@dataclass -class Seq2SeqLMOutput(ModelOutput): - """ - Base class for model's outputs, with potential hidden states and attentions. - - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ - loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - past_keys: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - past_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None - decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None - cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None - encoder_last_hidden_state: Optional[torch.FloatTensor] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "T5Config" -_CHECKPOINT_FOR_DOC = "google-t5/t5-small" - -#################################################### -# This dict contains ids and associated url -# for the pretrained weights provided with the models -#################################################### - - -#################################################### -# This is a conversion method from TF 1.0 to PyTorch -# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28 -#################################################### -def load_tf_weights_in_t5(model, config, tf_checkpoint_path): - """Load tf checkpoints in a pytorch model.""" - try: - import re - - import numpy as np - import tensorflow as tf - except ImportError: - logger.error( - "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions." - ) - raise - tf_path = os.path.abspath(tf_checkpoint_path) - logger.info(f"Converting TensorFlow checkpoint from {tf_path}") - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - tf_weights = {} - for name, shape in init_vars: - logger.info(f"Loading TF weight {name} with shape {shape}") - array = tf.train.load_variable(tf_path, name) - names.append(name) - tf_weights[name] = array - - for txt_name in names: - name = txt_name.split("/") - # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v - # which are not required for using pretrained model - if any( - n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] - for n in name - ): - logger.info(f"Skipping {'/'.join(name)}") - tf_weights.pop(txt_name, None) - continue - if "_slot_" in name[-1]: - logger.info(f"Skipping {'/'.join(name)}") - tf_weights.pop(txt_name, None) - continue - pointer = model - array = tf_weights[txt_name] - - for m_name in name: - if re.fullmatch(r"[A-Za-z]+_\d+", m_name): - scope_names = re.split(r"_(\d+)", m_name) - else: - scope_names = [m_name] - if scope_names[0] in ["kernel", "scale", "embedding"]: - pointer = getattr(pointer, "weight") - elif scope_names[0] == "self_attention": - pointer = getattr(pointer, "layer") - pointer = pointer[0] - elif scope_names[0] == "enc_dec_attention": - pointer = getattr(pointer, "layer") - pointer = pointer[1] - elif scope_names[0] == "dense_relu_dense": - pointer = getattr(pointer, "layer") - pointer = pointer[2] - elif scope_names[0] == "rms_norm": - if hasattr(pointer, "layer_norm"): - pointer = getattr(pointer, "layer_norm") - elif hasattr(pointer, "final_layer_norm"): - pointer = getattr(pointer, "final_layer_norm") - elif scope_names[0] == "scale": - pointer = getattr(pointer, "weight") - elif scope_names[0] == "output_bias" or 
scope_names[0] == "beta": - pointer = getattr(pointer, "bias") - elif scope_names[0] == "squad": - pointer = getattr(pointer, "classifier") - elif scope_names[0] == "decoder" and name[1] == "logits": - continue - elif scope_names[0] == "logits": - pointer = getattr(pointer, "lm_head") - elif scope_names[0] == "wi" and len(scope_names) > 1 and scope_names[1].isdigit(): - pointer = getattr(pointer, f"wi_{scope_names[1]}") - continue - else: - try: - pointer = getattr(pointer, scope_names[0]) - except AttributeError: - logger.info(f"Skipping {'/'.join(name)}") - continue - if len(scope_names) >= 2: - num = int(scope_names[1]) - pointer = pointer[num] - if scope_names[0] not in ["kernel", "scale", "embedding"]: - pointer = getattr(pointer, "weight") - if scope_names[0] != "embedding": - logger.info(f"Transposing numpy weight of shape {array.shape} for {name}") - array = np.transpose(array) - try: - if pointer.shape != array.shape: - raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") - except AssertionError as e: - e.args += (pointer.shape, array.shape) - raise - logger.info(f"Initialize PyTorch weight {name}") - pointer.data = torch.from_numpy(array.astype(np.float32)) - tf_weights.pop(txt_name, None) - - logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.") - return model - - -#################################################### -# PyTorch Models are constructed by sub-classing -# - torch.nn.Module for the layers and -# - PreTrainedModel for the models (it-self a sub-class of nn.Module) -#################################################### -PARALLELIZE_DOCSTRING = r""" - This is an experimental feature and is a subject to change at a moment's notice. - - Uses a device map to distribute attention modules of the model across several devices. If no device map is given, - it will evenly distribute blocks across all devices. - - Args: - device_map (`Dict[int, list]`, optional, defaults to None): - A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always - automatically mapped to the first device (for esoteric reasons). That means that the first device should - have fewer attention modules mapped to it than other devices. For reference, the t5 models have the - following number of attention modules: - - - google-t5/t5-small: 6 - - google-t5/t5-base: 12 - - google-t5/t5-large: 24 - - google-t5/t5-3b: 24 - - google-t5/t5-11b: 24 - - Example: - - ```python - # Here is an example of a device map on a machine with 4 GPUs using google-t5/t5-3b, which has a total of 24 attention modules: - model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-3b") - device_map = { - 0: [0, 1, 2], - 1: [3, 4, 5, 6, 7, 8, 9], - 2: [10, 11, 12, 13, 14, 15, 16], - 3: [17, 18, 19, 20, 21, 22, 23], - } - model.parallelize(device_map) - ``` -""" -DEPARALLELIZE_DOCSTRING = r""" - Moves the model to cpu from a model parallel state. 
- - Example: - - ```python - # On a 4 GPU machine with google-t5/t5-3b: - model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-3b") - device_map = { - 0: [0, 1, 2], - 1: [3, 4, 5, 6, 7, 8, 9], - 2: [10, 11, 12, 13, 14, 15, 16], - 3: [17, 18, 19, 20, 21, 22, 23], - } - model.parallelize(device_map) # Splits the model across several devices - model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache() - ``` -""" - - -class T5LayerNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - Construct a layernorm module in the T5 style. No bias and no subtraction of mean. - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean - # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated - # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for - # half-precision inputs is done in fp32 - - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states - - -try: - from apex.normalization import FusedRMSNorm - - T5LayerNorm = FusedRMSNorm # noqa - - logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm") -except ImportError: - # using the normal T5LayerNorm - pass -except Exception: - logger.warning("discovered apex but it failed to load, falling back to T5LayerNorm") - pass - -ALL_LAYERNORM_LAYERS.append(T5LayerNorm) - - -class T5DenseActDense(nn.Module): - def __init__(self, config: T5Config): - super().__init__() - self.wi = nn.Linear(config.d_model, config.d_ff, bias=False) - self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) - self.dropout = nn.Dropout(config.dropout_rate) - self.act = ACT2FN[config.dense_act_fn] - - def forward(self, hidden_states): - hidden_states = self.wi(hidden_states) - hidden_states = self.act(hidden_states) - hidden_states = self.dropout(hidden_states) - if ( - isinstance(self.wo.weight, torch.Tensor) - and hidden_states.dtype != self.wo.weight.dtype - and self.wo.weight.dtype != torch.int8 - ): - hidden_states = hidden_states.to(self.wo.weight.dtype) - hidden_states = self.wo(hidden_states) - return hidden_states - - -class T5DenseGatedActDense(nn.Module): - def __init__(self, config: T5Config): - super().__init__() - self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) - self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) - self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) - self.dropout = nn.Dropout(config.dropout_rate) - self.act = ACT2FN[config.dense_act_fn] - - def forward(self, hidden_states): - hidden_gelu = self.act(self.wi_0(hidden_states)) - hidden_linear = self.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear - hidden_states = self.dropout(hidden_states) - - # To make 8bit quantization work for google/flan-t5-xxl, self.wo is kept in float32. 
- # See https://github.com/huggingface/transformers/issues/20287 - # we also make sure the weights are not in `int8` in case users will force `_keep_in_fp32_modules` to be `None`` - if ( - isinstance(self.wo.weight, torch.Tensor) - and hidden_states.dtype != self.wo.weight.dtype - and self.wo.weight.dtype != torch.int8 - ): - hidden_states = hidden_states.to(self.wo.weight.dtype) - - hidden_states = self.wo(hidden_states) - return hidden_states - - -class T5LayerFF(nn.Module): - def __init__(self, config: T5Config): - super().__init__() - if config.is_gated_act: - self.DenseReluDense = T5DenseGatedActDense(config) - else: - self.DenseReluDense = T5DenseActDense(config) - - self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward(self, hidden_states): - forwarded_states = self.layer_norm(hidden_states) - forwarded_states = self.DenseReluDense(forwarded_states) - hidden_states = hidden_states + self.dropout(forwarded_states) - return hidden_states - - -class T5Attention(nn.Module): - def __init__(self, config: T5Config, has_relative_attention_bias=False): - super().__init__() - self.is_decoder = config.is_decoder - self.has_relative_attention_bias = has_relative_attention_bias - self.relative_attention_num_buckets = config.relative_attention_num_buckets - self.relative_attention_max_distance = config.relative_attention_max_distance - self.d_model = config.d_model - self.key_value_proj_dim = config.d_kv - self.n_heads = config.num_heads - self.dropout = config.dropout_rate - self.inner_dim = self.n_heads * self.key_value_proj_dim - - # Mesh TensorFlow initialization to avoid scaling before softmax - self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) - self.k = nn.Linear(self.d_model, self.inner_dim, bias=False) - self.v = nn.Linear(self.d_model, self.inner_dim, bias=False) - self.o = nn.Linear(self.inner_dim, self.d_model, bias=False) - - if self.has_relative_attention_bias: - self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) - self.pruned_heads = set() - self.gradient_checkpointing = False - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads - ) - # Prune linear layers - self.q = prune_linear_layer(self.q, index) - self.k = prune_linear_layer(self.k, index) - self.v = prune_linear_layer(self.v, index) - self.o = prune_linear_layer(self.o, index, dim=1) - # Update hyper params - self.n_heads = self.n_heads - len(heads) - self.inner_dim = self.key_value_proj_dim * self.n_heads - self.pruned_heads = self.pruned_heads.union(heads) - - @staticmethod - def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): - """ - Adapted from Mesh Tensorflow: - https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 - - Translate relative position to a bucket number for relative attention. The relative position is defined as - memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to - position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for - small absolute relative_position and larger buckets for larger absolute relative_positions. All relative - positions >=max_distance map to the same bucket. 
All relative positions <=-max_distance map to the same bucket. - This should allow for more graceful generalization to longer sequences than the model has been trained on - - Args: - relative_position: an int32 Tensor - bidirectional: a boolean - whether the attention is bidirectional - num_buckets: an integer - max_distance: an integer - - Returns: - a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) - """ - relative_buckets = 0 - if bidirectional: - num_buckets //= 2 - relative_buckets += (relative_position > 0).to(torch.long) * num_buckets - relative_position = torch.abs(relative_position) - else: - relative_position = -torch.min(relative_position, torch.zeros_like(relative_position)) - # now relative_position is in the range [0, inf) - - # half of the buckets are for exact increments in positions - max_exact = num_buckets // 2 - is_small = relative_position < max_exact - - # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance - relative_position_if_large = max_exact + ( - torch.log(relative_position.float() / max_exact) - / math.log(max_distance / max_exact) - * (num_buckets - max_exact) - ).to(torch.long) - relative_position_if_large = torch.min( - relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1) - ) - - relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) - return relative_buckets - - def compute_bias(self, query_length, key_length, device=None): - """Compute binned relative position bias""" - if device is None: - device = self.relative_attention_bias.weight.device - context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] - memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] - relative_position = memory_position - context_position # shape (query_length, key_length) - relative_position_bucket = self._relative_position_bucket( - relative_position, # shape (query_length, key_length) - bidirectional=(not self.is_decoder), - num_buckets=self.relative_attention_num_buckets, - max_distance=self.relative_attention_max_distance, - ) - values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads) - values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length) - return values - - def forward( - self, - hidden_states, - mask=None, - key_value_states=None, - position_bias=None, - past_key=None, - past_value=None, - past_cross_key=None, - past_cross_value=None, - layer_head_mask=None, - query_length=None, - use_cache=False, - output_attentions=False, - ): - """ - Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). 
- """ - # Input is (batch_size, seq_length, dim) - # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) - # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) - batch_size, seq_length = hidden_states.shape[:2] - - real_seq_length = seq_length - - if past_key is not None: - real_seq_length += past_key.shape[2] if query_length is None else query_length - - key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] - - def shape(states): - """projection""" - return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - - def unshape(states): - """reshape""" - return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) - - def project(hidden_states, proj_layer, key_value_states, past_key_value): - """projects hidden states correctly to key/query states""" - if key_value_states is None: - # self-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(hidden_states)) - elif past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) - - if past_key_value is not None: - past_key_value = shape(past_key_value) - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) - hidden_states = torch.cat([past_key_value, hidden_states], dim=2) - # elif past_key_value.shape[2] != key_value_states.shape[1]: - # # checking that the `sequence_length` of the `past_key_value` is the same as - # # the provided `key_value_states` to support prefix tuning - # # cross-attn - # # (batch_size, n_heads, seq_length, dim_per_head) - # hidden_states = shape(proj_layer(key_value_states)) - else: - # cross-attn - hidden_states = past_key_value - return hidden_states - - # get query states - query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) - - # get key/value states - key_states = project( - hidden_states, self.k, key_value_states, past_key if past_key is not None else None - ) - value_states = project( - hidden_states, self.v, key_value_states, past_value if past_value is not None else None - ) - # torch.ops.mindie.flash_attention_plugin(query_states, key_states, value_states,) - # compute scores - scores = torch.matmul( - query_states, key_states.transpose(3, 2) - ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 - if position_bias is None: - if not self.has_relative_attention_bias: - position_bias = torch.zeros( - (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype - ) - if self.gradient_checkpointing and self.training: - position_bias.requires_grad = True - else: - position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) - - # if key and values are already calculated - # we want only the last query position bias - if past_key is not None: - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] - - if mask is not None: - position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) - - if self.pruned_heads: - mask = torch.ones(position_bias.shape[1]) - mask[list(self.pruned_heads)] = 0 - position_bias_masked = position_bias[:, mask.bool()] - else: - position_bias_masked = position_bias - scores += position_bias_masked - attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( - scores - ) # (batch_size, n_heads, seq_length, 
key_length) - attn_weights = nn.functional.dropout( - attn_weights, p=self.dropout, training=self.training - ) # (batch_size, n_heads, seq_length, key_length) - - # Mask heads if we want to - if layer_head_mask is not None: - attn_weights = attn_weights * layer_head_mask - - attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) - attn_output = self.o(attn_output) - - # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None - present_value_state = (value_states.half(),) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) - - if output_attentions: - outputs = outputs + (attn_weights,) - return outputs - - -class T5SelfAttention(T5Attention): - def __init__(self, config: T5Config, has_relative_attention_bias=False): - super().__init__(config, has_relative_attention_bias) - - def forward( - self, - hidden_states, - mask=None, - position_bias=None, - past_key=None, - past_value=None, - layer_head_mask=None, - use_cache=False, - output_attentions=False, - ): - """ - Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). - """ - # Input is (batch_size, seq_length, dim) - # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) - # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) - batch_size, seq_length = hidden_states.shape[:2] - - real_seq_length = seq_length - - if past_key is not None: - real_seq_length += past_key.shape[2] - key_length = real_seq_length - def shape(states): - """projection""" - return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - - def unshape(states): - """reshape""" - return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) - - def project(hidden_states, proj_layer, past_key_value): - """projects hidden states correctly to key/query states""" - if past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(hidden_states)) - - if past_key_value is not None: - hidden_states = shape(proj_layer(hidden_states)) - hidden_states = torch.cat([past_key_value, hidden_states], dim=2) - return hidden_states - - # get query states - query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) - - # get key/value states - key_states = project( - hidden_states, self.k, past_key if past_key is not None else None - ) - value_states = project( - hidden_states, self.v, past_value if past_value is not None else None - ) - # compute scores - scores = torch.matmul( - query_states, key_states.transpose(3, 2) - ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 - if position_bias is None: - if not self.has_relative_attention_bias: - position_bias = torch.zeros( - (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype - ) - if self.gradient_checkpointing and self.training: - position_bias.requires_grad = True - else: - position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) - - # if key and values are already calculated - # we want only the last query position bias - if past_key is not None: - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] - if mask 
is not None: - position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) - - if self.pruned_heads: - mask = torch.ones(position_bias.shape[1]) - mask[list(self.pruned_heads)] = 0 - position_bias_masked = position_bias[:, mask.bool()] - else: - position_bias_masked = position_bias - scores += position_bias_masked - attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( - scores - ) # (batch_size, n_heads, seq_length, key_length) - attn_weights = nn.functional.dropout( - attn_weights, p=self.dropout, training=self.training - ) # (batch_size, n_heads, seq_length, key_length) - - # Mask heads if we want to - if layer_head_mask is not None: - attn_weights = attn_weights * layer_head_mask - - attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) - attn_output = self.o(attn_output) - - # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None - present_value_state = (value_states.half(), ) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) - if output_attentions: - outputs = outputs + (attn_weights,) - return outputs - - - - -class T5LayerSelfAttention(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() - self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward( - self, - hidden_states, - attention_mask=None, - position_bias=None, - layer_head_mask=None, - past_key=None, - past_value=None, - use_cache=False, - output_attentions=False, - ): - normed_hidden_states = self.layer_norm(hidden_states) - attention_output = self.SelfAttention( - normed_hidden_states, - mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key=past_key, - past_value=past_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = hidden_states + self.dropout(attention_output[0]) - outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them - return outputs - - -class T5LayerCrossAttention(nn.Module): - def __init__(self, config): - super().__init__() - self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False) - self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward( - self, - hidden_states, - key_value_states, - attention_mask=None, - position_bias=None, - layer_head_mask=None, - past_key=None, - past_value=None, - use_cache=False, - query_length=None, - output_attentions=False, - ): - normed_hidden_states = self.layer_norm(hidden_states) - attention_output = self.EncDecAttention( - normed_hidden_states, - mask=attention_mask, - key_value_states=key_value_states, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key=past_key, - past_value=past_value, - use_cache=use_cache, - query_length=query_length, - output_attentions=output_attentions, - ) - layer_output = hidden_states + self.dropout(attention_output[0]) - outputs = (layer_output,) + attention_output[1:] # add attentions if we output them - return outputs - - -class T5Block(nn.Module): - def __init__(self, config, 
has_relative_attention_bias=False): - super().__init__() - self.is_decoder = config.is_decoder - self.layer = nn.ModuleList() - self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) - if self.is_decoder: - self.layer.append(T5LayerCrossAttention(config)) - - self.layer.append(T5LayerFF(config)) - - def forward( - self, - hidden_states, - attention_mask=None, - position_bias=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - encoder_decoder_position_bias=None, - layer_head_mask=None, - cross_attn_layer_head_mask=None, - past_key=None, - past_value=None, - past_cross_key=None, - past_cross_value=None, - use_cache=False, - output_attentions=False, - return_dict=True, - ): - if past_key is not None: - self_attn_past_key = past_key - self_attn_past_value = past_value - cross_attn_past_key = past_cross_key - cross_attn_past_value = past_cross_value - else: - self_attn_past_key, self_attn_past_value, cross_attn_past_key, cross_attn_past_value = None, None, None, None - - self_attention_outputs = self.layer[0]( - hidden_states, - attention_mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key=self_attn_past_key, - past_value=self_attn_past_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states, present_key_state, present_value_state = self_attention_outputs[:3] - attention_outputs = self_attention_outputs[3:] # Keep self-attention outputs and relative position weights - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16: - clamp_value = torch.where( - torch.isinf(hidden_states).any(), - torch.finfo(hidden_states.dtype).max - 1000, - torch.finfo(hidden_states.dtype).max, - ) - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - do_cross_attention = self.is_decoder and encoder_hidden_states is not None - if do_cross_attention: - - # the actual query length is unknown for cross attention - # if using past key value states. 
Need to inject it here - if present_key_state is not None: - query_length = present_key_state[0].shape[2] - else: - query_length = None - cross_attention_outputs = self.layer[1]( - hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - position_bias=encoder_decoder_position_bias, - layer_head_mask=cross_attn_layer_head_mask, - past_key=cross_attn_past_key, - past_value=cross_attn_past_value, - query_length=query_length, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = cross_attention_outputs[0] - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16: - clamp_value = torch.where( - torch.isinf(hidden_states).any(), - torch.finfo(hidden_states.dtype).max - 1000, - torch.finfo(hidden_states.dtype).max, - ) - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - # Combine self attn and cross attn key value states - # cross_attn_past_key_values = cross_attention_outputs[1] - # Keep cross-attention outputs and relative position weights - attention_outputs = attention_outputs + cross_attention_outputs[3:] - - # Apply Feed Forward layer - hidden_states = self.layer[-1](hidden_states) - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16: - clamp_value = torch.where( - torch.isinf(hidden_states).any(), - torch.finfo(hidden_states.dtype).max - 1000, - torch.finfo(hidden_states.dtype).max, - ) - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - outputs = (hidden_states,) - - if use_cache: - outputs = outputs + (present_key_state,) +(present_value_state,)+ attention_outputs - else: - outputs = outputs + attention_outputs - - return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - - -class T5ClassificationHead(nn.Module): - """Head for sentence-level classification tasks.""" - - def __init__(self, config: T5Config): - super().__init__() - self.dense = nn.Linear(config.d_model, config.d_model) - self.dropout = nn.Dropout(p=config.classifier_dropout) - self.out_proj = nn.Linear(config.d_model, config.num_labels) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dropout(hidden_states) - hidden_states = self.dense(hidden_states) - hidden_states = torch.tanh(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.out_proj(hidden_states) - return hidden_states - - -class T5PreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - - config_class = T5Config - load_tf_weights = load_tf_weights_in_t5 - base_model_prefix = "transformer" - is_parallelizable = True - supports_gradient_checkpointing = True - _no_split_modules = ["T5Block"] - _keep_in_fp32_modules = ["wo"] - - @property - def dummy_inputs(self): - input_ids = torch.tensor(DUMMY_INPUTS) - input_mask = torch.tensor(DUMMY_MASK) - dummy_inputs = { - "decoder_input_ids": input_ids, - "input_ids": input_ids, - "decoder_attention_mask": input_mask, - } - return dummy_inputs - - def _init_weights(self, module): - """Initialize the weights""" - factor = self.config.initializer_factor # Used for testing weights initialization - if isinstance(module, T5LayerNorm): - module.weight.data.fill_(factor * 1.0) - elif isinstance( - module, - (T5Model, T5ForConditionalGeneration, T5EncoderModel, T5ForQuestionAnswering), - ): - # Mesh TensorFlow embeddings initialization - # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 - module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) - if hasattr(module, "lm_head") and not self.config.tie_word_embeddings: - module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0) - if hasattr(module, "qa_outputs"): - module.qa_outputs.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) - module.qa_outputs.bias.data.zero_() - elif isinstance(module, T5ForTokenClassification): - if hasattr(module, "classifier"): - module.classifier.weight.data.normal_(mean=0.0, std=factor * 1.0) - module.classifier.bias.data.zero_() - elif isinstance(module, T5ClassificationHead): - module.dense.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) - if hasattr(module.dense, "bias") and module.dense.bias is not None: - module.dense.bias.data.zero_() - module.out_proj.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) - if hasattr(module.out_proj, "bias") and module.out_proj.bias is not None: - module.out_proj.bias.data.zero_() - elif isinstance(module, T5DenseActDense): - # Mesh TensorFlow FF initialization - # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 - # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 - module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) - if hasattr(module.wi, "bias") and module.wi.bias is not None: - module.wi.bias.data.zero_() - module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) - if hasattr(module.wo, "bias") and module.wo.bias is not None: - module.wo.bias.data.zero_() - elif isinstance(module, T5DenseGatedActDense): - module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) - if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None: - module.wi_0.bias.data.zero_() - module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) - if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None: - module.wi_1.bias.data.zero_() - module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) - if hasattr(module.wo, "bias") and module.wo.bias is not None: - module.wo.bias.data.zero_() - elif isinstance(module, T5Attention): - # Mesh TensorFlow attention initialization to avoid scaling before softmax - # See 
https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 - d_model = self.config.d_model - key_value_proj_dim = self.config.d_kv - n_heads = self.config.num_heads - module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5)) - module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) - module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) - module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5)) - if module.has_relative_attention_bias: - module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5)) - - def _shift_right(self, input_ids): - decoder_start_token_id = self.config.decoder_start_token_id - pad_token_id = self.config.pad_token_id - - if decoder_start_token_id is None: - raise ValueError( - "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. " - "See T5 docs for more information." - ) - - # shift inputs to the right - if is_torch_fx_proxy(input_ids): - # Item assignment is not supported natively for proxies. - shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id) - shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) - else: - shifted_input_ids = input_ids.new_zeros(input_ids.shape) - shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() - shifted_input_ids[..., 0] = decoder_start_token_id - - if pad_token_id is None: - raise ValueError("self.model.config.pad_token_id has to be defined.") - # replace possible -100 values in labels by `pad_token_id` - shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) - - return shifted_input_ids - -class T5Stack(T5PreTrainedModel): - def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskey=None, encodecrossvalue=None): - super().__init__(config) - - self.embed_tokens = embed_tokens - self.is_decoder = config.is_decoder - self.lm_head=lm_head - self.encodecrosskey = encodecrosskey - self.encodecrossvalue = encodecrossvalue - self.model_dim = config.d_model - - self.block = nn.ModuleList( - [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] - ) - self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - - # Initialize weights and apply final processing - self.post_init() - # Model parallel - self.model_parallel = False - self.device_map = None - self.gradient_checkpointing = False - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - warnings.warn( - "`T5Stack.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model" - " with `device_map='balanced'` in the call to `from_pretrained`. 
You can also provide your own" - " `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0," - " 'block.1': 1, ...}", - FutureWarning, - ) - # Check validity of device_map - self.device_map = ( - get_device_map(len(self.block), range(torch.cuda.device_count())) if device_map is None else device_map - ) - assert_device_map(self.device_map, len(self.block)) - self.model_parallel = True - self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys())) - self.last_device = "cuda:" + str(max(self.device_map.keys())) - # Load onto devices - for k, v in self.device_map.items(): - for layer in v: - cuda_device = "cuda:" + str(k) - self.block[layer] = self.block[layer].to(cuda_device) - - # Set embed_tokens to first layer - self.embed_tokens = self.embed_tokens.to(self.first_device) - # Set final layer norm to last device - self.final_layer_norm = self.final_layer_norm.to(self.last_device) - - @add_start_docstrings(DEPARALLELIZE_DOCSTRING) - def deparallelize(self): - warnings.warn( - "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", - FutureWarning, - ) - self.model_parallel = False - self.device_map = None - self.first_device = "cpu" - self.last_device = "cpu" - for i in range(len(self.block)): - self.block[i] = self.block[i].to("cpu") - self.embed_tokens = self.embed_tokens.to("cpu") - self.final_layer_norm = self.final_layer_norm.to("cpu") - torch.cuda.empty_cache() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, new_embeddings): - self.embed_tokens = new_embeddings - - def forward( - self, - input_ids=None, - encoder_hidden_states=None, - past_keys=None, - past_values=None, - past_cross_keys=None, - past_cross_values=None, - encoder_attention_mask=None, - attention_mask=None, - inputs_embeds=None, - head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **model_kwargs - ): - # Model parallel - if self.model_parallel: - torch.cuda.set_device(self.first_device) - self.embed_tokens = self.embed_tokens.to(self.first_device) - use_cache = use_cache if use_cache is not None else self.config.use_cache - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - err_msg_prefix = "decoder_" if self.is_decoder else "" - raise ValueError( - f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" - ) - elif input_ids is not None: - - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - err_msg_prefix = "decoder_" if self.is_decoder else "" - raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") - - if inputs_embeds is None: - if self.embed_tokens is None: - raise ValueError("You have to initialize the model with valid token embeddings") - inputs_embeds = self.embed_tokens(input_ids) - - batch_size, seq_length = input_shape - # required mask seq length can be calculated via length 
of past - mask_seq_length = past_keys[0].shape[2] + seq_length if past_keys is not None else seq_length - - if use_cache is True: - if not self.is_decoder: - raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") - - # initialize past_key_values with `None` if past does not exist - if not self.is_decoder: - past_keys = [None] * len(self.block) - past_values = [None] * len(self.block) - past_cross_keys = [None] * len(self.block) - past_cross_values = [None] * len(self.block) - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones( - encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long - ) - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - # Prepare head mask if needed - head_mask = self.get_head_mask(head_mask, self.config.num_layers) - cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) - present_key_states = () if use_cache else None - present_value_states = () if use_cache else None - all_hidden_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - all_cross_attentions = () if (output_attentions and self.is_decoder) else None - position_bias = None - encoder_decoder_position_bias = None - - hidden_states = self.dropout(inputs_embeds) - # for i, layer_module in enumerate(self.block): - for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): - layer_head_mask = head_mask[i] - cross_attn_layer_head_mask = cross_attn_head_mask[i] - # Model parallel - if self.model_parallel: - torch.cuda.set_device(hidden_states.device) - # Ensure that attention_mask is always on the same device as hidden_states - if attention_mask is not None: - attention_mask = attention_mask.to(hidden_states.device) - if position_bias is not None: - position_bias = position_bias.to(hidden_states.device) - if encoder_hidden_states is not None: - encoder_hidden_states = encoder_hidden_states.to(hidden_states.device) - if encoder_extended_attention_mask is not None: - encoder_extended_attention_mask = encoder_extended_attention_mask.to(hidden_states.device) - if encoder_decoder_position_bias is not None: - encoder_decoder_position_bias = encoder_decoder_position_bias.to(hidden_states.device) - if layer_head_mask is not None: - layer_head_mask = layer_head_mask.to(hidden_states.device) - if cross_attn_layer_head_mask is not None: 
- cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(hidden_states.device) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - layer_module.forward, - hidden_states, - extended_attention_mask, - position_bias, - encoder_hidden_states, - encoder_extended_attention_mask, - encoder_decoder_position_bias, - layer_head_mask, - cross_attn_layer_head_mask, - None, # past_key_value is always None with gradient checkpointing - use_cache, - output_attentions, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask=extended_attention_mask, - position_bias=position_bias, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - encoder_decoder_position_bias=encoder_decoder_position_bias, - layer_head_mask=layer_head_mask, - cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key=past_key, - past_value=past_value, - past_cross_key=past_cross_key, - past_cross_value=past_cross_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - - # layer_outputs is a tuple with: - # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - if use_cache is False: - layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] - - hidden_states, present_key_state, present_value_state = layer_outputs[:3] - - # We share the position biases between the layers - the first layer store them - # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), - # (cross-attention position bias), (cross-attention weights) - position_bias = layer_outputs[3] - if self.is_decoder and encoder_hidden_states is not None: - encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] - # append next layer key value states - if use_cache: - present_key_states = present_key_states + present_key_state - present_value_states = present_value_states + present_value_state - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[3],) - if self.is_decoder: - all_cross_attentions = all_cross_attentions + (layer_outputs[5],) - - # Model Parallel: If it's the last layer for that device, put things on the next device - if self.model_parallel: - for k, v in self.device_map.items(): - if i == v[-1] and "cuda:" + str(k) != self.last_device: - hidden_states = hidden_states.to("cuda:" + str(k + 1)) - - hidden_states = self.final_layer_norm(hidden_states) - hidden_states = self.dropout(hidden_states).half() - - # Add last layer - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - present_key_value_states, - all_hidden_states, - all_attentions, - all_cross_attentions, - ] - if v is not None - ) - if not self.is_decoder: - cross_keys = None - cross_values = None - if self.encodecrosskey: - cross_keys = self.encodecrosskey(hidden_states) - if self.encodecrossvalue: - cross_values = self.encodecrossvalue(hidden_states) - return tuple((hidden_states, cross_keys, cross_values)) - lm_logits = None - if self.is_decoder: - if self.config.tie_word_embeddings: - hidden_states = hidden_states * (self.model_dim ** -0.5) - lm_logits = self.lm_head(hidden_states) - return tuple((lm_logits, present_key_states, present_value_states)) - 
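The modified attention above drops the stock paired `past_key_value` tuples in favour of separate `past_key`/`past_value` tensors, which `project()` brings into a common `(batch, n_heads, seq, d_kv)` layout via `shape()` and then extends along the sequence axis. The following is a minimal, torch-only sketch of that cache-growth convention for the self-attention path; the batch size, head count, and head dimension are illustrative assumptions, not values taken from this patch.

```python
import torch

# Illustrative sizes only (assumptions, not read from the patch).
batch, n_heads, d_kv = 1, 8, 64
past_len, new_len = 5, 1

# Self-attention keys cached from earlier decoding steps, shown in the
# (batch, n_heads, seq, d_kv) layout that shape() produces.
past_key = torch.randn(batch, n_heads, past_len, d_kv)

# Key projected for the current step (one new token), same layout.
new_key = torch.randn(batch, n_heads, new_len, d_kv)

# Inside project(), once both tensors are in this layout the cache is
# extended along dim=2, so the effective key length grows by one
# position per generated token.
key_states = torch.cat([past_key, new_key], dim=2)
assert key_states.shape == (batch, n_heads, past_len + new_len, d_kv)
```

The value cache follows the same convention, which is consistent with the split `past_keys`/`past_values` arguments of `T5Stack.forward` above.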
- -class T5Stack_Encoder(T5PreTrainedModel): - def __init__(self, config, embed_tokens=None, encodecrosskey=None, encodecrossvalue=None): - super().__init__(config) - self.embed_tokens = embed_tokens - self.is_decoder = config.is_decoder - self.encodecrosskey = encodecrosskey - self.encodecrossvalue = encodecrossvalue - self.model_dim = config.d_model - - self.block = nn.ModuleList( - [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] - ) - self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - - # Initialize weights and apply final processing - self.post_init() - # Model parallel - self.model_parallel = False - self.device_map = None - self.gradient_checkpointing = False - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, new_embeddings): - self.embed_tokens = new_embeddings - - def forward( - self, - input_ids=None, - attention_mask=None, - head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **model_kwargs - ): - # Model parallel - use_cache = use_cache if use_cache is not None else self.config.use_cache - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - - inputs_embeds = self.embed_tokens(input_ids) - - batch_size, seq_length = input_shape - # required mask seq length can be calculated via length of past - mask_seq_length = seq_length - - # initialize past_key_values with `None` if past does not exist - past_keys = [None] * len(self.block) - past_values = [None] * len(self.block) - past_cross_keys = [None] * len(self.block) - past_cross_values = [None] * len(self.block) - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - - encoder_extended_attention_mask = None - - # Prepare head mask if needed - head_mask = self.get_head_mask(head_mask, self.config.num_layers) - cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) - present_key_states = () if use_cache else None - present_value_states = () if use_cache else None - all_hidden_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - all_cross_attentions = () if (output_attentions and self.is_decoder) else None - position_bias = None - encoder_decoder_position_bias = None - - hidden_states = self.dropout(inputs_embeds) - for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): - layer_head_mask = head_mask[i] - cross_attn_layer_head_mask = cross_attn_head_mask[i] - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer_outputs = layer_module( - hidden_states, - attention_mask=extended_attention_mask, - position_bias=position_bias, - encoder_hidden_states=None, - encoder_attention_mask=encoder_extended_attention_mask, - encoder_decoder_position_bias=encoder_decoder_position_bias, - layer_head_mask=layer_head_mask, - cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key=past_key, - past_value=past_value, - past_cross_key=past_cross_key, - past_cross_value=past_cross_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - - # layer_outputs is a tuple with: - # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - if use_cache is False: - layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] - - hidden_states, present_key_state, present_value_state = layer_outputs[:3] - - # We share the position biases between the layers - the first layer store them - # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), - # (cross-attention position bias), (cross-attention weights) - position_bias = layer_outputs[3] - if self.is_decoder and encoder_hidden_states is not None: - encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] - # append next layer key value states - if use_cache: - present_key_states = present_key_states + present_key_state - present_value_states = present_value_states + present_value_state - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[3],) - if self.is_decoder: - all_cross_attentions = all_cross_attentions + (layer_outputs[5],) - - # Model Parallel: If it's the last layer for that device, put things on the next device - if self.model_parallel: - for k, v in self.device_map.items(): - if i == v[-1] and "cuda:" + str(k) != self.last_device: - hidden_states = hidden_states.to("cuda:" + str(k + 1)) - - hidden_states = self.final_layer_norm(hidden_states) - hidden_states = self.dropout(hidden_states).half() - - # Add last layer - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - present_key_value_states, - all_hidden_states, - 
all_attentions, - all_cross_attentions, - ] - if v is not None - ) - if not self.is_decoder: - cross_keys = None - cross_values = None - if self.encodecrosskey: - cross_keys = self.encodecrosskey(hidden_states) - if self.encodecrossvalue: - cross_values = self.encodecrossvalue(hidden_states) - return tuple((hidden_states, cross_keys, cross_values)) - -T5_START_DOCSTRING = r""" - - The T5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text - Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan - Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a - text-to-text denoising generative setting. - - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`T5Config`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -T5_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you - should be able to pad the inputs on both the right and the left. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for detail. - - [What are input IDs?](../glossary#input-ids) - - To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training). - attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): - Indices of decoder input sequence tokens in the vocabulary. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are decoder input IDs?](../glossary#decoder-input-ids) - - T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` - is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). - - To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5 - Training](./t5#training). - decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): - Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also - be used by default. 
- head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules in the encoder. Mask values selected in `[0, - 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0, - 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in - `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): - Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, `optional`: *attentions*) - `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at - the output of the last layer of the encoder. Used in the cross-attention of the decoder. - past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded - representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be - input (see `past_key_values`). This is useful if you want more control over how to convert - `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. - - If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value - of `inputs_embeds`. - - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
-""" - -T5_ENCODER_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you - should be able to pad the inputs on both the right and the left. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for detail. - - To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training). - attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - -# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask -__HEAD_MASK_WARNING_MSG = """ -The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently, -`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions. -If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers, -num_heads)`. 
-""" - -@add_start_docstrings( - "The bare T5 Model transformer outputting raw hidden-states without any specific head on top.", - T5_START_DOCSTRING, -) \ No newline at end of file -- Gitee From 308cf8e9198cb11b24a3dc166a0dc706fcb4f3f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 09:15:11 +0000 Subject: [PATCH 056/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/utils.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/T5/utils.patch | 108 -------------------- 1 file changed, 108 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/utils.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/utils.patch b/MindIE/MindIE-Torch/built-in/T5/utils.patch deleted file mode 100644 index 4968e30c2b..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/utils.patch +++ /dev/null @@ -1,108 +0,0 @@ ---- /usr/local/python3.10.2/lib/python3.10/site-packages/transformers/generation/utils.py 2024-09-04 17:07:15.776000000 +0800 -+++ utils.py 2024-09-04 19:05:05.300000000 +0800 -@@ -507,10 +507,7 @@ class GenerationMixin: - generation_config: GenerationConfig, - ) -> Dict[str, Any]: - # 1. get encoder -- if self.encoder_mindie: -- encoder = self.encoder_mindie -- else: -- encoder = self.get_encoder() -+ encoder = self.get_encoder() - # Compatibility with Accelerate big model inference: we need the encoder to outputs stuff on the same device - # as the inputs. - if hasattr(self, "hf_device_map"): -@@ -526,12 +523,12 @@ class GenerationMixin: - for argument, value in model_kwargs.items() - if not any(argument.startswith(p) for p in irrelevant_prefix) - } -- # encoder_signature = set(inspect.signature(encoder.forward).parameters) -- # encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature -- # if not encoder_accepts_wildcard: -- # encoder_kwargs = { -- # argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature -- # } -+ encoder_signature = set(inspect.signature(encoder.forward).parameters) -+ encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature -+ if not encoder_accepts_wildcard: -+ encoder_kwargs = { -+ argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature -+ } - encoder_kwargs["output_attentions"] = generation_config.output_attentions - encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states - -@@ -539,13 +536,8 @@ class GenerationMixin: - model_input_name = model_input_name if model_input_name is not None else self.main_input_name - encoder_kwargs["return_dict"] = True - encoder_kwargs[model_input_name] = inputs_tensor -- if self.encoder_mindie: -- with torch.npu.stream(self.stream): # set stream -- encoder_outputs=encoder.forward(encoder_kwargs["input_ids"]) -- self.stream.synchronize() # synchronize -- else: -- encoder_outputs = encoder(**encoder_kwargs) -- model_kwargs["encoder_outputs"]: ModelOutput = {"last_hidden_state":encoder_outputs[0], "past_cross_key_values":encoder_outputs[1]} -+ model_kwargs["encoder_outputs"]: ModelOutput = encoder(**encoder_kwargs) -+ - return model_kwargs - - def _prepare_decoder_input_ids_for_generation( -@@ -670,9 +662,6 @@ class GenerationMixin: - outputs, standardize_cache_format=standardize_cache_format - ) - model_kwargs[cache_name] = cache -- if "past_cross_key_values" in outputs: -- past_cross_key_values = 
outputs.past_cross_key_values -- model_kwargs["past_cross_key_values"] = past_cross_key_values - if getattr(outputs, "state", None) is not None: - model_kwargs["state"] = outputs.state - -@@ -1804,16 +1793,16 @@ class GenerationMixin: - "`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1." - ) - -- # if self.device.type != input_ids.device.type: -- # warnings.warn( -- # "You are calling .generate() with the `input_ids` being on a device type different" -- # f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model" -- # f" is on {self.device.type}. You may experience unexpected behaviors or slower generation." -- # " Please make sure that you have put `input_ids` to the" -- # f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before" -- # " running `.generate()`.", -- # UserWarning, -- # ) -+ if self.device.type != input_ids.device.type: -+ warnings.warn( -+ "You are calling .generate() with the `input_ids` being on a device type different" -+ f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model" -+ f" is on {self.device.type}. You may experience unexpected behaviors or slower generation." -+ " Please make sure that you have put `input_ids` to the" -+ f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before" -+ " running `.generate()`.", -+ UserWarning, -+ ) - - # 8. prepare distribution pre_processing samplers - prepared_logits_processor = self._get_logits_processor( -@@ -2647,20 +2636,15 @@ class GenerationMixin: - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) -- - - # keep track of which sequences are already finished - batch_size = input_ids.shape[0] - this_peer_finished = False - unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) - model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) -- num_layers = self.config.num_layers -- num_heads = self.config.num_heads -- d_kv = self.config.d_kv -- model_kwargs["past_key_values"] = torch.randn(num_layers, 2, batch_size, num_heads, 0, d_kv).half().npu() -+ - while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): - # prepare model inputs -- - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - # forward pass to get next token -- Gitee From b4c1077337169f9e517a20431a2410f2822124c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 09:18:27 +0000 Subject: [PATCH 057/110] add MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_t5.patch | 1641 +++++++++++++++++ 1 file changed, 1641 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch new file mode 100644 index 0000000000..95d0455bf5 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch @@ -0,0 +1,1641 @@ +diff --git a/modeling_t5.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py +index 224769fdf..6af548437 100644 +--- a/modeling_t5.py ++++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py +@@ -19,22 +19,26 @@ import math + import os + import warnings + from typing import List, Optional, Tuple, Union +- ++from dataclasses import dataclass + import torch + from torch import nn + from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss ++# import torch_npu ++import mindietorch ++ ++ ++ + + from ...activations import ACT2FN + from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, +- Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, + TokenClassifierOutput, + ) +-from ...modeling_utils import PreTrainedModel ++from ...modeling_utils import PreTrainedModel,ModuleUtilsMixin + from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer + from ...utils import ( + DUMMY_INPUTS, +@@ -47,8 +51,44 @@ from ...utils import ( + ) + from ...utils.model_parallel_utils import assert_device_map, get_device_map + from .configuration_t5 import T5Config ++from transformers.generation.logits_process import LogitsProcessorList ++from transformers.generation.stopping_criteria import StoppingCriteriaList ++from transformers.generation.configuration_utils import GenerationMode ++from transformers.utils.generic import ModelOutput + + ++@dataclass ++class Seq2SeqLMOutput(ModelOutput): ++ """ ++ Base class for model's outputs, with potential hidden states and attentions. ++ ++ Args: ++ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): ++ Sequence of hidden-states at the output of the last layer of the model. ++ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): ++ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + ++ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. ++ ++ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. ++ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): ++ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, ++ sequence_length)`. ++ ++ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention ++ heads. 
++ """ ++ loss: Optional[torch.FloatTensor] = None ++ logits: torch.FloatTensor = None ++ past_keys: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ past_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None ++ decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None ++ cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None ++ encoder_last_hidden_state: Optional[torch.FloatTensor] = None ++ encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None ++ encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None ++ + logger = logging.get_logger(__name__) + + _CONFIG_FOR_DOC = "T5Config" +@@ -448,7 +488,10 @@ class T5Attention(nn.Module): + mask=None, + key_value_states=None, + position_bias=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, ++ past_cross_key=None, ++ past_cross_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, +@@ -464,12 +507,8 @@ class T5Attention(nn.Module): + + real_seq_length = seq_length + +- if past_key_value is not None: +- if len(past_key_value) != 2: +- raise ValueError( +- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" +- ) +- real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length ++ if past_key is not None: ++ real_seq_length += past_key.shape[2] if query_length is None else query_length + + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] + +@@ -493,16 +532,17 @@ class T5Attention(nn.Module): + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: ++ past_key_value = shape(past_key_value) + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) + hidden_states = torch.cat([past_key_value, hidden_states], dim=2) +- elif past_key_value.shape[2] != key_value_states.shape[1]: +- # checking that the `sequence_length` of the `past_key_value` is the same as +- # the provided `key_value_states` to support prefix tuning +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(key_value_states)) ++ # elif past_key_value.shape[2] != key_value_states.shape[1]: ++ # # checking that the `sequence_length` of the `past_key_value` is the same as ++ # # the provided `key_value_states` to support prefix tuning ++ # # cross-attn ++ # # (batch_size, n_heads, seq_length, dim_per_head) ++ # hidden_states = shape(proj_layer(key_value_states)) + else: + # cross-attn + hidden_states = past_key_value +@@ -513,17 +553,16 @@ class T5Attention(nn.Module): + + # get key/value states + key_states = project( +- hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None ++ hidden_states, self.k, key_value_states, past_key if past_key is not None else None + ) + value_states = project( +- hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None ++ hidden_states, self.v, key_value_states, past_value if past_value is not None else None + ) +- ++ # torch.ops.mindie.flash_attention_plugin(query_states, key_states, value_states,) + # compute scores + scores = torch.matmul( + query_states, key_states.transpose(3, 2) + ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 +- + if position_bias is None: + if 
not self.has_relative_attention_bias: + position_bias = torch.zeros( +@@ -536,7 +575,7 @@ class T5Attention(nn.Module): + + # if key and values are already calculated + # we want only the last query position bias +- if past_key_value is not None: ++ if past_key is not None: + position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + + if mask is not None: +@@ -548,7 +587,6 @@ class T5Attention(nn.Module): + position_bias_masked = position_bias[:, mask.bool()] + else: + position_bias_masked = position_bias +- + scores += position_bias_masked + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( + scores +@@ -564,18 +602,131 @@ class T5Attention(nn.Module): + attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) + +- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None +- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) ++ # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None ++ present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None ++ present_value_state = (value_states.half(),) if (self.is_decoder and use_cache) else None ++ outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) ++ ++ if output_attentions: ++ outputs = outputs + (attn_weights,) ++ return outputs ++ ++ ++class T5SelfAttention(T5Attention): ++ def __init__(self, config: T5Config, has_relative_attention_bias=False): ++ super().__init__(config, has_relative_attention_bias) ++ ++ def forward( ++ self, ++ hidden_states, ++ mask=None, ++ position_bias=None, ++ past_key=None, ++ past_value=None, ++ layer_head_mask=None, ++ use_cache=False, ++ output_attentions=False, ++ ): ++ """ ++ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). 
++ """ ++ # Input is (batch_size, seq_length, dim) ++ # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) ++ # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) ++ batch_size, seq_length = hidden_states.shape[:2] ++ ++ real_seq_length = seq_length ++ ++ if past_key is not None: ++ real_seq_length += past_key.shape[2] ++ key_length = real_seq_length ++ def shape(states): ++ """projection""" ++ return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) ++ ++ def unshape(states): ++ """reshape""" ++ return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) ++ ++ def project(hidden_states, proj_layer, past_key_value): ++ """projects hidden states correctly to key/query states""" ++ if past_key_value is None: ++ # cross-attn ++ # (batch_size, n_heads, seq_length, dim_per_head) ++ hidden_states = shape(proj_layer(hidden_states)) ++ ++ if past_key_value is not None: ++ hidden_states = shape(proj_layer(hidden_states)) ++ hidden_states = torch.cat([past_key_value, hidden_states], dim=2) ++ return hidden_states ++ ++ # get query states ++ query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) ++ ++ # get key/value states ++ key_states = project( ++ hidden_states, self.k, past_key if past_key is not None else None ++ ) ++ value_states = project( ++ hidden_states, self.v, past_value if past_value is not None else None ++ ) ++ # compute scores ++ scores = torch.matmul( ++ query_states, key_states.transpose(3, 2) ++ ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 ++ if position_bias is None: ++ if not self.has_relative_attention_bias: ++ position_bias = torch.zeros( ++ (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype ++ ) ++ if self.gradient_checkpointing and self.training: ++ position_bias.requires_grad = True ++ else: ++ position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) ++ ++ # if key and values are already calculated ++ # we want only the last query position bias ++ if past_key is not None: ++ position_bias = position_bias[:, :, -hidden_states.size(1) :, :] ++ if mask is not None: ++ position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) ++ ++ if self.pruned_heads: ++ mask = torch.ones(position_bias.shape[1]) ++ mask[list(self.pruned_heads)] = 0 ++ position_bias_masked = position_bias[:, mask.bool()] ++ else: ++ position_bias_masked = position_bias ++ scores += position_bias_masked ++ attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( ++ scores ++ ) # (batch_size, n_heads, seq_length, key_length) ++ attn_weights = nn.functional.dropout( ++ attn_weights, p=self.dropout, training=self.training ++ ) # (batch_size, n_heads, seq_length, key_length) ++ ++ # Mask heads if we want to ++ if layer_head_mask is not None: ++ attn_weights = attn_weights * layer_head_mask ++ ++ attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) ++ attn_output = self.o(attn_output) + ++ # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None ++ present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None ++ present_value_state = (value_states.half(), ) if (self.is_decoder and use_cache) else None ++ outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) + if 
output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + ++ ++ + class T5LayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() +- self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) ++ self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + +@@ -585,7 +736,8 @@ class T5LayerSelfAttention(nn.Module): + attention_mask=None, + position_bias=None, + layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, + use_cache=False, + output_attentions=False, + ): +@@ -595,7 +747,8 @@ class T5LayerSelfAttention(nn.Module): + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -618,7 +771,8 @@ class T5LayerCrossAttention(nn.Module): + attention_mask=None, + position_bias=None, + layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, + use_cache=False, + query_length=None, + output_attentions=False, +@@ -630,7 +784,8 @@ class T5LayerCrossAttention(nn.Module): + key_value_states=key_value_states, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, + use_cache=use_cache, + query_length=query_length, + output_attentions=output_attentions, +@@ -661,39 +816,34 @@ class T5Block(nn.Module): + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, ++ past_cross_key=None, ++ past_cross_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): +- if past_key_value is not None: +- if not self.is_decoder: +- logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") +- expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 +- +- if len(past_key_value) != expected_num_past_key_values: +- raise ValueError( +- f"There should be {expected_num_past_key_values} past states. " +- f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" +- f"Got {len(past_key_value)} past key / value states" +- ) +- +- self_attn_past_key_value = past_key_value[:2] +- cross_attn_past_key_value = past_key_value[2:] ++ if past_key is not None: ++ self_attn_past_key = past_key ++ self_attn_past_value = past_value ++ cross_attn_past_key = past_cross_key ++ cross_attn_past_value = past_cross_value + else: +- self_attn_past_key_value, cross_attn_past_key_value = None, None ++ self_attn_past_key, self_attn_past_value, cross_attn_past_key, cross_attn_past_value = None, None, None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=self_attn_past_key_value, ++ past_key=self_attn_past_key, ++ past_value=self_attn_past_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +- hidden_states, present_key_value_state = self_attention_outputs[:2] +- attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights ++ hidden_states, present_key_state, present_value_state = self_attention_outputs[:3] ++ attention_outputs = self_attention_outputs[3:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: +@@ -706,22 +856,23 @@ class T5Block(nn.Module): + + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: ++ + # the actual query length is unknown for cross attention + # if using past key value states. Need to inject it here +- if present_key_value_state is not None: +- query_length = present_key_value_state[0].shape[2] ++ if present_key_state is not None: ++ query_length = present_key_state[0].shape[2] + else: + query_length = None +- + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, +- past_key_value=cross_attn_past_key_value, ++ past_key=cross_attn_past_key, ++ past_value=cross_attn_past_value, + query_length=query_length, +- use_cache=use_cache, ++ use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] +@@ -736,11 +887,9 @@ class T5Block(nn.Module): + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states +- if present_key_value_state is not None: +- present_key_value_state = present_key_value_state + cross_attention_outputs[1] +- ++ # cross_attn_past_key_values = cross_attention_outputs[1] + # Keep cross-attention outputs and relative position weights +- attention_outputs = attention_outputs + cross_attention_outputs[2:] ++ attention_outputs = attention_outputs + cross_attention_outputs[3:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) +@@ -757,7 +906,7 @@ class T5Block(nn.Module): + outputs = (hidden_states,) + + if use_cache: +- outputs = outputs + (present_key_value_state,) + attention_outputs ++ outputs = outputs + (present_key_state,) +(present_value_state,)+ attention_outputs + else: + outputs = outputs + attention_outputs + +@@ -897,11 +1046,15 @@ class T5PreTrainedModel(PreTrainedModel): + + + class T5Stack(T5PreTrainedModel): +- def __init__(self, config, embed_tokens=None): ++ def __init__(self, config, 
embed_tokens=None,lm_head=None, encodecrosskey=None, encodecrossvalue=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder ++ self.lm_head=lm_head ++ self.encodecrosskey = encodecrosskey ++ self.encodecrossvalue = encodecrossvalue ++ self.model_dim = config.d_model + + self.block = nn.ModuleList( + [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] +@@ -966,20 +1119,64 @@ class T5Stack(T5PreTrainedModel): + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + ++ def invert_attention_mask(self, encoder_attention_mask): ++ if encoder_attention_mask.dim() == 3: ++ encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] ++ if encoder_attention_mask.dim() == 2: ++ encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] ++ encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility ++ ++ encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 ++ print("encoder_extended_attention_mask=",encoder_extended_attention_mask) ++ ++ return encoder_extended_attention_mask ++ ++ def get_extended_attention_mask( ++ self, attention_mask, input_shape, device=None, dtype=None ++ ): ++ if dtype is None: ++ dtype = self.dtype ++ ++ if not (attention_mask.dim() == 2 and self.config.is_decoder): ++ if device is not None: ++ warnings.warn( ++ "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning ++ ) ++ if attention_mask.dim() == 3: ++ extended_attention_mask = attention_mask[:, None, :, :] ++ elif attention_mask.dim() == 2: ++ if self.config.is_decoder: ++ extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( ++ input_shape, attention_mask, device ++ ) ++ else: ++ extended_attention_mask = attention_mask[:, None, None, :] ++ else: ++ raise ValueError( ++ f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ++ ) ++ extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility ++ extended_attention_mask = (1.0 - extended_attention_mask) * -1000 ++ return extended_attention_mask ++ + def forward( + self, + input_ids=None, +- attention_mask=None, + encoder_hidden_states=None, ++ past_keys=None, ++ past_values=None, ++ past_cross_keys=None, ++ past_cross_values=None, + encoder_attention_mask=None, ++ attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, +- past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, ++ **model_kwargs + ): + # Model parallel + if self.model_parallel: +@@ -998,8 +1195,10 @@ class T5Stack(T5PreTrainedModel): + f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" + ) + elif input_ids is not None: ++ + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) ++ input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: +@@ -1012,18 +1211,19 @@ class T5Stack(T5PreTrainedModel): + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape +- + # required mask seq length can be calculated via length of past +- mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length ++ mask_seq_length = past_keys[0].shape[2] + 
seq_length if past_keys is not None else seq_length + + if use_cache is True: + if not self.is_decoder: + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") + + # initialize past_key_values with `None` if past does not exist +- if past_key_values is None: +- past_key_values = [None] * len(self.block) +- ++ if not self.is_decoder: ++ past_keys = [None] * len(self.block) ++ past_values = [None] * len(self.block) ++ past_cross_keys = [None] * len(self.block) ++ past_cross_values = [None] * len(self.block) + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + +@@ -1054,7 +1254,8 @@ class T5Stack(T5PreTrainedModel): + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) +- present_key_value_states = () if use_cache else None ++ present_key_states = () if use_cache else None ++ present_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None +@@ -1062,8 +1263,8 @@ class T5Stack(T5PreTrainedModel): + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) +- +- for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): ++ # for i, layer_module in enumerate(self.block): ++ for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel +@@ -1112,7 +1313,10 @@ class T5Stack(T5PreTrainedModel): + encoder_decoder_position_bias=encoder_decoder_position_bias, + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, ++ past_cross_key=past_cross_key, ++ past_cross_value=past_cross_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -1120,19 +1324,20 @@ class T5Stack(T5PreTrainedModel): + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: +- layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] ++ layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] + +- hidden_states, present_key_value_state = layer_outputs[:2] ++ hidden_states, present_key_state, present_value_state = layer_outputs[:3] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) +- position_bias = layer_outputs[2] ++ position_bias = layer_outputs[3] + if self.is_decoder and encoder_hidden_states is not None: +- encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] ++ encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] + # append next layer key value states + if use_cache: +- present_key_value_states = present_key_value_states + (present_key_value_state,) ++ present_key_states = present_key_states + 
present_key_state ++ present_value_states = present_value_states + present_value_state + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) +@@ -1146,7 +1351,7 @@ class T5Stack(T5PreTrainedModel): + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) +- hidden_states = self.dropout(hidden_states) ++ hidden_states = self.dropout(hidden_states).half() + + # Add last layer + if output_hidden_states: +@@ -1164,13 +1369,216 @@ class T5Stack(T5PreTrainedModel): + ] + if v is not None + ) +- return BaseModelOutputWithPastAndCrossAttentions( +- last_hidden_state=hidden_states, +- past_key_values=present_key_value_states, +- hidden_states=all_hidden_states, +- attentions=all_attentions, +- cross_attentions=all_cross_attentions, ++ if not self.is_decoder: ++ cross_keys = None ++ cross_values = None ++ if self.encodecrosskey: ++ cross_keys = self.encodecrosskey(hidden_states) ++ if self.encodecrossvalue: ++ cross_values = self.encodecrossvalue(hidden_states) ++ return tuple((hidden_states, cross_keys, cross_values)) ++ lm_logits = None ++ if self.is_decoder: ++ if self.config.tie_word_embeddings: ++ hidden_states = hidden_states * (self.model_dim ** -0.5) ++ lm_logits = self.lm_head(hidden_states) ++ return tuple((lm_logits, present_key_states, present_value_states)) ++ ++ ++class T5Stack_Encoder(T5PreTrainedModel): ++ def __init__(self, config, embed_tokens=None, encodecrosskey=None, encodecrossvalue=None): ++ super().__init__(config) ++ self.embed_tokens = embed_tokens ++ self.is_decoder = config.is_decoder ++ self.encodecrosskey = encodecrosskey ++ self.encodecrossvalue = encodecrossvalue ++ self.model_dim = config.d_model ++ ++ self.block = nn.ModuleList( ++ [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + ) ++ self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) ++ self.dropout = nn.Dropout(config.dropout_rate) ++ ++ # Initialize weights and apply final processing ++ self.post_init() ++ # Model parallel ++ self.model_parallel = False ++ self.device_map = None ++ self.gradient_checkpointing = False ++ ++ def get_input_embeddings(self): ++ return self.embed_tokens ++ ++ def set_input_embeddings(self, new_embeddings): ++ self.embed_tokens = new_embeddings ++ ++ def get_extended_attention_mask( ++ self, attention_mask, input_shape, device=None, dtype=None ++ ): ++ if dtype is None: ++ dtype = self.dtype ++ ++ if not (attention_mask.dim() == 2 and self.config.is_decoder): ++ if device is not None: ++ warnings.warn( ++ "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning ++ ) ++ if attention_mask.dim() == 3: ++ extended_attention_mask = attention_mask[:, None, :, :] ++ elif attention_mask.dim() == 2: ++ if self.config.is_decoder: ++ extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( ++ input_shape, attention_mask, device ++ ) ++ else: ++ extended_attention_mask = attention_mask[:, None, None, :] ++ else: ++ raise ValueError( ++ f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ++ ) ++ extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility ++ extended_attention_mask = (1.0 - extended_attention_mask) * -1000 ++ return extended_attention_mask ++ ++ def forward( ++ self, ++ input_ids=None, ++ attention_mask=None, ++ head_mask=None, ++ cross_attn_head_mask=None, ++ use_cache=None, ++ 
output_attentions=None, ++ output_hidden_states=None, ++ return_dict=None, ++ **model_kwargs ++ ): ++ # Model parallel ++ use_cache = use_cache if use_cache is not None else self.config.use_cache ++ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions ++ output_hidden_states = ( ++ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ++ ) ++ return_dict = return_dict if return_dict is not None else self.config.use_return_dict ++ ++ input_shape = input_ids.size() ++ input_ids = input_ids.view(-1, input_shape[-1]) ++ ++ inputs_embeds = self.embed_tokens(input_ids) ++ ++ batch_size, seq_length = input_shape ++ # required mask seq length can be calculated via length of past ++ mask_seq_length = seq_length ++ ++ # initialize past_key_values with `None` if past does not exist ++ past_keys = [None] * len(self.block) ++ past_values = [None] * len(self.block) ++ past_cross_keys = [None] * len(self.block) ++ past_cross_values = [None] * len(self.block) ++ if attention_mask is None: ++ attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) ++ ++ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] ++ # ourselves in which case we just need to make it broadcastable to all heads. ++ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) ++ ++ # If a 2D or 3D attention mask is provided for the cross-attention ++ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] ++ ++ encoder_extended_attention_mask = None ++ ++ # Prepare head mask if needed ++ head_mask = self.get_head_mask(head_mask, self.config.num_layers) ++ cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) ++ present_key_states = () if use_cache else None ++ present_value_states = () if use_cache else None ++ all_hidden_states = () if output_hidden_states else None ++ all_attentions = () if output_attentions else None ++ all_cross_attentions = () if (output_attentions and self.is_decoder) else None ++ position_bias = None ++ encoder_decoder_position_bias = None ++ ++ hidden_states = self.dropout(inputs_embeds) ++ for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): ++ layer_head_mask = head_mask[i] ++ cross_attn_layer_head_mask = cross_attn_head_mask[i] ++ if output_hidden_states: ++ all_hidden_states = all_hidden_states + (hidden_states,) ++ ++ layer_outputs = layer_module( ++ hidden_states, ++ attention_mask=extended_attention_mask, ++ position_bias=position_bias, ++ encoder_hidden_states=None, ++ encoder_attention_mask=encoder_extended_attention_mask, ++ encoder_decoder_position_bias=encoder_decoder_position_bias, ++ layer_head_mask=layer_head_mask, ++ cross_attn_layer_head_mask=cross_attn_layer_head_mask, ++ past_key=past_key, ++ past_value=past_value, ++ past_cross_key=past_cross_key, ++ past_cross_value=past_cross_value, ++ use_cache=use_cache, ++ output_attentions=output_attentions, ++ ) ++ ++ # layer_outputs is a tuple with: ++ # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) ++ if use_cache is False: ++ layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] ++ ++ hidden_states, present_key_state, present_value_state = layer_outputs[:3] ++ 
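# [Editor's note -- illustrative annotation, not part of the patch being added]
# In this rewrite every T5Block hands back its key cache and value cache as two separate
# one-element tuples instead of the stock fused present_key_value_state, which is what
# lets the traced graph expose a flat, fixed-arity signature for mindietorch. A minimal
# sketch of the layout, assuming num_layers=N, batch_size=B, num_heads=H, current
# sequence length=L and head dimension=d_kv:
#
#   present_key_state    == (key_states.half(),)     # tensor of shape (B, H, L, d_kv)
#   present_value_state  == (value_states.half(),)   # tensor of shape (B, H, L, d_kv)
#   present_key_states   == (k_layer_0, ..., k_layer_N-1)   # accumulated a few lines below
#   present_value_states == (v_layer_0, ..., v_layer_N-1)
#
# For this encoder-only stack both states are None (is_decoder and use_cache are False),
# but the decoder T5Stack above follows the same flattened layout, and the compiled
# decoder consumes it as 4*N positional past_* tensors instead of the nested
# past_key_values tuple.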
++ # We share the position biases between the layers - the first layer store them ++ # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), ++ # (cross-attention position bias), (cross-attention weights) ++ position_bias = layer_outputs[3] ++ if self.is_decoder and encoder_hidden_states is not None: ++ encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] ++ # append next layer key value states ++ if use_cache: ++ present_key_states = present_key_states + present_key_state ++ present_value_states = present_value_states + present_value_state ++ ++ if output_attentions: ++ all_attentions = all_attentions + (layer_outputs[3],) ++ if self.is_decoder: ++ all_cross_attentions = all_cross_attentions + (layer_outputs[5],) ++ ++ # Model Parallel: If it's the last layer for that device, put things on the next device ++ if self.model_parallel: ++ for k, v in self.device_map.items(): ++ if i == v[-1] and "cuda:" + str(k) != self.last_device: ++ hidden_states = hidden_states.to("cuda:" + str(k + 1)) ++ ++ hidden_states = self.final_layer_norm(hidden_states) ++ hidden_states = self.dropout(hidden_states).half() ++ ++ # Add last layer ++ if output_hidden_states: ++ all_hidden_states = all_hidden_states + (hidden_states,) ++ ++ if not return_dict: ++ return tuple( ++ v ++ for v in [ ++ hidden_states, ++ present_key_value_states, ++ all_hidden_states, ++ all_attentions, ++ all_cross_attentions, ++ ] ++ if v is not None ++ ) ++ # present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None ++ if not self.is_decoder: ++ cross_keys = None ++ cross_values = None ++ if self.encodecrosskey: ++ cross_keys = self.encodecrosskey(hidden_states) ++ if self.encodecrossvalue: ++ cross_values = self.encodecrossvalue(hidden_states) ++ return tuple((hidden_states, cross_keys, cross_values)) + + + T5_START_DOCSTRING = r""" +@@ -1541,6 +1949,38 @@ class T5Model(T5PreTrainedModel): + ) + + ++class EncoderToCrossKey(nn.Module): ++ def __init__(self, cross_key, num_heads, d_kv): ++ super().__init__() ++ self.cross_key = cross_key ++ self.num_heads = num_heads ++ self.d_kv = d_kv ++ ++ ++ def forward(self, hidden_states): ++ batch_size = hidden_states.shape[0] ++ past_cross_keys = () ++ for i in range(len(self.cross_key)): ++ past_cross_keys += (self.cross_key[i](hidden_states),) ++ return past_cross_keys ++ ++ ++class EncoderToCrossValue(nn.Module): ++ def __init__(self, cross_value, num_heads, d_kv): ++ super().__init__() ++ self.cross_value = cross_value ++ self.num_heads = num_heads ++ self.d_kv = d_kv ++ ++ ++ def forward(self, hidden_states): ++ batch_size = hidden_states.shape[0] ++ past_cross_values = () ++ for i in range(len(self.cross_value)): ++ past_cross_values += (self.cross_value[i](hidden_states),) ++ return past_cross_values ++ ++ + @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) + class T5ForConditionalGeneration(T5PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [ +@@ -1548,28 +1988,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + ] + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + +- def __init__(self, config: T5Config): ++ def __init__(self, config: T5Config, encoder_path=None, decoder_path=None, device_id=0): + super().__init__(config) +- self.model_dim = config.d_model +- +- self.shared = 
nn.Embedding(config.vocab_size, config.d_model) +- +- encoder_config = copy.deepcopy(config) +- encoder_config.is_decoder = False +- encoder_config.use_cache = False +- encoder_config.is_encoder_decoder = False +- self.encoder = T5Stack(encoder_config, self.shared) +- +- decoder_config = copy.deepcopy(config) +- decoder_config.is_decoder = True +- decoder_config.is_encoder_decoder = False +- decoder_config.num_layers = config.num_decoder_layers +- self.decoder = T5Stack(decoder_config, self.shared) +- +- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) ++ self.encoder_path = encoder_path ++ self.decoder_path = decoder_path ++ self.is_mindie = False ++ if not self.encoder_path or not self.decoder_path: ++ self.model_dim = config.d_model ++ ++ self.shared = nn.Embedding(config.vocab_size, config.d_model) ++ ++ decoder_config = copy.deepcopy(config) ++ decoder_config.is_decoder = True ++ decoder_config.is_encoder_decoder = False ++ decoder_config.num_layers = config.num_decoder_layers ++ ++ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) ++ self.decoder = T5Stack(decoder_config, self.shared, self.lm_head) ++ ++ cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) ++ cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) ++ encodecrosskey = EncoderToCrossKey(cross_key, config.num_heads, config.d_kv) ++ encodecrossvalue = EncoderToCrossValue(cross_value, config.num_heads, config.d_kv) ++ ++ encoder_config = copy.deepcopy(config) ++ encoder_config.is_decoder = False ++ encoder_config.use_cache = False ++ encoder_config.is_encoder_decoder = False ++ self.encoder = T5Stack_Encoder(encoder_config, self.shared, encodecrosskey=encodecrosskey, encodecrossvalue=encodecrossvalue) ++ self.encoder_mindie = None ++ self.decoder_mindie = None ++ if self.encoder_path: ++ self.encoder_mindie = torch.jit.load(self.encoder_path) ++ self.is_mindie = True ++ if self.decoder_path: ++ self.decoder_mindie = torch.jit.load(self.decoder_path) ++ ++ self.stream = torch.npu.Stream(f"npu:{device_id}") ++ self.device_id = device_id ++ ++ ++ def get_device(self): ++ return f"npu:{self.device_id}" + + # Initialize weights and apply final processing +- self.post_init() ++ # self.post_init() + + # Model parallel + self.model_parallel = False +@@ -1637,25 +2100,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + + @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) +- def forward( +- self, +- input_ids: Optional[torch.LongTensor] = None, +- attention_mask: Optional[torch.FloatTensor] = None, +- decoder_input_ids: Optional[torch.LongTensor] = None, +- decoder_attention_mask: Optional[torch.BoolTensor] = None, +- head_mask: Optional[torch.FloatTensor] = None, +- decoder_head_mask: Optional[torch.FloatTensor] = None, +- cross_attn_head_mask: Optional[torch.Tensor] = None, +- encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- inputs_embeds: Optional[torch.FloatTensor] = None, +- decoder_inputs_embeds: Optional[torch.FloatTensor] = None, +- labels: Optional[torch.LongTensor] = None, +- use_cache: Optional[bool] = None, +- output_attentions: Optional[bool] = None, +- output_hidden_states: Optional[bool] = None, +- return_dict: Optional[bool] = None, +- ) -> 
Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: ++ def forward(self,*args) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., +@@ -1687,113 +2132,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) + >>> # studies have shown that owning a dog is good for you. + ```""" +- use_cache = use_cache if use_cache is not None else self.config.use_cache +- return_dict = return_dict if return_dict is not None else self.config.use_return_dict +- +- # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +- if head_mask is not None and decoder_head_mask is None: +- if self.config.num_layers == self.config.num_decoder_layers: +- warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) +- decoder_head_mask = head_mask +- +- # Encode if needed (training, first prediction pass) +- if encoder_outputs is None: +- # Convert encoder inputs in embeddings if needed +- encoder_outputs = self.encoder( +- input_ids=input_ids, +- attention_mask=attention_mask, +- inputs_embeds=inputs_embeds, +- head_mask=head_mask, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): +- encoder_outputs = BaseModelOutput( +- last_hidden_state=encoder_outputs[0], +- hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, +- attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, +- ) +- +- hidden_states = encoder_outputs[0] +- +- if self.model_parallel: +- torch.cuda.set_device(self.decoder.first_device) +- +- if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: +- # get decoder inputs from shifting lm labels to the right +- decoder_input_ids = self._shift_right(labels) +- +- # Set device for model parallelism +- if self.model_parallel: +- torch.cuda.set_device(self.decoder.first_device) +- hidden_states = hidden_states.to(self.decoder.first_device) +- if decoder_input_ids is not None: +- decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) +- if attention_mask is not None: +- attention_mask = attention_mask.to(self.decoder.first_device) +- if decoder_attention_mask is not None: +- decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) +- +- # Decode +- decoder_outputs = self.decoder( +- input_ids=decoder_input_ids, +- attention_mask=decoder_attention_mask, +- inputs_embeds=decoder_inputs_embeds, +- past_key_values=past_key_values, +- encoder_hidden_states=hidden_states, +- encoder_attention_mask=attention_mask, +- head_mask=decoder_head_mask, +- cross_attn_head_mask=cross_attn_head_mask, +- use_cache=use_cache, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- +- sequence_output = decoder_outputs[0] +- +- # Set device for model parallelism +- if self.model_parallel: +- torch.cuda.set_device(self.encoder.first_device) +- self.lm_head = self.lm_head.to(self.encoder.first_device) +- sequence_output = sequence_output.to(self.lm_head.weight.device) +- +- if self.config.tie_word_embeddings: +- # Rescale output before projecting on vocab +- # See 
https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 +- sequence_output = sequence_output * (self.model_dim**-0.5) +- +- lm_logits = self.lm_head(sequence_output) ++ if self.is_mindie: ++ with torch.npu.stream(self.stream): # set stream ++ decoder_outputs = self.decoder_mindie.forward(*args) ++ self.stream.synchronize() # synchronize ++ else: ++ hidden_states = args[0] ++ past_cross_keys = args[1:self.config.num_decoder_layers+1] ++ past_cross_values = args[self.config.num_decoder_layers+1:2*self.config.num_decoder_layers+1] ++ past_keys= args[2*self.config.num_decoder_layers+1:3*self.config.num_decoder_layers+1] ++ past_values= args[3*self.config.num_decoder_layers+1:4*self.config.num_decoder_layers+1] ++ encoder_attention_mask = args[-2] ++ decoder_input_ids = args[-1] ++ decoder_outputs = self.decoder(input_ids=decoder_input_ids, ++ encoder_hidden_states=hidden_states, ++ past_keys=past_keys, ++ past_values=past_values, ++ past_cross_keys=past_cross_keys, ++ past_cross_values=past_cross_values, ++ encoder_attention_mask=encoder_attention_mask) ++ + + loss = None +- if labels is not None: +- loss_fct = CrossEntropyLoss(ignore_index=-100) +- # move labels to correct device to enable PP +- labels = labels.to(lm_logits.device) +- loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) +- # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 +- +- if not return_dict: +- output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs +- return ((loss,) + output) if loss is not None else output +- +- return Seq2SeqLMOutput( +- loss=loss, +- logits=lm_logits, +- past_key_values=decoder_outputs.past_key_values, +- decoder_hidden_states=decoder_outputs.hidden_states, +- decoder_attentions=decoder_outputs.attentions, +- cross_attentions=decoder_outputs.cross_attentions, +- encoder_last_hidden_state=encoder_outputs.last_hidden_state, +- encoder_hidden_states=encoder_outputs.hidden_states, +- encoder_attentions=encoder_outputs.attentions, +- ) ++ return (decoder_outputs[0],decoder_outputs[1],decoder_outputs[2]) + + def prepare_inputs_for_generation( + self, + input_ids, +- past_key_values=None, ++ past_cross_keys=None, ++ past_cross_values=None, ++ past_keys=None, ++ past_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, +@@ -1804,8 +2173,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + **kwargs, + ): + # cut decoder_input_ids if past_key_values is used +- if past_key_values is not None: +- past_length = past_key_values[0][0].shape[2] ++ if past_keys is not None: ++ past_length = past_keys[0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: +@@ -1813,12 +2182,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 +- + input_ids = input_ids[:, remove_prefix_length:] + + return { + "decoder_input_ids": input_ids, +- "past_key_values": past_key_values, ++ "past_cross_keys":past_cross_keys, ++ "past_cross_values":past_cross_values, ++ "past_keys":past_keys, ++ "past_values":past_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, +@@ -1826,6 +2197,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + "decoder_attention_mask": decoder_attention_mask, + 
"cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, ++ + } + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): +@@ -1861,6 +2233,460 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) + return reordered_decoder_past + ++ def _prepare_encoder_decoder_kwargs_for_generation( ++ self, ++ inputs_tensor: torch.Tensor, ++ model_kwargs, ++ model_input_name, ++ generation_config, ++ ): ++ irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"] ++ encoder_kwargs = { ++ argument: value ++ for argument, value in model_kwargs.items() ++ if not any(argument.startswith(p) for p in irrelevant_prefix) ++ } ++ encoder_kwargs["output_attentions"] = generation_config.output_attentions ++ encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states ++ model_input_name = model_input_name if model_input_name is not None else self.main_input_name ++ encoder_kwargs["return_dict"] = True ++ encoder_kwargs[model_input_name] = inputs_tensor ++ import time ++ start_time = time.time() ++ with torch.npu.stream(self.stream): # set stream ++ encoder_outputs=self.encoder_mindie.forward(encoder_kwargs["input_ids"]) ++ self.stream.synchronize() # synchronize ++ model_kwargs["encoder_outputs"]={"last_hidden_state":encoder_outputs[0]} ++ model_kwargs["past_cross_keys"] = encoder_outputs[1] ++ model_kwargs["past_cross_values"] =encoder_outputs[2] ++ return model_kwargs ++ ++ def _update_model_kwargs_for_generation( ++ self, ++ outputs, ++ model_kwargs, ++ is_encoder_decoder = False, ++ standardize_cache_format = False, ++ num_new_tokens = 1, ++ ): ++ # update past_key_values keeping its naming used in model code ++ cache_name, cache = self._extract_past_from_model_output( ++ outputs, standardize_cache_format=standardize_cache_format ++ ) ++ model_kwargs[cache_name] = cache ++ if "past_keys" in outputs: ++ past_keys = outputs.past_keys ++ model_kwargs["past_keys"] = past_keys ++ if "past_values" in outputs: ++ past_values = outputs.past_values ++ model_kwargs["past_values"] = past_values ++ # update decoder attention mask ++ if "decoder_attention_mask" in model_kwargs: ++ decoder_attention_mask = model_kwargs["decoder_attention_mask"] ++ model_kwargs["decoder_attention_mask"] = torch.cat( ++ [decoder_attention_mask, decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1))], ++ dim=-1, ++ ) ++ return model_kwargs ++ ++ @torch.no_grad() ++ def generate( ++ self, ++ inputs = None, ++ generation_config = None, ++ logits_processor = None, ++ stopping_criteria = None, ++ prefix_allowed_tokens_fn = None, ++ assistant_model = None, ++ negative_prompt_ids = None, ++ negative_prompt_attention_mask = None, ++ **kwargs, ++ ): ++ # 1. 
Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call ++ import time ++ start_time = time.time() ++ self._validate_model_class() ++ tokenizer = kwargs.pop("tokenizer", None) # Pull this out first, we only use it for stopping criteria ++ generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs) ++ self._validate_model_kwargs(model_kwargs.copy()) ++ ++ ++ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() ++ stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() ++ ++ accepts_attention_mask = True ++ requires_attention_mask = "encoder_outputs" not in model_kwargs ++ kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None ++ ++ # 3. Define model inputs ++ inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs( ++ inputs, generation_config.bos_token_id, model_kwargs ++ ) ++ batch_size = inputs_tensor.shape[0] ++ ++ device = inputs_tensor.device ++ self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=device) ++ ++ # 4. Define other model kwargs ++ # decoder-only models with inputs_embeds forwarding must use caching (otherwise we can't detect whether we are ++ # generating the first new token or not, and we only want to use the embeddings for the first new token) ++ if not self.config.is_encoder_decoder and model_input_name == "inputs_embeds": ++ model_kwargs["use_cache"] = True ++ else: ++ model_kwargs["use_cache"] = generation_config.use_cache ++ if not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask: ++ model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( ++ inputs_tensor, generation_config.pad_token_id, generation_config.eos_token_id ++ ) ++ ++ if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs: ++ # if model is encoder decoder encoder_outputs are created and added to `model_kwargs` ++ model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation( ++ inputs_tensor, model_kwargs, model_input_name, generation_config ++ ) ++ ++ # 5. Prepare `input_ids` which will be used for auto-regressive generation ++ if self.config.is_encoder_decoder: ++ input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation( ++ batch_size=batch_size, ++ model_input_name=model_input_name, ++ model_kwargs=model_kwargs, ++ decoder_start_token_id=generation_config.decoder_start_token_id, ++ device=inputs_tensor.device, ++ ) ++ else: ++ input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids") ++ ++ if generation_config.token_healing: ++ input_ids = self.heal_tokens(input_ids, tokenizer) ++ ++ # 6. Prepare `max_length` depending on other stopping criteria. 
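# [Editor's note -- illustrative annotation, not part of the patch being added]
# The helpers called below (_prepare_generated_length, _get_logits_processor,
# _get_stopping_criteria, _expand_inputs_for_generation) are inherited unchanged from
# transformers' GenerationMixin; this override only swaps in the MindIE-compiled
# encoder/decoder dispatch and the flattened per-layer KV cache handled by _sample.
# A hypothetical end-to-end call under those assumptions (the model directory and the
# compiled TorchScript file names below are placeholders, not taken from this patch):
#
#     import torch
#     import torch_npu  # registers the NPU backend so .npu() and torch.npu work
#     from transformers import T5Config, T5Tokenizer
#
#     config = T5Config.from_pretrained("./t5-model")
#     tokenizer = T5Tokenizer.from_pretrained("./t5-model")
#     model = T5ForConditionalGeneration(
#         config,
#         encoder_path="./models/encoder_compiled.pt",   # mindietorch-compiled encoder
#         decoder_path="./models/decoder_compiled.pt",   # mindietorch-compiled decoder
#         device_id=0,
#     )
#     input_ids = tokenizer("translate English to German: hello",
#                           return_tensors="pt").input_ids.npu()
#     output_ids = model.generate(input_ids, max_new_tokens=64)
#     print(tokenizer.decode(output_ids[0], skip_special_tokens=True))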
++ input_ids_length = input_ids.shape[-1] ++ has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None ++ has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None ++ generation_config = self._prepare_generated_length( ++ generation_config=generation_config, ++ has_default_max_length=has_default_max_length, ++ has_default_min_length=has_default_min_length, ++ model_input_name=model_input_name, ++ inputs_tensor=inputs_tensor, ++ input_ids_length=input_ids_length, ++ ) ++ ++ use_dynamic_cache_by_default = False ++ if generation_config.cache_implementation is not None and model_kwargs.get("past_key_values") is not None: ++ raise ValueError( ++ "Passing both `cache_implementation` (used to initialize certain caches) and `past_key_values` (a " ++ "Cache object) is unsupported. Please use only one of the two." ++ ) ++ elif generation_config.cache_implementation is not None: ++ if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING: ++ if generation_config.cache_implementation == "static" and not self._supports_static_cache: ++ raise ValueError( ++ "This model does not support `cache_implementation='static'`. Please check the following " ++ "issue: https://github.com/huggingface/transformers/issues/28981" ++ ) ++ model_kwargs["past_key_values"] = self._get_cache( ++ generation_config.cache_implementation, ++ getattr(generation_config, "num_beams", 1) * batch_size, ++ generation_config.max_length, ++ ) ++ elif generation_config.cache_implementation == "quantized": ++ if not self._supports_quantized_cache: ++ raise ValueError( ++ "This model does not support the quantized cache. If you want your model to support quantized " ++ "cache, please open an issue." ++ ) ++ ++ cache_config = ( ++ generation_config.cache_config ++ if generation_config.cache_config is not None ++ else QuantizedCacheConfig() ++ ) ++ cache_class = QUANT_BACKEND_CLASSES_MAPPING[cache_config.backend] ++ ++ if cache_config.backend == "quanto" and not is_quanto_available(): ++ raise ImportError( ++ "You need to install `quanto` in order to use KV cache quantization with quanto backend. " ++ "Please install it via with `pip install quanto`" ++ ) ++ elif cache_config.backend == "HQQ" and not is_hqq_available(): ++ raise ImportError( ++ "You need to install `HQQ` in order to use KV cache quantization with HQQ backend. " ++ "Please install it via with `pip install hqq`" ++ ) ++ ++ model_kwargs["past_key_values"] = cache_class(cache_config) ++ # Use DynamicCache() instance by default. This will avoid back and forth from legacy format that ++ # keeps copying the cache thus using much more memory ++ elif generation_config.cache_implementation is None and self._supports_default_dynamic_cache(): ++ past = model_kwargs.get("past_key_values", None) ++ if past is None: ++ model_kwargs["past_key_values"] = DynamicCache() ++ use_dynamic_cache_by_default = True ++ elif isinstance(past, tuple): ++ model_kwargs["past_key_values"] = DynamicCache.from_legacy_cache(past) ++ use_dynamic_cache_by_default = True ++ ++ self._validate_generated_length(generation_config, input_ids_length, has_default_max_length) ++ ++ # 7. determine generation mode ++ generation_mode = generation_config.get_generation_mode(assistant_model) ++ # 8. 
prepare distribution pre_processing samplers ++ prepared_logits_processor = self._get_logits_processor( ++ generation_config=generation_config, ++ input_ids_seq_length=input_ids_length, ++ encoder_input_ids=inputs_tensor, ++ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, ++ logits_processor=logits_processor, ++ device=inputs_tensor.device, ++ model_kwargs=model_kwargs, ++ negative_prompt_ids=negative_prompt_ids, ++ negative_prompt_attention_mask=negative_prompt_attention_mask, ++ ) ++ ++ # 9. prepare stopping criteria ++ prepared_stopping_criteria = self._get_stopping_criteria( ++ generation_config=generation_config, stopping_criteria=stopping_criteria, tokenizer=tokenizer, **kwargs ++ ) ++ ++ if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): ++ # 11. prepare logits warper ++ prepared_logits_warper = ( ++ self._get_logits_warper(generation_config, device=input_ids.device) ++ if generation_config.do_sample ++ else None ++ ) ++ ++ # 12. expand input_ids with `num_return_sequences` additional sequences per batch ++ input_ids, model_kwargs = self._expand_inputs_for_generation( ++ input_ids=input_ids, ++ expand_size=generation_config.num_return_sequences, ++ is_encoder_decoder=self.config.is_encoder_decoder, ++ **model_kwargs, ++ ) ++ # 13. run sample (it degenerates to greedy search when `generation_config.do_sample=False`) ++ result = self._sample( ++ input_ids, ++ logits_processor=prepared_logits_processor, ++ logits_warper=prepared_logits_warper, ++ stopping_criteria=prepared_stopping_criteria, ++ generation_config=generation_config, ++ **model_kwargs, ++ ) ++ return result ++ ++ def _sample( ++ self, ++ input_ids, ++ logits_processor, ++ stopping_criteria, ++ generation_config, ++ logits_warper = None, ++ **model_kwargs, ++ ): ++ # init values ++ pad_token_id = generation_config.pad_token_id ++ output_attentions = generation_config.output_attentions ++ output_hidden_states = generation_config.output_hidden_states ++ output_scores = generation_config.output_scores ++ output_logits = generation_config.output_logits ++ return_dict_in_generate = generation_config.return_dict_in_generate ++ has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria) ++ do_sample = generation_config.do_sample ++ if do_sample is True and not isinstance(logits_warper, LogitsProcessorList): ++ raise ValueError( ++ "`do_sample` is set to `True`, `logits_warper` must be a `LogitsProcessorList` instance (it is " ++ f"{logits_warper})." 
++ ) ++ ++ # init attention / hidden states / scores tuples ++ scores = () if (return_dict_in_generate and output_scores) else None ++ raw_logits = () if (return_dict_in_generate and output_logits) else None ++ decoder_attentions = () if (return_dict_in_generate and output_attentions) else None ++ cross_attentions = () if (return_dict_in_generate and output_attentions) else None ++ decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None ++ ++ # if model is an encoder-decoder, retrieve encoder attention weights and hidden states ++ if return_dict_in_generate and self.config.is_encoder_decoder: ++ encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None ++ encoder_hidden_states = ( ++ model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None ++ ) ++ ++ this_peer_finished = False ++ batch_size = input_ids.shape[0] ++ unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) ++ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) ++ ++ # keep track of which sequences are already finished ++ if self.is_mindie or self.config.architectures[0]=="T5ForConditionalGeneration": ++ num_layers = self.config.num_layers ++ num_heads = self.config.num_heads ++ d_kv = self.config.d_kv ++ model_kwargs["past_keys"] = [torch.randn(batch_size, num_heads, 0, d_kv).half().npu() for _ in range(num_layers)] ++ model_kwargs["past_values"] = [torch.randn(batch_size, num_heads, 0, d_kv).half().npu() for _ in range(num_layers)] ++ ++ ++ while self._has_unfinished_sequences(this_peer_finished, False, device=input_ids.device): ++ # prepare model inputs ++ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) ++ model_args = [model_kwargs["encoder_outputs"]["last_hidden_state"]] ++ model_args.extend(model_kwargs["past_cross_keys"]) ++ model_args.extend(model_kwargs["past_cross_values"]) ++ model_args.extend(model_inputs["past_keys"]) ++ model_args.extend(model_inputs["past_values"]) ++ model_args.append(model_inputs["attention_mask"]) ++ model_args.append(model_inputs["decoder_input_ids"]) ++ ++ # forward pass to get next token ++ outputs = self(*model_args) ++ outputs = Seq2SeqLMOutput(logits=outputs[0], ++ past_keys=outputs[1], ++ past_values=outputs[2]) ++ ++ # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration ++ # (the clone itself is always small) ++ next_token_logits = outputs.logits[:, -1, :].clone() ++ ++ # pre-process distribution ++ next_token_scores = logits_processor(input_ids, next_token_logits) ++ if do_sample: ++ next_token_scores = logits_warper(input_ids, next_token_scores) ++ ++ # Store scores, attentions and hidden_states when required ++ if return_dict_in_generate: ++ if output_scores: ++ scores += (next_token_scores,) ++ if output_logits: ++ raw_logits += (next_token_logits,) ++ if output_attentions: ++ decoder_attentions += ( ++ (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) ++ ) ++ if self.config.is_encoder_decoder: ++ cross_attentions += (outputs.cross_attentions,) ++ ++ if output_hidden_states: ++ decoder_hidden_states += ( ++ (outputs.decoder_hidden_states,) ++ if self.config.is_encoder_decoder ++ else (outputs.hidden_states,) ++ ) ++ ++ # token selection ++ if do_sample: ++ probs = nn.functional.softmax(next_token_scores, dim=-1) ++ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) ++ else: ++ 
next_tokens = torch.argmax(next_token_scores, dim=-1)
++
++ # finished sentences should have their next token be a padding token
++ if has_eos_stopping_criteria:
++ next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
++
++ # update generated ids, model inputs, and length for next step
++ input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
++ model_kwargs = self._update_model_kwargs_for_generation(
++ outputs,
++ model_kwargs,
++ is_encoder_decoder=self.config.is_encoder_decoder,
++ )
++ unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
++ this_peer_finished = unfinished_sequences.max() == 0
++ # This is needed to properly delete outputs.logits which may be very large for first iteration
++ # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration
++ del outputs
++ return input_ids
++
++ def invert_attention_mask(self, encoder_attention_mask):
++ """
++ Invert an attention mask (e.g., switches 0. and 1.).
++
++ Args:
++ encoder_attention_mask (`torch.Tensor`): An attention mask.
++
++ Returns:
++ `torch.Tensor`: The inverted attention mask.
++ """
++ if encoder_attention_mask.dim() == 3:
++ encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
++ if encoder_attention_mask.dim() == 2:
++ encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
++ # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
++ # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow
++ # /transformer/transformer_layers.py#L270
++ # encoder_extended_attention_mask = (encoder_extended_attention_mask ==
++ # encoder_extended_attention_mask.transpose(-1, -2))
++ encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
++ #encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min
++ encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000
++
++ return encoder_extended_attention_mask
++
++ @property
++ def device(self) -> torch.device:
++ """
++ `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
++ device).
++ """
++ return self.get_device()
++
++ def get_extended_attention_mask(
++ self, attention_mask, input_shape, device=None, dtype=None
++ ):
++ """
++ Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
++
++ Arguments:
++ attention_mask (`torch.Tensor`):
++ Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
++ input_shape (`Tuple[int]`):
++ The shape of the input to the model.
++
++ Returns:
++ `torch.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`.
++ """
++ if dtype is None:
++ dtype = self.dtype
++
++ if not (attention_mask.dim() == 2 and self.config.is_decoder):
++ # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder`
++ if device is not None:
++ warnings.warn(
++ "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
++ )
++ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
++ # ourselves in which case we just need to make it broadcastable to all heads.
++ if attention_mask.dim() == 3: ++ extended_attention_mask = attention_mask[:, None, :, :] ++ elif attention_mask.dim() == 2: ++ # Provided a padding mask of dimensions [batch_size, seq_length] ++ # - if the model is a decoder, apply a causal mask in addition to the padding mask ++ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] ++ if self.config.is_decoder: ++ extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( ++ input_shape, attention_mask, device ++ ) ++ else: ++ extended_attention_mask = attention_mask[:, None, None, :] ++ else: ++ raise ValueError( ++ f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ++ ) ++ ++ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for ++ # masked positions, this operation will create a tensor which is 0.0 for ++ # positions we want to attend and the dtype's smallest value for masked positions. ++ # Since we are adding it to the raw scores before the softmax, this is ++ # effectively the same as removing these entirely. ++ extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility ++ #extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min ++ extended_attention_mask = (1.0 - extended_attention_mask) * -1000 ++ return extended_attention_mask ++ ++ ++ + + @add_start_docstrings( + "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", +@@ -1967,7 +2793,6 @@ class T5EncoderModel(T5PreTrainedModel): + >>> last_hidden_states = outputs.last_hidden_state + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict +- + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, -- Gitee From 989e8f4c2c52a5b1ff1894bed74ad14426eeca6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 09:22:30 +0000 Subject: [PATCH 058/110] update MindIE/MindIE-Torch/built-in/T5/export_t5.py. 
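The export_t5.py update below drops the stacked, tuple-style past_key_values interface and re-exports the decoder with one flat tensor input per per-layer key/value cache, then passes `default_buffer_size_vec` to `mindietorch.compile` so each output gets a pre-sized buffer: FP16 buffers for the present key/value tensors and an FP32 buffer for the single-step logits. A minimal sketch of that sizing arithmetic, using illustrative values (`d_model=4096`, `vocab_size=32128`, `num_layers=24`, roughly t5-v1_1-xxl) that are assumptions here rather than numbers taken from the patch:

```python
import math

# Illustrative config values; export_t5.py reads the real ones from model.config.
max_batchsize, max_input_seq_len = 1, 256
d_model, vocab_size, num_layers = 4096, 32128, 24

# Present key/value outputs are FP16 (2 bytes per element), rounded up to whole MiB.
kv_buffer_mib = math.ceil(max_batchsize * max_input_seq_len * d_model * 2 / 1024 / 1024)  # -> 2
# The decoder returns logits for one step in FP32 (4 bytes per element).
logits_buffer_mib = math.ceil(max_batchsize * 1 * vocab_size * 4 / 1024 / 1024)           # -> 1

buffer = [kv_buffer_mib] * (2 * num_layers) + [logits_buffer_mib]
print(len(buffer), buffer[0], buffer[-1])  # 49 2 1
```

The list therefore has `2 * num_layers + 1` entries, one per decoder output (the per-layer present keys and values plus the single-step logits).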
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/export_t5.py | 93 +++++++++++--------- 1 file changed, 52 insertions(+), 41 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5.py b/MindIE/MindIE-Torch/built-in/T5/export_t5.py index af67451d69..e152265ae9 100644 --- a/MindIE/MindIE-Torch/built-in/T5/export_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/export_t5.py @@ -3,6 +3,7 @@ import torch import torch_npu import argparse import os +import math import mindietorch from transformers import T5ForConditionalGeneration @@ -58,17 +59,8 @@ class TextDecoderExport(torch.nn.Module): self.textdecoder_model = textdecoder_model def forward(self, - input_ids, - encoder_hidden_states, - encoder_attention_mask, - past_key_values, - past_cross_key_values): - return self.textdecoder_model(input_ids=input_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - past_cross_key_values=past_cross_key_values, - return_dict=True) + *args): + return self.textdecoder_model(*args) def export_textencoder(args, model, save_dir, batch_size): encoder_path = os.path.join(save_dir, "encoder") @@ -88,7 +80,6 @@ def export_textencoder(args, model, save_dir, batch_size): traced_model = torch.jit.load(traced_path).eval() inputs0 = [] - # inputs1 = [] inputs0.append(mindietorch.Input(min_shape = (1,1), max_shape= (args.max_batchsize, args.max_input_seq_len), dtype=torch.int64)) print("compiling encoder") compiled_model = mindietorch.compile( @@ -112,48 +103,70 @@ def export_textdecoder(args, model, save_dir, batch_size): model_path = args.model_path max_lenth = 120 if not os.path.exists(traced_path): - text_decoder = model.decoder - dummy_input = ( - torch.ones([1, 1], dtype=torch.int64).npu(), - torch.randn(1, 16, model.config.d_model).to(torch.float16).npu(), - torch.ones(1,16).npu(), - torch.randn(model.config.num_layers, 2, 1, model.config.num_heads, 1, model.config.d_kv).to(torch.float16).npu(), - torch.randn(model.config.num_layers, 2, 1, 16, model.config.d_kv*model.config.num_heads).to(torch.float16).npu() - ) - decoder = TextDecoderExport(text_decoder).npu() + text_decoder = model + all_past_keys = [torch.randn([1, model.config.num_heads, 1, model.config.d_kv]).to(torch.float16).npu()] * model.config.num_layers + all_past_values = [torch.randn([1, model.config.num_heads, 1, model.config.d_kv]).to(torch.float16).npu()] * model.config.num_layers + all_past_cross_keys = [torch.randn([1, 16, model.config.d_model]).to(torch.float16).npu()] * model.config.num_layers + all_past_cross_values = [torch.randn([1, 16, model.config.d_model]).to(torch.float16).npu()] * model.config.num_layers + dummy_input = [torch.randn(1, 16, model.config.d_model).to(torch.float16).npu()] + dummy_input.extend(all_past_cross_keys) + dummy_input.extend(all_past_cross_values) + dummy_input.extend(all_past_keys) + dummy_input.extend(all_past_values) + dummy_input.append(torch.ones(1,16).npu()) + dummy_input.append(torch.ones([1, 1], dtype=torch.int64).npu()) + decoder = TextDecoderExport(text_decoder).npu() decoder.eval() torch.jit.trace(decoder, dummy_input,strict=False).save(traced_path) if not os.path.exists(compiled_path): traced_model = torch.jit.load(traced_path).eval() print("compiling decoder") + input_info = [mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), + 
dtype=mindietorch.dtype.FLOAT16)] + past_cross_key_infos = [mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_cross_value_infos = [mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_key_infos = [mindietorch.Input(min_shape =(1, model.config.num_heads, 0, model.config.d_kv), + max_shape=(args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_value_infos = [mindietorch.Input(min_shape =(1, model.config.num_heads, 0, model.config.d_kv), + max_shape=(args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + decoder_input_ids_info = [mindietorch.Input(min_shape =(1, 1), + max_shape = (args.max_batchsize,1), + dtype=mindietorch.dtype.INT64)] + encoder_attention_mask_info = [mindietorch.Input(min_shape =(1, 1), + max_shape = (args.max_batchsize,args.max_input_seq_len), + dtype=mindietorch.dtype.INT64)] + input_info.extend(past_cross_key_infos) + input_info.extend(past_cross_value_infos) + input_info.extend(past_key_infos) + input_info.extend(past_value_infos) + input_info.extend(encoder_attention_mask_info) + input_info.extend(decoder_input_ids_info) + buffer = [] + for _ in range(2*model.config.num_layers): + buffer.append(math.ceil((args.max_batchsize * args.max_input_seq_len * model.config.d_model * 2) / 1024 / 1024)) + buffer_size0 = math.ceil((args.max_batchsize * 1 * model.config.vocab_size * 4) / 1024 / 1024) + buffer.append(buffer_size0) + print("buffer=",buffer) compiled_model = mindietorch.compile( traced_model, - inputs=[mindietorch.Input(min_shape =(1, 1), - max_shape = (args.max_batchsize,1), - dtype=mindietorch.dtype.INT64), - - mindietorch.Input(min_shape =(1, 1, model.config.d_model), - max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), - dtype=mindietorch.dtype.FLOAT16), - - mindietorch.Input(min_shape = (1,1), - max_shape =(args.max_batchsize,args.max_input_seq_len), - dtype=mindietorch.dtype.INT64), - mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, model.config.num_heads, 0, model.config.d_kv), - max_shape = (model.config.num_layers, 2, args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), - dtype=mindietorch.dtype.FLOAT16), - - mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, 1, model.config.d_kv*model.config.num_heads), - max_shape = (model.config.num_layers, 2, args.max_batchsize, args.max_input_seq_len, model.config.d_kv*model.config.num_heads), - dtype=mindietorch.dtype.FLOAT16)], + inputs=input_info, allow_tensor_replace_int=True, require_full_compilation=False, truncate_long_and_double=True, precision_policy=mindietorch.PrecisionPolicy.FP16, soc_version="Ascend910B4", + default_buffer_size_vec=buffer, optimization_level=0 ) compiled_model.save(compiled_path) + def main(): args = parse_arguments() @@ -175,7 +188,5 @@ def main(): print("export decoder_model done!") - - if __name__ == "__main__": main() -- Gitee From 1b0910e7859a1879c53ea3bbd8dbef4729845f99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 09:24:40 +0000 Subject: [PATCH 
059/110] update MindIE/MindIE-Torch/built-in/T5/main.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/main.py b/MindIE/MindIE-Torch/built-in/T5/main.py index e1ec51d66a..8ac34ceec5 100644 --- a/MindIE/MindIE-Torch/built-in/T5/main.py +++ b/MindIE/MindIE-Torch/built-in/T5/main.py @@ -1,7 +1,6 @@ import torch import time import argparse -import torch_npu from transformers import T5ForConditionalGeneration, AutoTokenizer, T5Config def parse_args(): -- Gitee From b99351f5b9be33faaf1c5183962b80f63d90128d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 09:27:02 +0000 Subject: [PATCH 060/110] add MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/T5_modeling_t5_patch.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py new file mode 100644 index 0000000000..e304f4f9f2 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -0,0 +1,28 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import transformers + + +def main(): + transformers_path = transformers.__path__ + transformers_version = transformers.__version__ + + assert transformers_version =='4.42.0', "expectation transformers==4.42.0" + os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') + + +if __name__ == '__main__': + main() -- Gitee From f3a88de8c3514363f2a01044f8788750ce5ebd31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 09:35:51 +0000 Subject: [PATCH 061/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index f518880708..b677c10796 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -67,21 +67,9 @@ 3. 
代码修改,在T5目录下 执行命令: - - ```bash - python T5_modeling_outputs_patch.py - ``` - ```bash python T5_modeling_t5_patch.py ``` - - ```bash - python T5_modeling_utils_patch.py - ``` - ```bash - python T5_utils_patch.py - ``` 4.导出mindietorch模型 ```bash python export_t5.py --output_dir {output_path} --model_path {model_path} --max_batchsize {max_batchsize} --max_input_seq_len {max_input_seq_len} --device_id {device_id} -- Gitee From 29e928df2600fe854fc83dc8fabe7dc583879101 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 10:37:15 +0000 Subject: [PATCH 062/110] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20MT5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/MT5/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/MT5/.keep diff --git a/MindIE/MindIE-Torch/built-in/MT5/.keep b/MindIE/MindIE-Torch/built-in/MT5/.keep new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From b35c9a5cd3710ac2f6a65f2f20533c02b6c57942 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 11:27:16 +0000 Subject: [PATCH 063/110] add MindIE/MindIE-Torch/built-in/MT5/export_mt5.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../MindIE-Torch/built-in/MT5/export_mt5.py | 192 ++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/MT5/export_mt5.py diff --git a/MindIE/MindIE-Torch/built-in/MT5/export_mt5.py b/MindIE/MindIE-Torch/built-in/MT5/export_mt5.py new file mode 100644 index 0000000000..138728fc16 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/MT5/export_mt5.py @@ -0,0 +1,192 @@ + +import torch +import torch_npu +import argparse +import os +import math +import mindietorch +from transformers import MT5ForConditionalGeneration + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--output_dir", + type=str, + default="./models", + help="save dir" + ) + parser.add_argument( + "--model_path", + type=str, + default="./MT5-Small", + help="T5 model path" + ) + parser.add_argument( + "--max_batchsize", + type=int, + default=1, + help="max batchsize when running" + ) + + parser.add_argument( + "--max_input_seq_len", + type=int, + default=256, + help="max input_sequence length when running" + ) + + + parser.add_argument( + "--device_id", + type=int, + default=0, + help="npu device id" + ) + return parser.parse_args() + + +class TextEncoderExport(torch.nn.Module): + def __init__(self, textencoder_model): + super(TextEncoderExport, self).__init__() + self.textencoder_model = textencoder_model + + def forward(self, input_ids): + return self.textencoder_model(input_ids=input_ids) + +class TextDecoderExport(torch.nn.Module): + def __init__(self, textdecoder_model): + super(TextDecoderExport, self).__init__() + self.textdecoder_model = textdecoder_model + + def forward(self, + *args): + return self.textdecoder_model(*args) + +def export_textencoder(args, model, save_dir, batch_size): + encoder_path = os.path.join(save_dir, "encoder") + if not os.path.exists(encoder_path): + os.makedirs(encoder_path, mode=0o640) + traced_path = os.path.join(encoder_path, "encoder.pt") + compiled_path = os.path.join(encoder_path, "encoder_compiled.pt") + if not os.path.exists(traced_path): + text_encoder = model.encoder + dummy_input = ( + torch.ones([1, 128], dtype=torch.int64).npu() + ) + encoder = 
TextEncoderExport(text_encoder) + encoder.eval() + torch.jit.trace(encoder, dummy_input, strict=False).save(traced_path) + if not os.path.exists(compiled_path): + traced_model = torch.jit.load(traced_path).eval() + + inputs0 = [] + inputs0.append(mindietorch.Input(min_shape = (1,1), max_shape= (args.max_batchsize, args.max_input_seq_len), dtype=torch.int64)) + print("compiling encoder") + compiled_model = mindietorch.compile( + traced_model, + inputs=inputs0, + allow_tensor_replace_int=True, + require_full_compilation=False, + truncate_long_and_double=True, + precision_policy=mindietorch.PrecisionPolicy.FP16, + soc_version="Ascend910B4", + optimization_level=0 + ) + compiled_model.save(compiled_path) + +def export_textdecoder(args, model, save_dir, batch_size): + decoder_path = os.path.join(save_dir, "decoder") + if not os.path.exists(decoder_path): + os.makedirs(decoder_path, mode=0o640) + traced_path = os.path.join(decoder_path, "decoder.pt") + compiled_path = os.path.join(decoder_path, "decoder_compiled.pt") + model_path = args.model_path + max_lenth = 120 + if not os.path.exists(traced_path): + text_decoder = model + all_past_keys = [torch.randn([1, model.config.num_heads, 1, model.config.d_kv]).to(torch.float16).npu()] * model.config.num_layers + all_past_values = [torch.randn([1, model.config.num_heads, 1, model.config.d_kv]).to(torch.float16).npu()] * model.config.num_layers + all_past_cross_keys = [torch.randn([1, 16, model.config.num_heads * model.config.d_kv]).to(torch.float16).npu()] * model.config.num_layers + all_past_cross_values = [torch.randn([1, 16, model.config.num_heads * model.config.d_kv]).to(torch.float16).npu()] * model.config.num_layers + dummy_input = [torch.randn(1, 16, model.config.d_model).to(torch.float16).npu()] + dummy_input.extend(all_past_cross_keys) + dummy_input.extend(all_past_cross_values) + dummy_input.extend(all_past_keys) + dummy_input.extend(all_past_values) + dummy_input.append(torch.ones(1,16).npu()) + dummy_input.append(torch.ones([1, 1], dtype=torch.int64).npu()) + decoder = TextDecoderExport(text_decoder).npu() + decoder.eval() + torch.jit.trace(decoder, dummy_input,strict=False).save(traced_path) + if not os.path.exists(compiled_path): + traced_model = torch.jit.load(traced_path).eval() + print("compiling decoder") + input_info = [mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), + dtype=mindietorch.dtype.FLOAT16)] + past_cross_key_infos = [mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_cross_value_infos = [mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_key_infos = [mindietorch.Input(min_shape =(1, model.config.num_heads, 0, model.config.d_kv), + max_shape=(args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_value_infos = [mindietorch.Input(min_shape =(1, model.config.num_heads, 0, model.config.d_kv), + max_shape=(args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + decoder_input_ids_info = [mindietorch.Input(min_shape =(1, 1), + max_shape 
= (args.max_batchsize,1), + dtype=mindietorch.dtype.INT64)] + encoder_attention_mask_info = [mindietorch.Input(min_shape =(1, 1), + max_shape = (args.max_batchsize,args.max_input_seq_len), + dtype=mindietorch.dtype.INT64)] + input_info.extend(past_cross_key_infos) + input_info.extend(past_cross_value_infos) + input_info.extend(past_key_infos) + input_info.extend(past_value_infos) + input_info.extend(encoder_attention_mask_info) + input_info.extend(decoder_input_ids_info) + buffer = [] + for _ in range(2*model.config.num_layers): + buffer.append(math.ceil((args.max_batchsize * args.max_input_seq_len * model.config.d_model * 2) / 1024 / 1024)) + buffer_size0 = math.ceil((args.max_batchsize * 1 * model.config.vocab_size * 4) / 1024 / 1024) + buffer.append(buffer_size0) + print("buffer=",buffer) + compiled_model = mindietorch.compile( + traced_model, + inputs=input_info, + allow_tensor_replace_int=True, + require_full_compilation=False, + truncate_long_and_double=True, + precision_policy=mindietorch.PrecisionPolicy.FP16, + soc_version="Ascend910B4", + default_buffer_size_vec=buffer, + optimization_level=0 + ) + compiled_model.save(compiled_path) + + +def main(): + args = parse_arguments() + device_id = args.device_id + save_dir = args.output_dir + torch.npu.set_device(device_id) + batch_size = 1 + model = MT5ForConditionalGeneration.from_pretrained(args.model_path, torch_dtype=torch.float).npu() + encoder_path = os.path.join(save_dir, "encoder") + compiled_path = os.path.join(encoder_path, "encoder_compiled.pt") + if not os.path.exists(compiled_path): + export_textencoder(args, model, save_dir, batch_size) + print("export encoder_model done!") + + decoder_path = os.path.join(save_dir, "decoder") + compiled_path = os.path.join(decoder_path, "decoder_compiled.pt") + if not os.path.exists(compiled_path): + export_textdecoder(args, model, save_dir, batch_size) + print("export decoder_model done!") + + +if __name__ == "__main__": + main() -- Gitee From fae9c1dfe5dfbc790126b3ae40f76a1b507a3c89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 11:28:22 +0000 Subject: [PATCH 064/110] add MindIE/MindIE-Torch/built-in/MT5/test_mt5.py. 
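The test script added below checks the compiled MT5 encoder against the eager float16 encoder by flattening both outputs and requiring a cosine similarity of at least 0.99. A self-contained sketch of that comparison, with a hypothetical helper name and the same threshold (plain CPU tensors stand in for the NPU outputs):

```python
import torch

def cosine_check(golden: torch.Tensor, actual: torch.Tensor, threshold: float = 0.99) -> bool:
    """Return True when the flattened outputs are sufficiently aligned."""
    sim = torch.cosine_similarity(
        golden.float().cpu().flatten(),
        actual.float().cpu().flatten(),
        dim=0,
    )
    return bool(sim >= threshold)

# Usage: a reference output and a slightly perturbed copy pass the check.
golden = torch.randn(8, 10, 512)
actual = golden + 0.01 * torch.randn_like(golden)
assert cosine_check(golden, actual)
```

A flattened cosine similarity is scale-invariant, so this check mainly catches layout mistakes or large precision regressions rather than small elementwise drift.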
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 甄文奇
---
 MindIE/MindIE-Torch/built-in/MT5/test_mt5.py | 48 ++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 MindIE/MindIE-Torch/built-in/MT5/test_mt5.py

diff --git a/MindIE/MindIE-Torch/built-in/MT5/test_mt5.py b/MindIE/MindIE-Torch/built-in/MT5/test_mt5.py
new file mode 100644
index 0000000000..c73905875e
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/MT5/test_mt5.py
@@ -0,0 +1,48 @@
+import torch
+import time
+import argparse
+import torch_npu
+from transformers import MT5ForConditionalGeneration, AutoTokenizer, MT5Config
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--hf_model_path", type=str, required=True)
+
+ parser.add_argument("--encoder_aie_path", type=str, required=True)
+ parser.add_argument("--decoder_aie_path", type=str, required=True)
+
+ parser.add_argument("--device_id", type=int, help="NPU device id", default=0)
+
+ args = parser.parse_args()
+ return args
+
+def main():
+ args = parse_args()
+ torch.npu.set_device(args.device_id)
+ model = MT5ForConditionalGeneration.from_pretrained(args.hf_model_path, torch_dtype=torch.float16).npu()
+ encoder = model.encoder
+ decoder = model.decoder
+ encoder_input = torch.randint(0,2000,(8,10), dtype=torch.int64).npu()
+ t5_config = MT5Config.from_pretrained(args.hf_model_path)
+
+ encoder_output = encoder(encoder_input)[0]
+ model = MT5ForConditionalGeneration(config=t5_config,
+ encoder_path=args.encoder_aie_path,
+ decoder_path=args.decoder_aie_path,
+ device_id=args.device_id).half().npu()
+
+ encoder_mindie = model.encoder_mindie
+ decoder_mindie = model.decoder_mindie
+ mindie_stream = model.stream
+ with torch.npu.stream(mindie_stream): # set stream
+ mindie_encoder_output = encoder_mindie(encoder_input)[0]
+ mindie_stream.synchronize() # synchronize
+ if (torch.cosine_similarity(encoder_output.cpu().flatten(), mindie_encoder_output.cpu().flatten(),dim=0)) < 0.99:
+ print("encoder precision failed")
+ else:
+ print("test OK")
+
+
+if __name__ == "__main__":
+ main()
+
-- Gitee

From c4b40d99040f21c419bbaa69565352c89833b6de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= 
Date: Wed, 11 Sep 2024 11:33:36 +0000
Subject: [PATCH 065/110] add MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch.
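The modeling_mt5.patch added below mirrors the earlier T5 patch: each layer's nested `past_key_value` tuple is split into separate `past_keys`, `past_values`, `past_cross_keys` and `past_cross_values` arguments so the traced decoder only ever sees flat tensor inputs, and a local `Seq2SeqLMOutput` gains `past_keys`/`past_values` fields to carry them back out. A minimal sketch of converting between the stock nested layout and the flat lists, assuming the standard Hugging Face 4-tuple-per-layer legacy cache format; the helper names are illustrative and not part of the patch:

```python
from typing import Tuple
import torch

# One entry per layer: (self_key, self_value, cross_key, cross_value)
LegacyCache = Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], ...]

def split_legacy_cache(past_key_values: LegacyCache):
    """Split the nested layout into four flat per-layer lists."""
    past_keys = [layer[0] for layer in past_key_values]
    past_values = [layer[1] for layer in past_key_values]
    past_cross_keys = [layer[2] for layer in past_key_values]
    past_cross_values = [layer[3] for layer in past_key_values]
    return past_keys, past_values, past_cross_keys, past_cross_values

def merge_to_legacy_cache(past_keys, past_values, past_cross_keys, past_cross_values) -> LegacyCache:
    """Rebuild the nested layout from the flat lists."""
    return tuple(
        (k, v, ck, cv)
        for k, v, ck, cv in zip(past_keys, past_values, past_cross_keys, past_cross_values)
    )

# Round-trip with dummy shapes (batch=1, heads=8, seq=4, d_kv=64, 6 layers).
legacy = tuple(tuple(torch.zeros(1, 8, 4, 64) for _ in range(4)) for _ in range(6))
flat = split_legacy_cache(legacy)
roundtrip = merge_to_legacy_cache(*flat)
assert all(
    torch.equal(a, b)
    for layer_a, layer_b in zip(roundtrip, legacy)
    for a, b in zip(layer_a, layer_b)
)
```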
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/MT5/modeling_mt5.patch | 1557 +++++++++++++++++ 1 file changed, 1557 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch diff --git a/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch b/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch new file mode 100644 index 0000000000..0df148b2ea --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch @@ -0,0 +1,1557 @@ +diff --git a/modeling_mt5_origin.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py +index 1336b9196..5b94d69c7 100644 +--- a/modeling_mt5_origin.py ++++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py +@@ -19,22 +19,26 @@ import math + import os + import warnings + from typing import List, Optional, Tuple, Union +- ++from dataclasses import dataclass + import torch + from torch import nn + from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss ++# import torch_npu ++import mindietorch ++ ++ ++ + + from ...activations import ACT2FN + from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, +- Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, + TokenClassifierOutput, + ) +-from ...modeling_utils import PreTrainedModel ++from ...modeling_utils import PreTrainedModel,ModuleUtilsMixin + from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer + from ...utils import ( + DUMMY_INPUTS, +@@ -47,8 +51,44 @@ from ...utils import ( + ) + from ...utils.model_parallel_utils import assert_device_map, get_device_map + from .configuration_mt5 import MT5Config ++from transformers.generation.logits_process import LogitsProcessorList ++from transformers.generation.stopping_criteria import StoppingCriteriaList ++from transformers.generation.configuration_utils import GenerationMode ++from transformers.utils.generic import ModelOutput + + ++@dataclass ++class Seq2SeqLMOutput(ModelOutput): ++ """ ++ Base class for model's outputs, with potential hidden states and attentions. ++ ++ Args: ++ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): ++ Sequence of hidden-states at the output of the last layer of the model. ++ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): ++ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + ++ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. ++ ++ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. ++ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): ++ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, ++ sequence_length)`. ++ ++ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention ++ heads. 
++ """ ++ loss: Optional[torch.FloatTensor] = None ++ logits: torch.FloatTensor = None ++ past_keys: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ past_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None ++ decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None ++ cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None ++ encoder_last_hidden_state: Optional[torch.FloatTensor] = None ++ encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None ++ encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None ++ + logger = logging.get_logger(__name__) + + _CONFIG_FOR_DOC = "MT5Config" +@@ -323,7 +363,10 @@ class MT5Attention(nn.Module): + mask=None, + key_value_states=None, + position_bias=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, ++ past_cross_key=None, ++ past_cross_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, +@@ -339,17 +382,15 @@ class MT5Attention(nn.Module): + + real_seq_length = seq_length + +- if past_key_value is not None: +- if len(past_key_value) != 2: +- raise ValueError( +- f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" +- ) +- real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length ++ if past_key is not None: ++ real_seq_length += past_key.shape[2] if query_length is None else query_length + + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] + + def shape(states): + """projection""" ++ # import pdb ++ # pdb.set_trace() + return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + def unshape(states): +@@ -368,16 +409,17 @@ class MT5Attention(nn.Module): + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: ++ past_key_value = shape(past_key_value) + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) + hidden_states = torch.cat([past_key_value, hidden_states], dim=2) +- elif past_key_value.shape[2] != key_value_states.shape[1]: +- # checking that the `sequence_length` of the `past_key_value` is the same as +- # the provided `key_value_states` to support prefix tuning +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(key_value_states)) ++ # elif past_key_value.shape[2] != key_value_states.shape[1]: ++ # # checking that the `sequence_length` of the `past_key_value` is the same as ++ # # the provided `key_value_states` to support prefix tuning ++ # # cross-attn ++ # # (batch_size, n_heads, seq_length, dim_per_head) ++ # hidden_states = shape(proj_layer(key_value_states)) + else: + # cross-attn + hidden_states = past_key_value +@@ -388,10 +430,10 @@ class MT5Attention(nn.Module): + + # get key/value states + key_states = project( +- hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None ++ hidden_states, self.k, key_value_states, past_key if past_key is not None else None + ) + value_states = project( +- hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None ++ hidden_states, self.v, key_value_states, past_value if past_value is not None else None + ) + + # compute scores +@@ -411,7 +453,7 @@ class MT5Attention(nn.Module): + + # if key and values are already 
calculated + # we want only the last query position bias +- if past_key_value is not None: ++ if past_key is not None: + position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + + if mask is not None: +@@ -439,14 +481,124 @@ class MT5Attention(nn.Module): + attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) + +- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None +- outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- ++ # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None ++ present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None ++ present_value_state = (value_states.half(),) if (self.is_decoder and use_cache) else None ++ outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) ++ + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + ++class MT5SelfAttention(MT5Attention): ++ def __init__(self, config: MT5Config, has_relative_attention_bias=False): ++ super().__init__(config, has_relative_attention_bias) ++ ++ def forward( ++ self, ++ hidden_states, ++ mask=None, ++ position_bias=None, ++ past_key=None, ++ past_value=None, ++ layer_head_mask=None, ++ use_cache=False, ++ output_attentions=False, ++ ): ++ """ ++ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). ++ """ ++ # Input is (batch_size, seq_length, dim) ++ # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) ++ # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) ++ batch_size, seq_length = hidden_states.shape[:2] ++ ++ real_seq_length = seq_length ++ ++ if past_key is not None: ++ real_seq_length += past_key.shape[2] ++ key_length = real_seq_length ++ def shape(states): ++ """projection""" ++ return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) ++ ++ def unshape(states): ++ """reshape""" ++ return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) ++ ++ def project(hidden_states, proj_layer, past_key_value): ++ """projects hidden states correctly to key/query states""" ++ if past_key_value is None: ++ # cross-attn ++ # (batch_size, n_heads, seq_length, dim_per_head) ++ hidden_states = shape(proj_layer(hidden_states)) ++ ++ if past_key_value is not None: ++ hidden_states = shape(proj_layer(hidden_states)) ++ hidden_states = torch.cat([past_key_value, hidden_states], dim=2) ++ return hidden_states ++ ++ # get query states ++ query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) ++ ++ # get key/value states ++ key_states = project( ++ hidden_states, self.k, past_key if past_key is not None else None ++ ) ++ value_states = project( ++ hidden_states, self.v, past_value if past_value is not None else None ++ ) ++ # compute scores ++ scores = torch.matmul( ++ query_states, key_states.transpose(3, 2) ++ ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 ++ if position_bias is None: ++ if not self.has_relative_attention_bias: ++ position_bias = torch.zeros( ++ (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype ++ ) ++ if self.gradient_checkpointing and self.training: ++ position_bias.requires_grad = True ++ else: ++ position_bias = 
self.compute_bias(real_seq_length, key_length, device=scores.device) ++ ++ # if key and values are already calculated ++ # we want only the last query position bias ++ if past_key is not None: ++ position_bias = position_bias[:, :, -hidden_states.size(1) :, :] ++ if mask is not None: ++ position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) ++ ++ if self.pruned_heads: ++ mask = torch.ones(position_bias.shape[1]) ++ mask[list(self.pruned_heads)] = 0 ++ position_bias_masked = position_bias[:, mask.bool()] ++ else: ++ position_bias_masked = position_bias ++ scores += position_bias_masked ++ attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( ++ scores ++ ) # (batch_size, n_heads, seq_length, key_length) ++ attn_weights = nn.functional.dropout( ++ attn_weights, p=self.dropout, training=self.training ++ ) # (batch_size, n_heads, seq_length, key_length) ++ ++ # Mask heads if we want to ++ if layer_head_mask is not None: ++ attn_weights = attn_weights * layer_head_mask ++ ++ attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) ++ attn_output = self.o(attn_output) ++ ++ # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None ++ present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None ++ present_value_state = (value_states.half(), ) if (self.is_decoder and use_cache) else None ++ outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) ++ if output_attentions: ++ outputs = outputs + (attn_weights,) ++ return outputs ++ + # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->MT5 + class MT5LayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): +@@ -461,7 +613,8 @@ class MT5LayerSelfAttention(nn.Module): + attention_mask=None, + position_bias=None, + layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, + use_cache=False, + output_attentions=False, + ): +@@ -471,7 +624,8 @@ class MT5LayerSelfAttention(nn.Module): + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -495,7 +649,8 @@ class MT5LayerCrossAttention(nn.Module): + attention_mask=None, + position_bias=None, + layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, + use_cache=False, + query_length=None, + output_attentions=False, +@@ -507,7 +662,8 @@ class MT5LayerCrossAttention(nn.Module): + key_value_states=key_value_states, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, + use_cache=use_cache, + query_length=query_length, + output_attentions=output_attentions, +@@ -539,39 +695,34 @@ class MT5Block(nn.Module): + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, ++ past_cross_key=None, ++ past_cross_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): +- if past_key_value is not None: +- if not self.is_decoder: +- logger.warning("`past_key_values` is passed to the encoder. 
Please make sure this is intended.") +- expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 +- +- if len(past_key_value) != expected_num_past_key_values: +- raise ValueError( +- f"There should be {expected_num_past_key_values} past states. " +- f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}" +- f"Got {len(past_key_value)} past key / value states" +- ) +- +- self_attn_past_key_value = past_key_value[:2] +- cross_attn_past_key_value = past_key_value[2:] ++ if past_key is not None: ++ self_attn_past_key = past_key ++ self_attn_past_value = past_value ++ cross_attn_past_key = past_cross_key ++ cross_attn_past_value = past_cross_value + else: +- self_attn_past_key_value, cross_attn_past_key_value = None, None ++ self_attn_past_key, self_attn_past_value, cross_attn_past_key, cross_attn_past_value = None, None, None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=self_attn_past_key_value, ++ past_key=self_attn_past_key, ++ past_value=self_attn_past_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +- hidden_states, present_key_value_state = self_attention_outputs[:2] +- attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights ++ hidden_states, present_key_state, present_value_state = self_attention_outputs[:3] ++ attention_outputs = self_attention_outputs[3:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: +@@ -586,8 +737,8 @@ class MT5Block(nn.Module): + if do_cross_attention: + # the actual query length is unknown for cross attention + # if using past key value states. 
Need to inject it here +- if present_key_value_state is not None: +- query_length = present_key_value_state[0].shape[2] ++ if present_key_state is not None: ++ query_length = present_key_state[0].shape[2] + else: + query_length = None + +@@ -597,7 +748,8 @@ class MT5Block(nn.Module): + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, +- past_key_value=cross_attn_past_key_value, ++ past_key=cross_attn_past_key, ++ past_value=cross_attn_past_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, +@@ -614,11 +766,9 @@ class MT5Block(nn.Module): + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states +- if present_key_value_state is not None: +- present_key_value_state = present_key_value_state + cross_attention_outputs[1] +- ++ # cross_attn_past_key_values = cross_attention_outputs[1] + # Keep cross-attention outputs and relative position weights +- attention_outputs = attention_outputs + cross_attention_outputs[2:] ++ attention_outputs = attention_outputs + cross_attention_outputs[3:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) +@@ -635,7 +785,7 @@ class MT5Block(nn.Module): + outputs = (hidden_states,) + + if use_cache: +- outputs = outputs + (present_key_value_state,) + attention_outputs ++ outputs = outputs + (present_key_state,) +(present_value_state,)+ attention_outputs + else: + outputs = outputs + attention_outputs + +@@ -884,11 +1034,15 @@ class MT5PreTrainedModel(PreTrainedModel): + + # Copied from transformers.models.t5.modeling_t5.T5Stack with T5->MT5 + class MT5Stack(MT5PreTrainedModel): +- def __init__(self, config, embed_tokens=None): ++ def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskey=None, encodecrossvalue=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder ++ self.lm_head=lm_head ++ self.encodecrosskey = encodecrosskey ++ self.encodecrossvalue = encodecrossvalue ++ self.model_dim = config.d_model + + self.block = nn.ModuleList( + [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] +@@ -953,20 +1107,63 @@ class MT5Stack(MT5PreTrainedModel): + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + ++ def invert_attention_mask(self, encoder_attention_mask): ++ if encoder_attention_mask.dim() == 3: ++ encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] ++ if encoder_attention_mask.dim() == 2: ++ encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] ++ encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility ++ ++ encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 ++ ++ return encoder_extended_attention_mask ++ ++ def get_extended_attention_mask( ++ self, attention_mask, input_shape, device=None, dtype=None ++ ): ++ if dtype is None: ++ dtype = self.dtype ++ ++ if not (attention_mask.dim() == 2 and self.config.is_decoder): ++ if device is not None: ++ warnings.warn( ++ "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning ++ ) ++ if attention_mask.dim() == 3: ++ extended_attention_mask = attention_mask[:, None, :, :] ++ elif attention_mask.dim() == 2: ++ if self.config.is_decoder: ++ extended_attention_mask = 
ModuleUtilsMixin.create_extended_attention_mask_for_decoder( ++ input_shape, attention_mask, device ++ ) ++ else: ++ extended_attention_mask = attention_mask[:, None, None, :] ++ else: ++ raise ValueError( ++ f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ++ ) ++ extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility ++ extended_attention_mask = (1.0 - extended_attention_mask) * -1000 ++ return extended_attention_mask ++ + def forward( + self, + input_ids=None, +- attention_mask=None, + encoder_hidden_states=None, ++ past_keys=None, ++ past_values=None, ++ past_cross_keys=None, ++ past_cross_values=None, + encoder_attention_mask=None, ++ attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, +- past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, ++ **model_kwargs + ): + # Model parallel + if self.model_parallel: +@@ -985,8 +1182,10 @@ class MT5Stack(MT5PreTrainedModel): + f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" + ) + elif input_ids is not None: ++ + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) ++ input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: +@@ -999,18 +1198,19 @@ class MT5Stack(MT5PreTrainedModel): + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape +- + # required mask seq length can be calculated via length of past +- mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length ++ mask_seq_length = past_keys[0].shape[2] + seq_length if past_keys is not None else seq_length + + if use_cache is True: + if not self.is_decoder: + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") + + # initialize past_key_values with `None` if past does not exist +- if past_key_values is None: +- past_key_values = [None] * len(self.block) +- ++ if not self.is_decoder: ++ past_keys = [None] * len(self.block) ++ past_values = [None] * len(self.block) ++ past_cross_keys = [None] * len(self.block) ++ past_cross_values = [None] * len(self.block) + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + +@@ -1041,7 +1241,8 @@ class MT5Stack(MT5PreTrainedModel): + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) +- present_key_value_states = () if use_cache else None ++ present_key_states = () if use_cache else None ++ present_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None +@@ -1049,8 +1250,8 @@ class MT5Stack(MT5PreTrainedModel): + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) +- +- for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): ++ # for i, layer_module in enumerate(self.block): ++ for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): + layer_head_mask = head_mask[i] + 
cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel +@@ -1099,7 +1300,10 @@ class MT5Stack(MT5PreTrainedModel): + encoder_decoder_position_bias=encoder_decoder_position_bias, + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, ++ past_cross_key=past_cross_key, ++ past_cross_value=past_cross_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -1107,19 +1311,20 @@ class MT5Stack(MT5PreTrainedModel): + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: +- layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] ++ layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] + +- hidden_states, present_key_value_state = layer_outputs[:2] ++ hidden_states, present_key_state, present_value_state = layer_outputs[:3] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) +- position_bias = layer_outputs[2] ++ position_bias = layer_outputs[3] + if self.is_decoder and encoder_hidden_states is not None: +- encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] ++ encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] + # append next layer key value states + if use_cache: +- present_key_value_states = present_key_value_states + (present_key_value_state,) ++ present_key_states = present_key_states + present_key_state ++ present_value_states = present_value_states + present_value_state + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) +@@ -1133,7 +1338,7 @@ class MT5Stack(MT5PreTrainedModel): + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) +- hidden_states = self.dropout(hidden_states) ++ hidden_states = self.dropout(hidden_states).half() + + # Add last layer + if output_hidden_states: +@@ -1151,13 +1356,216 @@ class MT5Stack(MT5PreTrainedModel): + ] + if v is not None + ) +- return BaseModelOutputWithPastAndCrossAttentions( +- last_hidden_state=hidden_states, +- past_key_values=present_key_value_states, +- hidden_states=all_hidden_states, +- attentions=all_attentions, +- cross_attentions=all_cross_attentions, ++ if not self.is_decoder: ++ cross_keys = None ++ cross_values = None ++ if self.encodecrosskey: ++ cross_keys = self.encodecrosskey(hidden_states) ++ if self.encodecrossvalue: ++ cross_values = self.encodecrossvalue(hidden_states) ++ return tuple((hidden_states, cross_keys, cross_values)) ++ lm_logits = None ++ if self.is_decoder: ++ if self.config.tie_word_embeddings: ++ hidden_states = hidden_states * (self.model_dim ** -0.5) ++ lm_logits = self.lm_head(hidden_states) ++ return tuple((lm_logits, present_key_states, present_value_states)) ++ ++ ++class MT5Stack_Encoder(MT5PreTrainedModel): ++ def __init__(self, config, embed_tokens=None, encodecrosskey=None, encodecrossvalue=None): ++ super().__init__(config) ++ self.embed_tokens = embed_tokens ++ self.is_decoder = config.is_decoder ++ self.encodecrosskey = encodecrosskey ++ self.encodecrossvalue = encodecrossvalue ++ self.model_dim = config.d_model ++ ++ self.block = 
nn.ModuleList( ++ [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + ) ++ self.final_layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) ++ self.dropout = nn.Dropout(config.dropout_rate) ++ ++ # Initialize weights and apply final processing ++ self.post_init() ++ # Model parallel ++ self.model_parallel = False ++ self.device_map = None ++ self.gradient_checkpointing = False ++ ++ def get_input_embeddings(self): ++ return self.embed_tokens ++ ++ def set_input_embeddings(self, new_embeddings): ++ self.embed_tokens = new_embeddings ++ ++ def get_extended_attention_mask( ++ self, attention_mask, input_shape, device=None, dtype=None ++ ): ++ if dtype is None: ++ dtype = self.dtype ++ ++ if not (attention_mask.dim() == 2 and self.config.is_decoder): ++ if device is not None: ++ warnings.warn( ++ "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning ++ ) ++ if attention_mask.dim() == 3: ++ extended_attention_mask = attention_mask[:, None, :, :] ++ elif attention_mask.dim() == 2: ++ if self.config.is_decoder: ++ extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( ++ input_shape, attention_mask, device ++ ) ++ else: ++ extended_attention_mask = attention_mask[:, None, None, :] ++ else: ++ raise ValueError( ++ f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ++ ) ++ extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility ++ extended_attention_mask = (1.0 - extended_attention_mask) * -1000 ++ return extended_attention_mask ++ ++ def forward( ++ self, ++ input_ids=None, ++ attention_mask=None, ++ head_mask=None, ++ cross_attn_head_mask=None, ++ use_cache=None, ++ output_attentions=None, ++ output_hidden_states=None, ++ return_dict=None, ++ **model_kwargs ++ ): ++ # Model parallel ++ use_cache = use_cache if use_cache is not None else self.config.use_cache ++ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions ++ output_hidden_states = ( ++ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ++ ) ++ return_dict = return_dict if return_dict is not None else self.config.use_return_dict ++ ++ input_shape = input_ids.size() ++ input_ids = input_ids.view(-1, input_shape[-1]) ++ ++ inputs_embeds = self.embed_tokens(input_ids) ++ ++ batch_size, seq_length = input_shape ++ # required mask seq length can be calculated via length of past ++ mask_seq_length = seq_length ++ ++ # initialize past_key_values with `None` if past does not exist ++ past_keys = [None] * len(self.block) ++ past_values = [None] * len(self.block) ++ past_cross_keys = [None] * len(self.block) ++ past_cross_values = [None] * len(self.block) ++ if attention_mask is None: ++ attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) ++ ++ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] ++ # ourselves in which case we just need to make it broadcastable to all heads. 
++ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) ++ ++ # If a 2D or 3D attention mask is provided for the cross-attention ++ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] ++ ++ encoder_extended_attention_mask = None ++ ++ # Prepare head mask if needed ++ head_mask = self.get_head_mask(head_mask, self.config.num_layers) ++ cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) ++ present_key_states = () if use_cache else None ++ present_value_states = () if use_cache else None ++ all_hidden_states = () if output_hidden_states else None ++ all_attentions = () if output_attentions else None ++ all_cross_attentions = () if (output_attentions and self.is_decoder) else None ++ position_bias = None ++ encoder_decoder_position_bias = None ++ ++ hidden_states = self.dropout(inputs_embeds) ++ for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): ++ layer_head_mask = head_mask[i] ++ cross_attn_layer_head_mask = cross_attn_head_mask[i] ++ if output_hidden_states: ++ all_hidden_states = all_hidden_states + (hidden_states,) ++ ++ layer_outputs = layer_module( ++ hidden_states, ++ attention_mask=extended_attention_mask, ++ position_bias=position_bias, ++ encoder_hidden_states=None, ++ encoder_attention_mask=encoder_extended_attention_mask, ++ encoder_decoder_position_bias=encoder_decoder_position_bias, ++ layer_head_mask=layer_head_mask, ++ cross_attn_layer_head_mask=cross_attn_layer_head_mask, ++ past_key=past_key, ++ past_value=past_value, ++ past_cross_key=past_cross_key, ++ past_cross_value=past_cross_value, ++ use_cache=use_cache, ++ output_attentions=output_attentions, ++ ) ++ ++ # layer_outputs is a tuple with: ++ # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) ++ if use_cache is False: ++ layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] ++ ++ hidden_states, present_key_state, present_value_state = layer_outputs[:3] ++ ++ # We share the position biases between the layers - the first layer store them ++ # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), ++ # (cross-attention position bias), (cross-attention weights) ++ position_bias = layer_outputs[3] ++ if self.is_decoder and encoder_hidden_states is not None: ++ encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] ++ # append next layer key value states ++ if use_cache: ++ present_key_states = present_key_states + present_key_state ++ present_value_states = present_value_states + present_value_state ++ ++ if output_attentions: ++ all_attentions = all_attentions + (layer_outputs[3],) ++ if self.is_decoder: ++ all_cross_attentions = all_cross_attentions + (layer_outputs[5],) ++ ++ # Model Parallel: If it's the last layer for that device, put things on the next device ++ if self.model_parallel: ++ for k, v in self.device_map.items(): ++ if i == v[-1] and "cuda:" + str(k) != self.last_device: ++ hidden_states = hidden_states.to("cuda:" + str(k + 1)) ++ ++ hidden_states = self.final_layer_norm(hidden_states) ++ hidden_states = self.dropout(hidden_states).half() ++ ++ # Add last layer ++ if output_hidden_states: ++ all_hidden_states = all_hidden_states + (hidden_states,) ++ ++ if not return_dict: ++ return tuple( ++ v ++ for 
v in [ ++ hidden_states, ++ present_key_value_states, ++ all_hidden_states, ++ all_attentions, ++ all_cross_attentions, ++ ] ++ if v is not None ++ ) ++ # present_key_value_states = torch.concat(present_key_value_states).reshape(len(self.block),2,*present_key_value_states[0].shape).half() if use_cache else None ++ if not self.is_decoder: ++ cross_keys = None ++ cross_values = None ++ if self.encodecrosskey: ++ cross_keys = self.encodecrosskey(hidden_states) ++ if self.encodecrossvalue: ++ cross_values = self.encodecrossvalue(hidden_states) ++ return tuple((hidden_states, cross_keys, cross_values)) + + + MT5_START_DOCSTRING = r""" +@@ -1549,6 +1957,39 @@ class MT5Model(MT5PreTrainedModel): + ) + + ++class EncoderToCrossKey(nn.Module): ++ def __init__(self, cross_key, num_heads, d_kv): ++ super().__init__() ++ self.cross_key = cross_key ++ self.num_heads = num_heads ++ self.d_kv = d_kv ++ ++ ++ def forward(self, hidden_states): ++ batch_size = hidden_states.shape[0] ++ past_cross_keys = () ++ for i in range(len(self.cross_key)): ++ past_cross_keys += (self.cross_key[i](hidden_states),) ++ # import pdb ++ # pdb.set_trace() ++ return past_cross_keys ++ ++ ++class EncoderToCrossValue(nn.Module): ++ def __init__(self, cross_value, num_heads, d_kv): ++ super().__init__() ++ self.cross_value = cross_value ++ self.num_heads = num_heads ++ self.d_kv = d_kv ++ ++ ++ def forward(self, hidden_states): ++ batch_size = hidden_states.shape[0] ++ past_cross_values = () ++ for i in range(len(self.cross_value)): ++ past_cross_values += (self.cross_value[i](hidden_states),) ++ return past_cross_values ++ + @add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING) + class MT5ForConditionalGeneration(MT5PreTrainedModel): + r""" +@@ -1573,33 +2014,52 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + + # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5 +- def __init__(self, config: MT5Config): ++ def __init__(self, config: MT5Config, encoder_path=None, decoder_path=None, device_id=0): + super().__init__(config) +- self.model_dim = config.d_model +- +- self.shared = nn.Embedding(config.vocab_size, config.d_model) +- +- encoder_config = copy.deepcopy(config) +- encoder_config.is_decoder = False +- encoder_config.use_cache = False +- encoder_config.is_encoder_decoder = False +- self.encoder = MT5Stack(encoder_config, self.shared) +- +- decoder_config = copy.deepcopy(config) +- decoder_config.is_decoder = True +- decoder_config.is_encoder_decoder = False +- decoder_config.num_layers = config.num_decoder_layers +- self.decoder = MT5Stack(decoder_config, self.shared) +- +- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) ++ self.encoder_path = encoder_path ++ self.decoder_path = decoder_path ++ self.is_mindie = False ++ if not self.encoder_path or not self.decoder_path: ++ self.model_dim = config.d_model ++ ++ self.shared = nn.Embedding(config.vocab_size, config.d_model) ++ ++ decoder_config = copy.deepcopy(config) ++ decoder_config.is_decoder = True ++ decoder_config.is_encoder_decoder = False ++ decoder_config.num_layers = config.num_decoder_layers ++ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) ++ self.decoder = MT5Stack(decoder_config, self.shared, self.lm_head) ++ cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in 
range(config.num_decoder_layers)) ++ cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) ++ encodecrosskey = EncoderToCrossKey(cross_key, config.num_heads, config.d_kv) ++ encodecrossvalue = EncoderToCrossValue(cross_value, config.num_heads, config.d_kv) ++ encoder_config = copy.deepcopy(config) ++ encoder_config.is_decoder = False ++ encoder_config.use_cache = False ++ encoder_config.is_encoder_decoder = False ++ self.encoder = MT5Stack_Encoder(encoder_config, self.shared, encodecrosskey=encodecrosskey, encodecrossvalue=encodecrossvalue) ++ self.encoder_mindie = None ++ self.decoder_mindie = None ++ if self.encoder_path: ++ self.encoder_mindie = torch.jit.load(self.encoder_path) ++ self.is_mindie = True ++ if self.decoder_path: ++ self.decoder_mindie = torch.jit.load(self.decoder_path) ++ self.stream = torch.npu.Stream(f"npu:{device_id}") ++ self.device_id = device_id + + # Initialize weights and apply final processing +- self.post_init() ++ if not self.is_mindie: ++ self.post_init() + + # Model parallel + self.model_parallel = False + self.device_map = None + ++ def get_device(self): ++ return f"npu:{self.device_id}" ++ + @add_start_docstrings(PARALLELIZE_DOCSTRING) + # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.parallelize + def parallelize(self, device_map=None): +@@ -1666,25 +2126,7 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): + @add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward with T5->MT5, t5->mt5 +- def forward( +- self, +- input_ids: Optional[torch.LongTensor] = None, +- attention_mask: Optional[torch.FloatTensor] = None, +- decoder_input_ids: Optional[torch.LongTensor] = None, +- decoder_attention_mask: Optional[torch.BoolTensor] = None, +- head_mask: Optional[torch.FloatTensor] = None, +- decoder_head_mask: Optional[torch.FloatTensor] = None, +- cross_attn_head_mask: Optional[torch.Tensor] = None, +- encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- inputs_embeds: Optional[torch.FloatTensor] = None, +- decoder_inputs_embeds: Optional[torch.FloatTensor] = None, +- labels: Optional[torch.LongTensor] = None, +- use_cache: Optional[bool] = None, +- output_attentions: Optional[bool] = None, +- output_hidden_states: Optional[bool] = None, +- return_dict: Optional[bool] = None, +- ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: ++ def forward(self,*args) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., +@@ -1716,114 +2158,37 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): + >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) + >>> # studies have shown that owning a dog is good for you. 
+ ```""" +- use_cache = use_cache if use_cache is not None else self.config.use_cache +- return_dict = return_dict if return_dict is not None else self.config.use_return_dict +- +- # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +- if head_mask is not None and decoder_head_mask is None: +- if self.config.num_layers == self.config.num_decoder_layers: +- warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) +- decoder_head_mask = head_mask +- +- # Encode if needed (training, first prediction pass) +- if encoder_outputs is None: +- # Convert encoder inputs in embeddings if needed +- encoder_outputs = self.encoder( +- input_ids=input_ids, +- attention_mask=attention_mask, +- inputs_embeds=inputs_embeds, +- head_mask=head_mask, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): +- encoder_outputs = BaseModelOutput( +- last_hidden_state=encoder_outputs[0], +- hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, +- attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, +- ) +- +- hidden_states = encoder_outputs[0] +- +- if self.model_parallel: +- torch.cuda.set_device(self.decoder.first_device) +- +- if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: +- # get decoder inputs from shifting lm labels to the right +- decoder_input_ids = self._shift_right(labels) +- +- # Set device for model parallelism +- if self.model_parallel: +- torch.cuda.set_device(self.decoder.first_device) +- hidden_states = hidden_states.to(self.decoder.first_device) +- if decoder_input_ids is not None: +- decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) +- if attention_mask is not None: +- attention_mask = attention_mask.to(self.decoder.first_device) +- if decoder_attention_mask is not None: +- decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) +- +- # Decode +- decoder_outputs = self.decoder( +- input_ids=decoder_input_ids, +- attention_mask=decoder_attention_mask, +- inputs_embeds=decoder_inputs_embeds, +- past_key_values=past_key_values, +- encoder_hidden_states=hidden_states, +- encoder_attention_mask=attention_mask, +- head_mask=decoder_head_mask, +- cross_attn_head_mask=cross_attn_head_mask, +- use_cache=use_cache, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- +- sequence_output = decoder_outputs[0] +- +- # Set device for model parallelism +- if self.model_parallel: +- torch.cuda.set_device(self.encoder.first_device) +- self.lm_head = self.lm_head.to(self.encoder.first_device) +- sequence_output = sequence_output.to(self.lm_head.weight.device) +- +- if self.config.tie_word_embeddings: +- # Rescale output before projecting on vocab +- # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 +- sequence_output = sequence_output * (self.model_dim**-0.5) +- +- lm_logits = self.lm_head(sequence_output) ++ if self.is_mindie: ++ with torch.npu.stream(self.stream): # set stream ++ decoder_outputs = self.decoder_mindie.forward(*args) ++ self.stream.synchronize() # synchronize ++ else: ++ hidden_states = args[0] ++ past_cross_keys = args[1:self.config.num_decoder_layers+1] ++ past_cross_values = args[self.config.num_decoder_layers+1:2*self.config.num_decoder_layers+1] ++ past_keys= 
args[2*self.config.num_decoder_layers+1:3*self.config.num_decoder_layers+1] ++ past_values= args[3*self.config.num_decoder_layers+1:4*self.config.num_decoder_layers+1] ++ encoder_attention_mask = args[-2] ++ decoder_input_ids = args[-1] ++ decoder_outputs = self.decoder(input_ids=decoder_input_ids, ++ encoder_hidden_states=hidden_states, ++ past_keys=past_keys, ++ past_values=past_values, ++ past_cross_keys=past_cross_keys, ++ past_cross_values=past_cross_values, ++ encoder_attention_mask=encoder_attention_mask) ++ + + loss = None +- if labels is not None: +- loss_fct = CrossEntropyLoss(ignore_index=-100) +- # move labels to correct device to enable PP +- labels = labels.to(lm_logits.device) +- loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) +- # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 +- +- if not return_dict: +- output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs +- return ((loss,) + output) if loss is not None else output +- +- return Seq2SeqLMOutput( +- loss=loss, +- logits=lm_logits, +- past_key_values=decoder_outputs.past_key_values, +- decoder_hidden_states=decoder_outputs.hidden_states, +- decoder_attentions=decoder_outputs.attentions, +- cross_attentions=decoder_outputs.cross_attentions, +- encoder_last_hidden_state=encoder_outputs.last_hidden_state, +- encoder_hidden_states=encoder_outputs.hidden_states, +- encoder_attentions=encoder_outputs.attentions, +- ) ++ return (decoder_outputs[0],decoder_outputs[1],decoder_outputs[2]) + +- # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation + def prepare_inputs_for_generation( + self, + input_ids, +- past_key_values=None, ++ past_cross_keys=None, ++ past_cross_values=None, ++ past_keys=None, ++ past_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, +@@ -1834,8 +2199,8 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): + **kwargs, + ): + # cut decoder_input_ids if past_key_values is used +- if past_key_values is not None: +- past_length = past_key_values[0][0].shape[2] ++ if past_keys is not None: ++ past_length = past_keys[0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: +@@ -1848,7 +2213,10 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): + + return { + "decoder_input_ids": input_ids, +- "past_key_values": past_key_values, ++ "past_cross_keys":past_cross_keys, ++ "past_cross_values":past_cross_values, ++ "past_keys":past_keys, ++ "past_values":past_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, +@@ -1893,6 +2261,419 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): + reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) + return reordered_decoder_past + ++ def _prepare_encoder_decoder_kwargs_for_generation( ++ self, ++ inputs_tensor: torch.Tensor, ++ model_kwargs, ++ model_input_name, ++ generation_config, ++ ): ++ irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"] ++ encoder_kwargs = { ++ argument: value ++ for argument, value in model_kwargs.items() ++ if not any(argument.startswith(p) for p in irrelevant_prefix) ++ } ++ encoder_kwargs["output_attentions"] = generation_config.output_attentions ++ encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states ++ model_input_name = model_input_name if model_input_name is not 
None else self.main_input_name ++ encoder_kwargs["return_dict"] = True ++ encoder_kwargs[model_input_name] = inputs_tensor ++ import time ++ start_time = time.time() ++ with torch.npu.stream(self.stream): # set stream ++ encoder_outputs=self.encoder_mindie.forward(encoder_kwargs["input_ids"]) ++ self.stream.synchronize() # synchronize ++ model_kwargs["encoder_outputs"]={"last_hidden_state":encoder_outputs[0]} ++ model_kwargs["past_cross_keys"] = encoder_outputs[1] ++ model_kwargs["past_cross_values"] =encoder_outputs[2] ++ return model_kwargs ++ ++ def _update_model_kwargs_for_generation( ++ self, ++ outputs, ++ model_kwargs, ++ is_encoder_decoder = False, ++ standardize_cache_format = False, ++ num_new_tokens = 1, ++ ): ++ # update past_key_values keeping its naming used in model code ++ cache_name, cache = self._extract_past_from_model_output( ++ outputs, standardize_cache_format=standardize_cache_format ++ ) ++ model_kwargs[cache_name] = cache ++ if "past_keys" in outputs: ++ past_keys = outputs.past_keys ++ model_kwargs["past_keys"] = past_keys ++ if "past_values" in outputs: ++ past_values = outputs.past_values ++ model_kwargs["past_values"] = past_values ++ # update decoder attention mask ++ if "decoder_attention_mask" in model_kwargs: ++ decoder_attention_mask = model_kwargs["decoder_attention_mask"] ++ model_kwargs["decoder_attention_mask"] = torch.cat( ++ [decoder_attention_mask, decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1))], ++ dim=-1, ++ ) ++ return model_kwargs ++ ++ @torch.no_grad() ++ def generate( ++ self, ++ inputs = None, ++ generation_config = None, ++ logits_processor = None, ++ stopping_criteria = None, ++ prefix_allowed_tokens_fn = None, ++ assistant_model = None, ++ negative_prompt_ids = None, ++ negative_prompt_attention_mask = None, ++ **kwargs, ++ ): ++ # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call ++ import time ++ start_time = time.time() ++ self._validate_model_class() ++ tokenizer = kwargs.pop("tokenizer", None) # Pull this out first, we only use it for stopping criteria ++ generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs) ++ self._validate_model_kwargs(model_kwargs.copy()) ++ ++ ++ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() ++ stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() ++ ++ accepts_attention_mask = True ++ requires_attention_mask = "encoder_outputs" not in model_kwargs ++ kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None ++ ++ # 3. Define model inputs ++ inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs( ++ inputs, generation_config.bos_token_id, model_kwargs ++ ) ++ batch_size = inputs_tensor.shape[0] ++ ++ device = inputs_tensor.device ++ self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=device) ++ ++ # 4. 
Define other model kwargs ++ # decoder-only models with inputs_embeds forwarding must use caching (otherwise we can't detect whether we are ++ # generating the first new token or not, and we only want to use the embeddings for the first new token) ++ if not self.config.is_encoder_decoder and model_input_name == "inputs_embeds": ++ model_kwargs["use_cache"] = True ++ else: ++ model_kwargs["use_cache"] = generation_config.use_cache ++ if not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask: ++ model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( ++ inputs_tensor, generation_config.pad_token_id, generation_config.eos_token_id ++ ) ++ ++ if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs: ++ # if model is encoder decoder encoder_outputs are created and added to `model_kwargs` ++ model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation( ++ inputs_tensor, model_kwargs, model_input_name, generation_config ++ ) ++ ++ # 5. Prepare `input_ids` which will be used for auto-regressive generation ++ if self.config.is_encoder_decoder: ++ input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation( ++ batch_size=batch_size, ++ model_input_name=model_input_name, ++ model_kwargs=model_kwargs, ++ decoder_start_token_id=generation_config.decoder_start_token_id, ++ device=inputs_tensor.device, ++ ) ++ else: ++ input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids") ++ ++ if generation_config.token_healing: ++ input_ids = self.heal_tokens(input_ids, tokenizer) ++ ++ # 6. Prepare `max_length` depending on other stopping criteria. ++ input_ids_length = input_ids.shape[-1] ++ has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None ++ has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None ++ generation_config = self._prepare_generated_length( ++ generation_config=generation_config, ++ has_default_max_length=has_default_max_length, ++ has_default_min_length=has_default_min_length, ++ model_input_name=model_input_name, ++ inputs_tensor=inputs_tensor, ++ input_ids_length=input_ids_length, ++ ) ++ ++ use_dynamic_cache_by_default = False ++ if generation_config.cache_implementation is not None and model_kwargs.get("past_key_values") is not None: ++ raise ValueError( ++ "Passing both `cache_implementation` (used to initialize certain caches) and `past_key_values` (a " ++ "Cache object) is unsupported. Please use only one of the two." ++ ) ++ elif generation_config.cache_implementation is not None: ++ if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING: ++ if generation_config.cache_implementation == "static" and not self._supports_static_cache: ++ raise ValueError( ++ "This model does not support `cache_implementation='static'`. Please check the following " ++ "issue: https://github.com/huggingface/transformers/issues/28981" ++ ) ++ model_kwargs["past_key_values"] = self._get_cache( ++ generation_config.cache_implementation, ++ getattr(generation_config, "num_beams", 1) * batch_size, ++ generation_config.max_length, ++ ) ++ elif generation_config.cache_implementation == "quantized": ++ if not self._supports_quantized_cache: ++ raise ValueError( ++ "This model does not support the quantized cache. If you want your model to support quantized " ++ "cache, please open an issue." 
++ ) ++ ++ cache_config = ( ++ generation_config.cache_config ++ if generation_config.cache_config is not None ++ else QuantizedCacheConfig() ++ ) ++ cache_class = QUANT_BACKEND_CLASSES_MAPPING[cache_config.backend] ++ ++ if cache_config.backend == "quanto" and not is_quanto_available(): ++ raise ImportError( ++ "You need to install `quanto` in order to use KV cache quantization with quanto backend. " ++ "Please install it via with `pip install quanto`" ++ ) ++ elif cache_config.backend == "HQQ" and not is_hqq_available(): ++ raise ImportError( ++ "You need to install `HQQ` in order to use KV cache quantization with HQQ backend. " ++ "Please install it via with `pip install hqq`" ++ ) ++ ++ model_kwargs["past_key_values"] = cache_class(cache_config) ++ # Use DynamicCache() instance by default. This will avoid back and forth from legacy format that ++ # keeps copying the cache thus using much more memory ++ elif generation_config.cache_implementation is None and self._supports_default_dynamic_cache(): ++ past = model_kwargs.get("past_key_values", None) ++ if past is None: ++ model_kwargs["past_key_values"] = DynamicCache() ++ use_dynamic_cache_by_default = True ++ elif isinstance(past, tuple): ++ model_kwargs["past_key_values"] = DynamicCache.from_legacy_cache(past) ++ use_dynamic_cache_by_default = True ++ ++ self._validate_generated_length(generation_config, input_ids_length, has_default_max_length) ++ ++ # 7. determine generation mode ++ generation_mode = generation_config.get_generation_mode(assistant_model) ++ # 8. prepare distribution pre_processing samplers ++ prepared_logits_processor = self._get_logits_processor( ++ generation_config=generation_config, ++ input_ids_seq_length=input_ids_length, ++ encoder_input_ids=inputs_tensor, ++ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, ++ logits_processor=logits_processor, ++ device=inputs_tensor.device, ++ model_kwargs=model_kwargs, ++ negative_prompt_ids=negative_prompt_ids, ++ negative_prompt_attention_mask=negative_prompt_attention_mask, ++ ) ++ ++ # 9. prepare stopping criteria ++ prepared_stopping_criteria = self._get_stopping_criteria( ++ generation_config=generation_config, stopping_criteria=stopping_criteria, tokenizer=tokenizer, **kwargs ++ ) ++ ++ if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): ++ # 11. prepare logits warper ++ prepared_logits_warper = ( ++ self._get_logits_warper(generation_config, device=input_ids.device) ++ if generation_config.do_sample ++ else None ++ ) ++ ++ # 12. expand input_ids with `num_return_sequences` additional sequences per batch ++ input_ids, model_kwargs = self._expand_inputs_for_generation( ++ input_ids=input_ids, ++ expand_size=generation_config.num_return_sequences, ++ is_encoder_decoder=self.config.is_encoder_decoder, ++ **model_kwargs, ++ ) ++ # 13. 
run sample (it degenerates to greedy search when `generation_config.do_sample=False`) ++ result = self._sample( ++ input_ids, ++ logits_processor=prepared_logits_processor, ++ logits_warper=prepared_logits_warper, ++ stopping_criteria=prepared_stopping_criteria, ++ generation_config=generation_config, ++ **model_kwargs, ++ ) ++ return result ++ ++ def _sample( ++ self, ++ input_ids, ++ logits_processor, ++ stopping_criteria, ++ generation_config, ++ logits_warper = None, ++ **model_kwargs, ++ ): ++ # init values ++ pad_token_id = generation_config.pad_token_id ++ output_attentions = generation_config.output_attentions ++ output_hidden_states = generation_config.output_hidden_states ++ output_scores = generation_config.output_scores ++ output_logits = generation_config.output_logits ++ return_dict_in_generate = generation_config.return_dict_in_generate ++ has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria) ++ do_sample = generation_config.do_sample ++ if do_sample is True and not isinstance(logits_warper, LogitsProcessorList): ++ raise ValueError( ++ "`do_sample` is set to `True`, `logits_warper` must be a `LogitsProcessorList` instance (it is " ++ f"{logits_warper})." ++ ) ++ ++ # init attention / hidden states / scores tuples ++ scores = () if (return_dict_in_generate and output_scores) else None ++ raw_logits = () if (return_dict_in_generate and output_logits) else None ++ decoder_attentions = () if (return_dict_in_generate and output_attentions) else None ++ cross_attentions = () if (return_dict_in_generate and output_attentions) else None ++ decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None ++ ++ # if model is an encoder-decoder, retrieve encoder attention weights and hidden states ++ if return_dict_in_generate and self.config.is_encoder_decoder: ++ encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None ++ encoder_hidden_states = ( ++ model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None ++ ) ++ ++ this_peer_finished = False ++ batch_size = input_ids.shape[0] ++ unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) ++ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) ++ ++ # keep track of which sequences are already finished ++ if self.is_mindie or self.config.architectures[0]=="MT5ForConditionalGeneration": ++ num_layers = self.config.num_layers ++ num_heads = self.config.num_heads ++ d_kv = self.config.d_kv ++ model_kwargs["past_keys"] = [torch.randn(batch_size, num_heads, 0, d_kv).half().npu() for _ in range(num_layers)] ++ model_kwargs["past_values"] = [torch.randn(batch_size, num_heads, 0, d_kv).half().npu() for _ in range(num_layers)] ++ ++ ++ while self._has_unfinished_sequences(this_peer_finished, False, device=input_ids.device): ++ # prepare model inputs ++ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) ++ model_args = [model_kwargs["encoder_outputs"]["last_hidden_state"]] ++ model_args.extend(model_kwargs["past_cross_keys"]) ++ model_args.extend(model_kwargs["past_cross_values"]) ++ model_args.extend(model_inputs["past_keys"]) ++ model_args.extend(model_inputs["past_values"]) ++ model_args.append(model_inputs["attention_mask"]) ++ model_args.append(model_inputs["decoder_input_ids"]) ++ ++ # forward pass to get next token ++ outputs = self(*model_args) ++ outputs = Seq2SeqLMOutput(logits=outputs[0], ++ past_keys=outputs[1], 
++ past_values=outputs[2]) ++ ++ # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration ++ # (the clone itself is always small) ++ next_token_logits = outputs.logits[:, -1, :].clone() ++ ++ # pre-process distribution ++ next_token_scores = logits_processor(input_ids, next_token_logits) ++ if do_sample: ++ next_token_scores = logits_warper(input_ids, next_token_scores) ++ ++ # Store scores, attentions and hidden_states when required ++ if return_dict_in_generate: ++ if output_scores: ++ scores += (next_token_scores,) ++ if output_logits: ++ raw_logits += (next_token_logits,) ++ if output_attentions: ++ decoder_attentions += ( ++ (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) ++ ) ++ if self.config.is_encoder_decoder: ++ cross_attentions += (outputs.cross_attentions,) ++ ++ if output_hidden_states: ++ decoder_hidden_states += ( ++ (outputs.decoder_hidden_states,) ++ if self.config.is_encoder_decoder ++ else (outputs.hidden_states,) ++ ) ++ ++ # token selection ++ if do_sample: ++ probs = nn.functional.softmax(next_token_scores, dim=-1) ++ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) ++ else: ++ next_tokens = torch.argmax(next_token_scores, dim=-1) ++ ++ # finished sentences should have their next token be a padding token ++ if has_eos_stopping_criteria: ++ next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) ++ ++ # update generated ids, model inputs, and length for next step ++ input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) ++ model_kwargs = self._update_model_kwargs_for_generation( ++ outputs, ++ model_kwargs, ++ is_encoder_decoder=self.config.is_encoder_decoder, ++ ) ++ unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) ++ this_peer_finished = unfinished_sequences.max() == 0 ++ # This is needed to properly delete outputs.logits which may be very large for first iteration ++ # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration ++ del outputs ++ return input_ids ++ ++ def invert_attention_mask(self, encoder_attention_mask): ++ if encoder_attention_mask.dim() == 3: ++ encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] ++ if encoder_attention_mask.dim() == 2: ++ encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] ++ encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility ++ ++ encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 ++ ++ return encoder_extended_attention_mask ++ ++ @property ++ def device(self) -> torch.device: ++ """ ++ `torch.device`: The device on which the module is (assuming that all the module parameters are on the same ++ device). 
++        """
++        return self.get_device()
++
++    def get_extended_attention_mask(
++        self, attention_mask, input_shape, device=None, dtype=None
++    ):
++        if dtype is None:
++            dtype = self.dtype
++
++        if not (attention_mask.dim() == 2 and self.config.is_decoder):
++            if device is not None:
++                warnings.warn(
++                    "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
++                )
++        if attention_mask.dim() == 3:
++            extended_attention_mask = attention_mask[:, None, :, :]
++        elif attention_mask.dim() == 2:
++            if self.config.is_decoder:
++                extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder(
++                    input_shape, attention_mask, device
++                )
++            else:
++                extended_attention_mask = attention_mask[:, None, None, :]
++        else:
++            raise ValueError(
++                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
++            )
++        extended_attention_mask = extended_attention_mask.to(dtype=dtype)  # fp16 compatibility
++        extended_attention_mask = (1.0 - extended_attention_mask) * -1000
++        return extended_attention_mask
++
+
+ @add_start_docstrings(
+     "The bare MT5 Model transformer outputting encoder's raw hidden-states without any specific head on top.",
-- 
Gitee

From 516533d1fc741d8d03c2300df62b2a6cd0e0e155 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Wed, 11 Sep 2024 11:35:07 +0000
Subject: [PATCH 066/110] add MindIE/MindIE-Torch/built-in/MT5.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 甄文奇
---
 MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py

diff --git a/MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py b/MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py
new file mode 100644
index 0000000000..e69de29bb2
-- 
Gitee

From 8a3a411bd493f5800c919891bd7178a9b853a8d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Wed, 11 Sep 2024 11:36:01 +0000
Subject: [PATCH 067/110] update MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 甄文奇
---
 .../built-in/MT5/MT5_modeling_patch.py        | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py b/MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py
index e69de29bb2..35a6ec8613 100644
--- a/MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py
+++ b/MindIE/MindIE-Torch/built-in/MT5/MT5_modeling_patch.py
@@ -0,0 +1,28 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import transformers
+
+
+def main():
+    transformers_path = transformers.__path__
+    transformers_version = transformers.__version__
+
+    assert transformers_version == '4.42.0', "transformers==4.42.0 is required"
+    os.system(f'patch -p0 {transformers_path[0]}/models/mt5/modeling_mt5.py modeling_mt5.patch')
+
+
+if __name__ == '__main__':
+    main()
-- 
Gitee

From e115bbf46c896f60a881abe45a3decc58bee355a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Wed, 11 Sep 2024 11:36:10 +0000
Subject: [PATCH 068/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?=
 =?UTF-8?q?MindIE/MindIE-Torch/built-in/MT5/.keep?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 MindIE/MindIE-Torch/built-in/MT5/.keep | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 MindIE/MindIE-Torch/built-in/MT5/.keep

diff --git a/MindIE/MindIE-Torch/built-in/MT5/.keep b/MindIE/MindIE-Torch/built-in/MT5/.keep
deleted file mode 100644
index e69de29bb2..0000000000
-- 
Gitee

From ba3650fe72702dd5aeac61d37d3363b75f7c201f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Wed, 11 Sep 2024 12:02:39 +0000
Subject: [PATCH 069/110] add MindIE/MindIE-Torch/built-in/MT5/readme.md.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 甄文奇
---
 MindIE/MindIE-Torch/built-in/MT5/readme.md | 95 ++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 MindIE/MindIE-Torch/built-in/MT5/readme.md

diff --git a/MindIE/MindIE-Torch/built-in/MT5/readme.md b/MindIE/MindIE-Torch/built-in/MT5/readme.md
new file mode 100644
index 0000000000..3ffa911ed6
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/MT5/readme.md
@@ -0,0 +1,95 @@
+# MT5 Model - Inference Guide
+
+
+- [Overview](#ZH-CN_TOPIC_0000001172161501)
+
+  - [Input and Output Data](#section540883920406)
+
+- [Inference Environment Setup](#ZH-CN_TOPIC_0000001126281702)
+
+- [Quick Start](#ZH-CN_TOPIC_0000001126281700)
+
+  - [Model Inference](#section741711594517)
+
+
+
+# Overview
+
+   T5 (Text-to-Text Transfer Transformer) is a model architecture, or rather a paradigm for solving NLP tasks: every task, such as classification, similarity computation, or text generation, is handled within a single text-to-text framework.
+
+
+## Input and Output Data
+
+- Input data
+
+  | Input    | Shape    | Data Type | Data Format |
+  | -------- | -------- | -------- | ------------ |
+  | input    | batchsize x input_seq_len | FLOAT16 |  NHWC |
+
+
+- Output data
+
+  | Output   | Shape    | Data Type | Data Format |
+  | -------- | -------- | -------- | ------------ |
+  | output  | batchsize x input_seq_len | INT32 | NTHWC |
+
+
+# Inference Environment Setup
+
+- The model requires the following plugins and drivers
+
+  **Table 1** Version compatibility
+-
+  | Dependency | Version | Notes |
+  | ------------------------------------------------------------ |--------| ------------------------------------------------------------ |
+  | Python | 3.10.2 | - |
+  | torch | 2.1.0 | version required for exporting the pt model |
+  | torch_npu | 2.1.0 | version required for model compilation and inference |
+
+
+# Quick Start
+
+
+1. Install transformers 4.42.0.
+   ```bash
+   pip3 install transformers==4.42.0
+   ```
+
+2. Install the MindIE package. It must be used together with torch_npu; configure the environment according to the mindietorch / torch_npu compatibility documentation.
+
+   ```bash
+   # install mindie
+   chmod +x ./Ascend-mindie_xxx.run
+   ./Ascend-mindie_xxx.run --install
+   source /usr/local/Ascend/mindie/set_env.sh
+   ```
+
+3. Modify the code. In the MT5 directory,
+
+   run the following command:
+   ```bash
+   python MT5_modeling_patch.py
+   ```
+4. Export the MindIE-Torch models
+   ```bash
+   python export_mt5.py --output_dir {output_path} --model_path {model_path} --max_batchsize {max_batchsize} --max_input_seq_len {max_input_seq_len} --device_id {device_id}
+   ```
+Parameter description:
+{output_path}: output directory
+{model_path}: directory containing the model
+{max_batchsize}: maximum batch size during inference
+{max_input_seq_len}: maximum input sequence length during inference
+{device_id}: which NPU device to use
+
+Running this command automatically generates the optimized encoder and decoder models.
+
+5. Accuracy test
+ ```bash
+python test_mt5.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path} --decoder_aie_path {decoder_aie_path} --device_id device_id
+```
+
+Parameter description:
+{model_path}: directory containing the model
+{encoder_aie_path}: path to the optimized encoder model, down to the .pt file
+{decoder_aie_path}: path to the optimized decoder model, down to the .pt file
+{device_id}: which NPU device to use
\ No newline at end of file
-- 
Gitee

From 4c4157b5b0868d0901e668e84466070e31f906b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Wed, 11 Sep 2024 12:03:02 +0000
Subject: [PATCH 070/110] update MindIE/MindIE-Torch/built-in/MT5/test_mt5.py.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 甄文奇
---
 MindIE/MindIE-Torch/built-in/MT5/test_mt5.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/MindIE/MindIE-Torch/built-in/MT5/test_mt5.py b/MindIE/MindIE-Torch/built-in/MT5/test_mt5.py
index c73905875e..92717df66f 100644
--- a/MindIE/MindIE-Torch/built-in/MT5/test_mt5.py
+++ b/MindIE/MindIE-Torch/built-in/MT5/test_mt5.py
@@ -37,8 +37,6 @@ def main():
     with torch.npu.stream(mindie_stream): # set stream
         mindie_encoder_output = encoder_mindie(encoder_input)[0]
     mindie_stream.synchronize() # synchronize
-    import pdb
-    pdb.set_trace()
     if (torch.cosine_similarity(encoder_output.cpu().flatten(), mindie_encoder_output.cpu().flatten(),dim=0)) < 0.99:
         print("encoder precision failed")
     else:
-- 
Gitee

From 6fc8a5686c1bf764cd4d25103d66f661ecab8676 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Wed, 11 Sep 2024 12:08:14 +0000
Subject: [PATCH 071/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 甄文奇
---
 MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch
index 95d0455bf5..d0c6a08f48 100644
--- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch
+++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch
@@ -476,7 +476,6 @@ index 224769fdf..6af548437 100644
 +        encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
 +
 +        encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000
-+        print("encoder_extended_attention_mask=",encoder_extended_attention_mask)
 +
 +        return encoder_extended_attention_mask
 +
-- 
Gitee

From dbcbf54c8418ddb2cac75d4e309b71ebaa801dbc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?=
Date: Wed, 11 Sep 2024 12:37:35 +0000
Subject: [PATCH 072/110] update MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch.
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/MT5/modeling_mt5.patch | 280 ++++++++++++------ 1 file changed, 182 insertions(+), 98 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch b/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch index 0df148b2ea..95d0455bf5 100644 --- a/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch +++ b/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch @@ -1,7 +1,7 @@ -diff --git a/modeling_mt5_origin.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py -index 1336b9196..5b94d69c7 100644 ---- a/modeling_mt5_origin.py -+++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py +diff --git a/modeling_t5.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py +index 224769fdf..6af548437 100644 +--- a/modeling_t5.py ++++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py @@ -19,22 +19,26 @@ import math import os import warnings @@ -29,13 +29,13 @@ index 1336b9196..5b94d69c7 100644 ) -from ...modeling_utils import PreTrainedModel +from ...modeling_utils import PreTrainedModel,ModuleUtilsMixin - from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer + from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( DUMMY_INPUTS, @@ -47,8 +51,44 @@ from ...utils import ( ) from ...utils.model_parallel_utils import assert_device_map, get_device_map - from .configuration_mt5 import MT5Config + from .configuration_t5 import T5Config +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList +from transformers.generation.configuration_utils import GenerationMode @@ -76,8 +76,8 @@ index 1336b9196..5b94d69c7 100644 + logger = logging.get_logger(__name__) - _CONFIG_FOR_DOC = "MT5Config" -@@ -323,7 +363,10 @@ class MT5Attention(nn.Module): + _CONFIG_FOR_DOC = "T5Config" +@@ -448,7 +488,10 @@ class T5Attention(nn.Module): mask=None, key_value_states=None, position_bias=None, @@ -89,7 +89,7 @@ index 1336b9196..5b94d69c7 100644 layer_head_mask=None, query_length=None, use_cache=False, -@@ -339,17 +382,15 @@ class MT5Attention(nn.Module): +@@ -464,12 +507,8 @@ class T5Attention(nn.Module): real_seq_length = seq_length @@ -104,14 +104,7 @@ index 1336b9196..5b94d69c7 100644 key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] - def shape(states): - """projection""" -+ # import pdb -+ # pdb.set_trace() - return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - - def unshape(states): -@@ -368,16 +409,17 @@ class MT5Attention(nn.Module): +@@ -493,16 +532,17 @@ class T5Attention(nn.Module): hidden_states = shape(proj_layer(key_value_states)) if past_key_value is not None: @@ -135,7 +128,7 @@ index 1336b9196..5b94d69c7 100644 else: # cross-attn hidden_states = past_key_value -@@ -388,10 +430,10 @@ class MT5Attention(nn.Module): +@@ -513,17 +553,16 @@ class T5Attention(nn.Module): # get key/value states key_states = project( @@ -146,9 +139,17 @@ index 1336b9196..5b94d69c7 100644 - hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None + hidden_states, self.v, key_value_states, past_value if past_value is not None else None ) - +- 
++ # torch.ops.mindie.flash_attention_plugin(query_states, key_states, value_states,) # compute scores -@@ -411,7 +453,7 @@ class MT5Attention(nn.Module): + scores = torch.matmul( + query_states, key_states.transpose(3, 2) + ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 +- + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = torch.zeros( +@@ -536,7 +575,7 @@ class T5Attention(nn.Module): # if key and values are already calculated # we want only the last query position bias @@ -157,25 +158,32 @@ index 1336b9196..5b94d69c7 100644 position_bias = position_bias[:, :, -hidden_states.size(1) :, :] if mask is not None: -@@ -439,14 +481,124 @@ class MT5Attention(nn.Module): +@@ -548,7 +587,6 @@ class T5Attention(nn.Module): + position_bias_masked = position_bias[:, mask.bool()] + else: + position_bias_masked = position_bias +- + scores += position_bias_masked + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( + scores +@@ -564,18 +602,131 @@ class T5Attention(nn.Module): attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) attn_output = self.o(attn_output) - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) -- + # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None + present_value_state = (value_states.half(),) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) + - if output_attentions: - outputs = outputs + (attn_weights,) - return outputs - - -+class MT5SelfAttention(MT5Attention): -+ def __init__(self, config: MT5Config, has_relative_attention_bias=False): ++ if output_attentions: ++ outputs = outputs + (attn_weights,) ++ return outputs ++ ++ ++class T5SelfAttention(T5Attention): ++ def __init__(self, config: T5Config, has_relative_attention_bias=False): + super().__init__(config, has_relative_attention_bias) + + def forward( @@ -273,19 +281,27 @@ index 1336b9196..5b94d69c7 100644 + + attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) -+ + + # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None + present_value_state = (value_states.half(), ) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) -+ if output_attentions: -+ outputs = outputs + (attn_weights,) -+ return outputs + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + ++ + - # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->MT5 - class MT5LayerSelfAttention(nn.Module): + class T5LayerSelfAttention(nn.Module): def __init__(self, config, has_relative_attention_bias=False): -@@ -461,7 +613,8 @@ class MT5LayerSelfAttention(nn.Module): + super().__init__() +- self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) ++ self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = 
T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + +@@ -585,7 +736,8 @@ class T5LayerSelfAttention(nn.Module): attention_mask=None, position_bias=None, layer_head_mask=None, @@ -295,7 +311,7 @@ index 1336b9196..5b94d69c7 100644 use_cache=False, output_attentions=False, ): -@@ -471,7 +624,8 @@ class MT5LayerSelfAttention(nn.Module): +@@ -595,7 +747,8 @@ class T5LayerSelfAttention(nn.Module): mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, @@ -305,7 +321,7 @@ index 1336b9196..5b94d69c7 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -495,7 +649,8 @@ class MT5LayerCrossAttention(nn.Module): +@@ -618,7 +771,8 @@ class T5LayerCrossAttention(nn.Module): attention_mask=None, position_bias=None, layer_head_mask=None, @@ -315,7 +331,7 @@ index 1336b9196..5b94d69c7 100644 use_cache=False, query_length=None, output_attentions=False, -@@ -507,7 +662,8 @@ class MT5LayerCrossAttention(nn.Module): +@@ -630,7 +784,8 @@ class T5LayerCrossAttention(nn.Module): key_value_states=key_value_states, position_bias=position_bias, layer_head_mask=layer_head_mask, @@ -325,7 +341,7 @@ index 1336b9196..5b94d69c7 100644 use_cache=use_cache, query_length=query_length, output_attentions=output_attentions, -@@ -539,39 +695,34 @@ class MT5Block(nn.Module): +@@ -661,39 +816,34 @@ class T5Block(nn.Module): encoder_decoder_position_bias=None, layer_head_mask=None, cross_attn_layer_head_mask=None, @@ -379,8 +395,11 @@ index 1336b9196..5b94d69c7 100644 # clamp inf values to enable fp16 training if hidden_states.dtype == torch.float16: -@@ -586,8 +737,8 @@ class MT5Block(nn.Module): +@@ -706,22 +856,23 @@ class T5Block(nn.Module): + + do_cross_attention = self.is_decoder and encoder_hidden_states is not None if do_cross_attention: ++ # the actual query length is unknown for cross attention # if using past key value states. 
Need to inject it here - if present_key_value_state is not None: @@ -389,8 +408,10 @@ index 1336b9196..5b94d69c7 100644 + query_length = present_key_state[0].shape[2] else: query_length = None - -@@ -597,7 +748,8 @@ class MT5Block(nn.Module): +- + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, @@ -398,9 +419,12 @@ index 1336b9196..5b94d69c7 100644 + past_key=cross_attn_past_key, + past_value=cross_attn_past_value, query_length=query_length, - use_cache=use_cache, +- use_cache=use_cache, ++ use_cache=use_cache, output_attentions=output_attentions, -@@ -614,11 +766,9 @@ class MT5Block(nn.Module): + ) + hidden_states = cross_attention_outputs[0] +@@ -736,11 +887,9 @@ class T5Block(nn.Module): hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) # Combine self attn and cross attn key value states @@ -414,7 +438,7 @@ index 1336b9196..5b94d69c7 100644 # Apply Feed Forward layer hidden_states = self.layer[-1](hidden_states) -@@ -635,7 +785,7 @@ class MT5Block(nn.Module): +@@ -757,7 +906,7 @@ class T5Block(nn.Module): outputs = (hidden_states,) if use_cache: @@ -423,10 +447,10 @@ index 1336b9196..5b94d69c7 100644 else: outputs = outputs + attention_outputs -@@ -884,11 +1034,15 @@ class MT5PreTrainedModel(PreTrainedModel): +@@ -897,11 +1046,15 @@ class T5PreTrainedModel(PreTrainedModel): - # Copied from transformers.models.t5.modeling_t5.T5Stack with T5->MT5 - class MT5Stack(MT5PreTrainedModel): + + class T5Stack(T5PreTrainedModel): - def __init__(self, config, embed_tokens=None): + def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskey=None, encodecrossvalue=None): super().__init__(config) @@ -439,8 +463,8 @@ index 1336b9196..5b94d69c7 100644 + self.model_dim = config.d_model self.block = nn.ModuleList( - [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -953,20 +1107,63 @@ class MT5Stack(MT5PreTrainedModel): + [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] +@@ -966,20 +1119,64 @@ class T5Stack(T5PreTrainedModel): def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings @@ -452,6 +476,7 @@ index 1336b9196..5b94d69c7 100644 + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 ++ print("encoder_extended_attention_mask=",encoder_extended_attention_mask) + + return encoder_extended_attention_mask + @@ -506,7 +531,7 @@ index 1336b9196..5b94d69c7 100644 ): # Model parallel if self.model_parallel: -@@ -985,8 +1182,10 @@ class MT5Stack(MT5PreTrainedModel): +@@ -998,8 +1195,10 @@ class T5Stack(T5PreTrainedModel): f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: @@ -517,7 +542,7 @@ index 1336b9196..5b94d69c7 100644 elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: -@@ -999,18 +1198,19 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1012,18 +1211,19 @@ class T5Stack(T5PreTrainedModel): inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = input_shape @@ -542,7 +567,7 @@ index 1336b9196..5b94d69c7 100644 if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length, 
device=inputs_embeds.device) -@@ -1041,7 +1241,8 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1054,7 +1254,8 @@ class T5Stack(T5PreTrainedModel): # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) @@ -552,7 +577,7 @@ index 1336b9196..5b94d69c7 100644 all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1049,8 +1250,8 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1062,8 +1263,8 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) @@ -563,7 +588,7 @@ index 1336b9196..5b94d69c7 100644 layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] # Model parallel -@@ -1099,7 +1300,10 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1112,7 +1313,10 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, @@ -575,7 +600,7 @@ index 1336b9196..5b94d69c7 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -1107,19 +1311,20 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1120,19 +1324,20 @@ class T5Stack(T5PreTrainedModel): # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: @@ -601,7 +626,7 @@ index 1336b9196..5b94d69c7 100644 if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) -@@ -1133,7 +1338,7 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1146,7 +1351,7 @@ class T5Stack(T5PreTrainedModel): hidden_states = hidden_states.to("cuda:" + str(k + 1)) hidden_states = self.final_layer_norm(hidden_states) @@ -610,7 +635,7 @@ index 1336b9196..5b94d69c7 100644 # Add last layer if output_hidden_states: -@@ -1151,13 +1356,216 @@ class MT5Stack(MT5PreTrainedModel): +@@ -1164,13 +1369,216 @@ class T5Stack(T5PreTrainedModel): ] if v is not None ) @@ -636,7 +661,7 @@ index 1336b9196..5b94d69c7 100644 + return tuple((lm_logits, present_key_states, present_value_states)) + + -+class MT5Stack_Encoder(MT5PreTrainedModel): ++class T5Stack_Encoder(T5PreTrainedModel): + def __init__(self, config, embed_tokens=None, encodecrosskey=None, encodecrossvalue=None): + super().__init__(config) + self.embed_tokens = embed_tokens @@ -646,9 +671,9 @@ index 1336b9196..5b94d69c7 100644 + self.model_dim = config.d_model + + self.block = nn.ModuleList( -+ [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] ++ [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] ) -+ self.final_layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) ++ self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + # Initialize weights and apply final processing @@ -832,8 +857,8 @@ index 1336b9196..5b94d69c7 100644 + return tuple((hidden_states, cross_keys, cross_values)) - MT5_START_DOCSTRING = r""" -@@ -1549,6 +1957,39 @@ class MT5Model(MT5PreTrainedModel): + T5_START_DOCSTRING = r""" +@@ -1541,6 +1949,38 @@ class T5Model(T5PreTrainedModel): ) @@ -850,8 +875,6 @@ index 1336b9196..5b94d69c7 100644 + past_cross_keys = () 
+ for i in range(len(self.cross_key)): + past_cross_keys += (self.cross_key[i](hidden_states),) -+ # import pdb -+ # pdb.set_trace() + return past_cross_keys + + @@ -870,15 +893,16 @@ index 1336b9196..5b94d69c7 100644 + past_cross_values += (self.cross_value[i](hidden_states),) + return past_cross_values + - @add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING) - class MT5ForConditionalGeneration(MT5PreTrainedModel): - r""" -@@ -1573,33 +2014,52 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): ++ + @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) + class T5ForConditionalGeneration(T5PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [ +@@ -1548,28 +1988,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5 -- def __init__(self, config: MT5Config): -+ def __init__(self, config: MT5Config, encoder_path=None, decoder_path=None, device_id=0): +- def __init__(self, config: T5Config): ++ def __init__(self, config: T5Config, encoder_path=None, decoder_path=None, device_id=0): super().__init__(config) - self.model_dim = config.d_model - @@ -888,13 +912,13 @@ index 1336b9196..5b94d69c7 100644 - encoder_config.is_decoder = False - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False -- self.encoder = MT5Stack(encoder_config, self.shared) +- self.encoder = T5Stack(encoder_config, self.shared) - - decoder_config = copy.deepcopy(config) - decoder_config.is_decoder = True - decoder_config.is_encoder_decoder = False - decoder_config.num_layers = config.num_decoder_layers -- self.decoder = MT5Stack(decoder_config, self.shared) +- self.decoder = T5Stack(decoder_config, self.shared) - - self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + self.encoder_path = encoder_path @@ -909,17 +933,20 @@ index 1336b9196..5b94d69c7 100644 + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers ++ + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) -+ self.decoder = MT5Stack(decoder_config, self.shared, self.lm_head) ++ self.decoder = T5Stack(decoder_config, self.shared, self.lm_head) ++ + cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) + cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) + encodecrosskey = EncoderToCrossKey(cross_key, config.num_heads, config.d_kv) + encodecrossvalue = EncoderToCrossValue(cross_value, config.num_heads, config.d_kv) ++ + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False -+ self.encoder = MT5Stack_Encoder(encoder_config, self.shared, encodecrosskey=encodecrosskey, encodecrossvalue=encodecrossvalue) ++ self.encoder = T5Stack_Encoder(encoder_config, self.shared, encodecrosskey=encodecrosskey, encodecrossvalue=encodecrossvalue) + self.encoder_mindie = None + self.decoder_mindie = None + if self.encoder_path: @@ -927,28 +954,24 @@ index 1336b9196..5b94d69c7 100644 + self.is_mindie = True + if self.decoder_path: + self.decoder_mindie = torch.jit.load(self.decoder_path) ++ + self.stream = 
torch.npu.Stream(f"npu:{device_id}") + self.device_id = device_id ++ ++ ++ def get_device(self): ++ return f"npu:{self.device_id}" # Initialize weights and apply final processing - self.post_init() -+ if not self.is_mindie: -+ self.post_init() ++ # self.post_init() # Model parallel self.model_parallel = False - self.device_map = None +@@ -1637,25 +2100,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): -+ def get_device(self): -+ return f"npu:{self.device_id}" -+ - @add_start_docstrings(PARALLELIZE_DOCSTRING) - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.parallelize - def parallelize(self, device_map=None): -@@ -1666,25 +2126,7 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): - @add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward with T5->MT5, t5->mt5 - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, @@ -972,7 +995,7 @@ index 1336b9196..5b94d69c7 100644 r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., -@@ -1716,114 +2158,37 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): +@@ -1687,113 +2132,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) >>> # studies have shown that owning a dog is good for you. ```""" @@ -1100,7 +1123,6 @@ index 1336b9196..5b94d69c7 100644 - ) + return (decoder_outputs[0],decoder_outputs[1],decoder_outputs[2]) -- # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation def prepare_inputs_for_generation( self, input_ids, @@ -1112,7 +1134,7 @@ index 1336b9196..5b94d69c7 100644 attention_mask=None, head_mask=None, decoder_head_mask=None, -@@ -1834,8 +2199,8 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): +@@ -1804,8 +2173,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): **kwargs, ): # cut decoder_input_ids if past_key_values is used @@ -1123,7 +1145,12 @@ index 1336b9196..5b94d69c7 100644 # Some generation methods already pass only the last input ID if input_ids.shape[1] > past_length: -@@ -1848,7 +2213,10 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): +@@ -1813,12 +2182,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 +- + input_ids = input_ids[:, remove_prefix_length:] return { "decoder_input_ids": input_ids, @@ -1135,7 +1162,15 @@ index 1336b9196..5b94d69c7 100644 "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "head_mask": head_mask, -@@ -1893,6 +2261,419 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): +@@ -1826,6 +2197,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + "decoder_attention_mask": decoder_attention_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, ++ + } + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): +@@ -1861,6 +2233,460 @@ class T5ForConditionalGeneration(T5PreTrainedModel): reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) return reordered_decoder_past @@ -1426,7 +1461,7 @@ index 1336b9196..5b94d69c7 100644 + model_kwargs 
= self._get_initial_cache_position(input_ids, model_kwargs) + + # keep track of which sequences are already finished -+ if self.is_mindie or self.config.architectures[0]=="MT5ForConditionalGeneration": ++ if self.is_mindie or self.config.architectures[0]=="T5ForConditionalGeneration": + num_layers = self.config.num_layers + num_heads = self.config.num_heads + d_kv = self.config.d_kv @@ -1506,12 +1541,26 @@ index 1336b9196..5b94d69c7 100644 + return input_ids + + def invert_attention_mask(self, encoder_attention_mask): ++ """ ++ Invert an attention mask (e.g., switches 0. and 1.). ++ ++ Args: ++ encoder_attention_mask (`torch.Tensor`): An attention mask. ++ ++ Returns: ++ `torch.Tensor`: The inverted attention mask. ++ """ + if encoder_attention_mask.dim() == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.dim() == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] ++ # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition ++ # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow ++ # /transformer/transformer_layers.py#L270 ++ # encoder_extended_attention_mask = (encoder_extended_attention_mask == ++ # encoder_extended_attention_mask.transpose(-1, -2)) + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility -+ ++ #encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 + + return encoder_extended_attention_mask @@ -1525,19 +1574,37 @@ index 1336b9196..5b94d69c7 100644 + return self.get_device() + + def get_extended_attention_mask( -+ self, attention_mask, input_shape, deviceNone, dtype=None ++ self, attention_mask, input_shape, devic=None, dtype=None + ): ++ """ ++ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. ++ ++ Arguments: ++ attention_mask (`torch.Tensor`): ++ Mask with ones indicating tokens to attend to, zeros for tokens to ignore. ++ input_shape (`Tuple[int]`): ++ The shape of the input to the model. ++ ++ Returns: ++ `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. ++ """ + if dtype is None: + dtype = self.dtype + + if not (attention_mask.dim() == 2 and self.config.is_decoder): ++ # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder` + if device is not None: + warnings.warn( + "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning + ) ++ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] ++ # ourselves in which case we just need to make it broadcastable to all heads. 
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: ++ # Provided a padding mask of dimensions [batch_size, seq_length] ++ # - if the model is a decoder, apply a causal mask in addition to the padding mask ++ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder: + extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( + input_shape, attention_mask, device @@ -1548,10 +1615,27 @@ index 1336b9196..5b94d69c7 100644 + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) ++ ++ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for ++ # masked positions, this operation will create a tensor which is 0.0 for ++ # positions we want to attend and the dtype's smallest value for masked positions. ++ # Since we are adding it to the raw scores before the softmax, this is ++ # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility ++ #extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min + extended_attention_mask = (1.0 - extended_attention_mask) * -1000 + return extended_attention_mask ++ ++ + @add_start_docstrings( - "The bare MT5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", + "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", +@@ -1967,7 +2793,6 @@ class T5EncoderModel(T5PreTrainedModel): + >>> last_hidden_states = outputs.last_hidden_state + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict +- + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, -- Gitee From a0a6f76dcbe2ffd3ca97046f9315d1a87551dba6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 11 Sep 2024 12:43:51 +0000 Subject: [PATCH 073/110] update MindIE/MindIE-Torch/built-in/T5/main.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/main.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/main.py b/MindIE/MindIE-Torch/built-in/T5/main.py index 8ac34ceec5..ccad949c44 100644 --- a/MindIE/MindIE-Torch/built-in/T5/main.py +++ b/MindIE/MindIE-Torch/built-in/T5/main.py @@ -19,11 +19,9 @@ def main(): args = parse_args() torch.npu.set_device(args.device_id) tokenizer = AutoTokenizer.from_pretrained(args.hf_model_path) - text = [ - "translate English to German: The house is wonderful.", - "summarize: I am a high-performance inference optimizer and runtime.", - "During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world", - ] + text = ["贵州毛台现在多少钱一瓶啊,想买两瓶尝尝味道。", + "能不能帮我买点淇淋,好久没吃了", + "脑子有点胡涂了,这道题冥冥学过还没有做出来"] t5_config = T5Config.from_pretrained(args.hf_model_path) model = T5ForConditionalGeneration(config=t5_config, encoder_path=args.encoder_aie_path, -- Gitee From 155a808d014aea535aece353d3fbfa1a88afa404 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 12 Sep 2024 03:38:38 +0000 Subject: [PATCH 074/110] update MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/MT5/modeling_mt5.patch | 280 ++++++------------ 1 file changed, 98 insertions(+), 182 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch b/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch index 95d0455bf5..a5afef98e2 100644 --- a/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch +++ b/MindIE/MindIE-Torch/built-in/MT5/modeling_mt5.patch @@ -1,7 +1,7 @@ -diff --git a/modeling_t5.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py -index 224769fdf..6af548437 100644 ---- a/modeling_t5.py -+++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py +diff --git a/modeling_mt5.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py +index 1336b9196..5b94d69c7 100644 +--- a/modeling_mt5.py ++++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.py @@ -19,22 +19,26 @@ import math import os import warnings @@ -29,13 +29,13 @@ index 224769fdf..6af548437 100644 ) -from ...modeling_utils import PreTrainedModel +from ...modeling_utils import PreTrainedModel,ModuleUtilsMixin - from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer + from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( DUMMY_INPUTS, @@ -47,8 +51,44 @@ from ...utils import ( ) from ...utils.model_parallel_utils import assert_device_map, get_device_map - from .configuration_t5 import T5Config + from .configuration_mt5 import MT5Config +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList +from transformers.generation.configuration_utils import GenerationMode @@ -76,8 +76,8 @@ index 224769fdf..6af548437 100644 + logger = logging.get_logger(__name__) - _CONFIG_FOR_DOC = "T5Config" -@@ -448,7 +488,10 @@ class T5Attention(nn.Module): + _CONFIG_FOR_DOC = "MT5Config" +@@ -323,7 +363,10 @@ class MT5Attention(nn.Module): mask=None, key_value_states=None, position_bias=None, @@ -89,7 +89,7 @@ index 224769fdf..6af548437 100644 layer_head_mask=None, query_length=None, use_cache=False, -@@ -464,12 +507,8 @@ class T5Attention(nn.Module): +@@ -339,17 +382,15 @@ class MT5Attention(nn.Module): real_seq_length = seq_length @@ -104,7 +104,14 @@ index 224769fdf..6af548437 100644 key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] -@@ -493,16 +532,17 @@ class T5Attention(nn.Module): + def shape(states): + """projection""" ++ # import pdb ++ # pdb.set_trace() + return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + def unshape(states): +@@ -368,16 +409,17 @@ class MT5Attention(nn.Module): hidden_states = shape(proj_layer(key_value_states)) if past_key_value is not None: @@ -128,7 +135,7 @@ index 224769fdf..6af548437 100644 else: # cross-attn hidden_states = past_key_value -@@ -513,17 +553,16 @@ class T5Attention(nn.Module): +@@ -388,10 +430,10 @@ class MT5Attention(nn.Module): # get key/value states key_states = project( @@ -139,17 +146,9 @@ index 224769fdf..6af548437 100644 - hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None + hidden_states, self.v, key_value_states, past_value if past_value is not None else None ) -- -+ # 
torch.ops.mindie.flash_attention_plugin(query_states, key_states, value_states,) + # compute scores - scores = torch.matmul( - query_states, key_states.transpose(3, 2) - ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 -- - if position_bias is None: - if not self.has_relative_attention_bias: - position_bias = torch.zeros( -@@ -536,7 +575,7 @@ class T5Attention(nn.Module): +@@ -411,7 +453,7 @@ class MT5Attention(nn.Module): # if key and values are already calculated # we want only the last query position bias @@ -158,32 +157,25 @@ index 224769fdf..6af548437 100644 position_bias = position_bias[:, :, -hidden_states.size(1) :, :] if mask is not None: -@@ -548,7 +587,6 @@ class T5Attention(nn.Module): - position_bias_masked = position_bias[:, mask.bool()] - else: - position_bias_masked = position_bias -- - scores += position_bias_masked - attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( - scores -@@ -564,18 +602,131 @@ class T5Attention(nn.Module): +@@ -439,14 +481,124 @@ class MT5Attention(nn.Module): attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) attn_output = self.o(attn_output) - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) +- + # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None + present_value_state = (value_states.half(),) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) + -+ if output_attentions: -+ outputs = outputs + (attn_weights,) -+ return outputs -+ -+ -+class T5SelfAttention(T5Attention): -+ def __init__(self, config: T5Config, has_relative_attention_bias=False): + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + ++class MT5SelfAttention(MT5Attention): ++ def __init__(self, config: MT5Config, has_relative_attention_bias=False): + super().__init__(config, has_relative_attention_bias) + + def forward( @@ -281,27 +273,19 @@ index 224769fdf..6af548437 100644 + + attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) - ++ + # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None + present_value_state = (value_states.half(), ) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) - if output_attentions: - outputs = outputs + (attn_weights,) - return outputs - - -+ ++ if output_attentions: ++ outputs = outputs + (attn_weights,) ++ return outputs + - class T5LayerSelfAttention(nn.Module): + # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->MT5 + class MT5LayerSelfAttention(nn.Module): def __init__(self, config, has_relative_attention_bias=False): - super().__init__() -- self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) -+ self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = 
nn.Dropout(config.dropout_rate) - -@@ -585,7 +736,8 @@ class T5LayerSelfAttention(nn.Module): +@@ -461,7 +613,8 @@ class MT5LayerSelfAttention(nn.Module): attention_mask=None, position_bias=None, layer_head_mask=None, @@ -311,7 +295,7 @@ index 224769fdf..6af548437 100644 use_cache=False, output_attentions=False, ): -@@ -595,7 +747,8 @@ class T5LayerSelfAttention(nn.Module): +@@ -471,7 +624,8 @@ class MT5LayerSelfAttention(nn.Module): mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, @@ -321,7 +305,7 @@ index 224769fdf..6af548437 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -618,7 +771,8 @@ class T5LayerCrossAttention(nn.Module): +@@ -495,7 +649,8 @@ class MT5LayerCrossAttention(nn.Module): attention_mask=None, position_bias=None, layer_head_mask=None, @@ -331,7 +315,7 @@ index 224769fdf..6af548437 100644 use_cache=False, query_length=None, output_attentions=False, -@@ -630,7 +784,8 @@ class T5LayerCrossAttention(nn.Module): +@@ -507,7 +662,8 @@ class MT5LayerCrossAttention(nn.Module): key_value_states=key_value_states, position_bias=position_bias, layer_head_mask=layer_head_mask, @@ -341,7 +325,7 @@ index 224769fdf..6af548437 100644 use_cache=use_cache, query_length=query_length, output_attentions=output_attentions, -@@ -661,39 +816,34 @@ class T5Block(nn.Module): +@@ -539,39 +695,34 @@ class MT5Block(nn.Module): encoder_decoder_position_bias=None, layer_head_mask=None, cross_attn_layer_head_mask=None, @@ -395,11 +379,8 @@ index 224769fdf..6af548437 100644 # clamp inf values to enable fp16 training if hidden_states.dtype == torch.float16: -@@ -706,22 +856,23 @@ class T5Block(nn.Module): - - do_cross_attention = self.is_decoder and encoder_hidden_states is not None +@@ -586,8 +737,8 @@ class MT5Block(nn.Module): if do_cross_attention: -+ # the actual query length is unknown for cross attention # if using past key value states. 
Need to inject it here - if present_key_value_state is not None: @@ -408,10 +389,8 @@ index 224769fdf..6af548437 100644 + query_length = present_key_state[0].shape[2] else: query_length = None -- - cross_attention_outputs = self.layer[1]( - hidden_states, - key_value_states=encoder_hidden_states, + +@@ -597,7 +748,8 @@ class MT5Block(nn.Module): attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, @@ -419,12 +398,9 @@ index 224769fdf..6af548437 100644 + past_key=cross_attn_past_key, + past_value=cross_attn_past_value, query_length=query_length, -- use_cache=use_cache, -+ use_cache=use_cache, + use_cache=use_cache, output_attentions=output_attentions, - ) - hidden_states = cross_attention_outputs[0] -@@ -736,11 +887,9 @@ class T5Block(nn.Module): +@@ -614,11 +766,9 @@ class MT5Block(nn.Module): hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) # Combine self attn and cross attn key value states @@ -438,7 +414,7 @@ index 224769fdf..6af548437 100644 # Apply Feed Forward layer hidden_states = self.layer[-1](hidden_states) -@@ -757,7 +906,7 @@ class T5Block(nn.Module): +@@ -635,7 +785,7 @@ class MT5Block(nn.Module): outputs = (hidden_states,) if use_cache: @@ -447,10 +423,10 @@ index 224769fdf..6af548437 100644 else: outputs = outputs + attention_outputs -@@ -897,11 +1046,15 @@ class T5PreTrainedModel(PreTrainedModel): +@@ -884,11 +1034,15 @@ class MT5PreTrainedModel(PreTrainedModel): - - class T5Stack(T5PreTrainedModel): + # Copied from transformers.models.t5.modeling_t5.T5Stack with T5->MT5 + class MT5Stack(MT5PreTrainedModel): - def __init__(self, config, embed_tokens=None): + def __init__(self, config, embed_tokens=None,lm_head=None, encodecrosskey=None, encodecrossvalue=None): super().__init__(config) @@ -463,8 +439,8 @@ index 224769fdf..6af548437 100644 + self.model_dim = config.d_model self.block = nn.ModuleList( - [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -966,20 +1119,64 @@ class T5Stack(T5PreTrainedModel): + [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] +@@ -953,20 +1107,63 @@ class MT5Stack(MT5PreTrainedModel): def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings @@ -476,7 +452,6 @@ index 224769fdf..6af548437 100644 + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 -+ print("encoder_extended_attention_mask=",encoder_extended_attention_mask) + + return encoder_extended_attention_mask + @@ -531,7 +506,7 @@ index 224769fdf..6af548437 100644 ): # Model parallel if self.model_parallel: -@@ -998,8 +1195,10 @@ class T5Stack(T5PreTrainedModel): +@@ -985,8 +1182,10 @@ class MT5Stack(MT5PreTrainedModel): f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: @@ -542,7 +517,7 @@ index 224769fdf..6af548437 100644 elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: -@@ -1012,18 +1211,19 @@ class T5Stack(T5PreTrainedModel): +@@ -999,18 +1198,19 @@ class MT5Stack(MT5PreTrainedModel): inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = input_shape @@ -567,7 +542,7 @@ index 224769fdf..6af548437 100644 if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length, 
device=inputs_embeds.device) -@@ -1054,7 +1254,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1041,7 +1241,8 @@ class MT5Stack(MT5PreTrainedModel): # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) @@ -577,7 +552,7 @@ index 224769fdf..6af548437 100644 all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1062,8 +1263,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1049,8 +1250,8 @@ class MT5Stack(MT5PreTrainedModel): encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) @@ -588,7 +563,7 @@ index 224769fdf..6af548437 100644 layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] # Model parallel -@@ -1112,7 +1313,10 @@ class T5Stack(T5PreTrainedModel): +@@ -1099,7 +1300,10 @@ class MT5Stack(MT5PreTrainedModel): encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, @@ -600,7 +575,7 @@ index 224769fdf..6af548437 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -1120,19 +1324,20 @@ class T5Stack(T5PreTrainedModel): +@@ -1107,19 +1311,20 @@ class MT5Stack(MT5PreTrainedModel): # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: @@ -626,7 +601,7 @@ index 224769fdf..6af548437 100644 if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) -@@ -1146,7 +1351,7 @@ class T5Stack(T5PreTrainedModel): +@@ -1133,7 +1338,7 @@ class MT5Stack(MT5PreTrainedModel): hidden_states = hidden_states.to("cuda:" + str(k + 1)) hidden_states = self.final_layer_norm(hidden_states) @@ -635,7 +610,7 @@ index 224769fdf..6af548437 100644 # Add last layer if output_hidden_states: -@@ -1164,13 +1369,216 @@ class T5Stack(T5PreTrainedModel): +@@ -1151,13 +1356,216 @@ class MT5Stack(MT5PreTrainedModel): ] if v is not None ) @@ -661,7 +636,7 @@ index 224769fdf..6af548437 100644 + return tuple((lm_logits, present_key_states, present_value_states)) + + -+class T5Stack_Encoder(T5PreTrainedModel): ++class MT5Stack_Encoder(MT5PreTrainedModel): + def __init__(self, config, embed_tokens=None, encodecrosskey=None, encodecrossvalue=None): + super().__init__(config) + self.embed_tokens = embed_tokens @@ -671,9 +646,9 @@ index 224769fdf..6af548437 100644 + self.model_dim = config.d_model + + self.block = nn.ModuleList( -+ [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] ++ [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] ) -+ self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) ++ self.final_layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + # Initialize weights and apply final processing @@ -857,8 +832,8 @@ index 224769fdf..6af548437 100644 + return tuple((hidden_states, cross_keys, cross_values)) - T5_START_DOCSTRING = r""" -@@ -1541,6 +1949,38 @@ class T5Model(T5PreTrainedModel): + MT5_START_DOCSTRING = r""" +@@ -1549,6 +1957,39 @@ class MT5Model(MT5PreTrainedModel): ) @@ -875,6 +850,8 @@ index 224769fdf..6af548437 100644 + past_cross_keys = () 
+ for i in range(len(self.cross_key)): + past_cross_keys += (self.cross_key[i](hidden_states),) ++ # import pdb ++ # pdb.set_trace() + return past_cross_keys + + @@ -893,16 +870,15 @@ index 224769fdf..6af548437 100644 + past_cross_values += (self.cross_value[i](hidden_states),) + return past_cross_values + -+ - @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) - class T5ForConditionalGeneration(T5PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [ -@@ -1548,28 +1988,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): - ] + @add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING) + class MT5ForConditionalGeneration(MT5PreTrainedModel): + r""" +@@ -1573,33 +2014,52 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] -- def __init__(self, config: T5Config): -+ def __init__(self, config: T5Config, encoder_path=None, decoder_path=None, device_id=0): + # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5 +- def __init__(self, config: MT5Config): ++ def __init__(self, config: MT5Config, encoder_path=None, decoder_path=None, device_id=0): super().__init__(config) - self.model_dim = config.d_model - @@ -912,13 +888,13 @@ index 224769fdf..6af548437 100644 - encoder_config.is_decoder = False - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False -- self.encoder = T5Stack(encoder_config, self.shared) +- self.encoder = MT5Stack(encoder_config, self.shared) - - decoder_config = copy.deepcopy(config) - decoder_config.is_decoder = True - decoder_config.is_encoder_decoder = False - decoder_config.num_layers = config.num_decoder_layers -- self.decoder = T5Stack(decoder_config, self.shared) +- self.decoder = MT5Stack(decoder_config, self.shared) - - self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + self.encoder_path = encoder_path @@ -933,20 +909,17 @@ index 224769fdf..6af548437 100644 + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers -+ + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) -+ self.decoder = T5Stack(decoder_config, self.shared, self.lm_head) -+ ++ self.decoder = MT5Stack(decoder_config, self.shared, self.lm_head) + cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) + cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) + encodecrosskey = EncoderToCrossKey(cross_key, config.num_heads, config.d_kv) + encodecrossvalue = EncoderToCrossValue(cross_value, config.num_heads, config.d_kv) -+ + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False -+ self.encoder = T5Stack_Encoder(encoder_config, self.shared, encodecrosskey=encodecrosskey, encodecrossvalue=encodecrossvalue) ++ self.encoder = MT5Stack_Encoder(encoder_config, self.shared, encodecrosskey=encodecrosskey, encodecrossvalue=encodecrossvalue) + self.encoder_mindie = None + self.decoder_mindie = None + if self.encoder_path: @@ -954,24 +927,28 @@ index 224769fdf..6af548437 100644 + self.is_mindie = True + if self.decoder_path: + self.decoder_mindie = torch.jit.load(self.decoder_path) -+ + self.stream = 
torch.npu.Stream(f"npu:{device_id}") + self.device_id = device_id -+ -+ -+ def get_device(self): -+ return f"npu:{self.device_id}" # Initialize weights and apply final processing - self.post_init() -+ # self.post_init() ++ if not self.is_mindie: ++ self.post_init() # Model parallel self.model_parallel = False -@@ -1637,25 +2100,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + self.device_map = None - @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) ++ def get_device(self): ++ return f"npu:{self.device_id}" ++ + @add_start_docstrings(PARALLELIZE_DOCSTRING) + # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.parallelize + def parallelize(self, device_map=None): +@@ -1666,25 +2126,7 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): + @add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward with T5->MT5, t5->mt5 - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, @@ -995,7 +972,7 @@ index 224769fdf..6af548437 100644 r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., -@@ -1687,113 +2132,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1716,114 +2158,37 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) >>> # studies have shown that owning a dog is good for you. ```""" @@ -1123,6 +1100,7 @@ index 224769fdf..6af548437 100644 - ) + return (decoder_outputs[0],decoder_outputs[1],decoder_outputs[2]) +- # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation def prepare_inputs_for_generation( self, input_ids, @@ -1134,7 +1112,7 @@ index 224769fdf..6af548437 100644 attention_mask=None, head_mask=None, decoder_head_mask=None, -@@ -1804,8 +2173,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1834,8 +2199,8 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): **kwargs, ): # cut decoder_input_ids if past_key_values is used @@ -1145,12 +1123,7 @@ index 224769fdf..6af548437 100644 # Some generation methods already pass only the last input ID if input_ids.shape[1] > past_length: -@@ -1813,12 +2182,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 -- - input_ids = input_ids[:, remove_prefix_length:] +@@ -1848,7 +2213,10 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): return { "decoder_input_ids": input_ids, @@ -1162,15 +1135,7 @@ index 224769fdf..6af548437 100644 "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "head_mask": head_mask, -@@ -1826,6 +2197,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): - "decoder_attention_mask": decoder_attention_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, -+ - } - - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): -@@ -1861,6 +2233,460 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1893,6 +2261,419 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) return reordered_decoder_past @@ -1461,7 +1426,7 @@ index 224769fdf..6af548437 100644 + model_kwargs 
= self._get_initial_cache_position(input_ids, model_kwargs) + + # keep track of which sequences are already finished -+ if self.is_mindie or self.config.architectures[0]=="T5ForConditionalGeneration": ++ if self.is_mindie or self.config.architectures[0]=="MT5ForConditionalGeneration": + num_layers = self.config.num_layers + num_heads = self.config.num_heads + d_kv = self.config.d_kv @@ -1541,26 +1506,12 @@ index 224769fdf..6af548437 100644 + return input_ids + + def invert_attention_mask(self, encoder_attention_mask): -+ """ -+ Invert an attention mask (e.g., switches 0. and 1.). -+ -+ Args: -+ encoder_attention_mask (`torch.Tensor`): An attention mask. -+ -+ Returns: -+ `torch.Tensor`: The inverted attention mask. -+ """ + if encoder_attention_mask.dim() == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.dim() == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] -+ # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition -+ # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow -+ # /transformer/transformer_layers.py#L270 -+ # encoder_extended_attention_mask = (encoder_extended_attention_mask == -+ # encoder_extended_attention_mask.transpose(-1, -2)) + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility -+ #encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min ++ + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1000 + + return encoder_extended_attention_mask @@ -1574,37 +1525,19 @@ index 224769fdf..6af548437 100644 + return self.get_device() + + def get_extended_attention_mask( -+ self, attention_mask, input_shape, devic=None, dtype=None ++ self, attention_mask, input_shape, deviceNone, dtype=None + ): -+ """ -+ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. -+ -+ Arguments: -+ attention_mask (`torch.Tensor`): -+ Mask with ones indicating tokens to attend to, zeros for tokens to ignore. -+ input_shape (`Tuple[int]`): -+ The shape of the input to the model. -+ -+ Returns: -+ `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. -+ """ + if dtype is None: + dtype = self.dtype + + if not (attention_mask.dim() == 2 and self.config.is_decoder): -+ # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder` + if device is not None: + warnings.warn( + "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning + ) -+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] -+ # ourselves in which case we just need to make it broadcastable to all heads. 
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: -+ # Provided a padding mask of dimensions [batch_size, seq_length] -+ # - if the model is a decoder, apply a causal mask in addition to the padding mask -+ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder: + extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( + input_shape, attention_mask, device @@ -1615,27 +1548,10 @@ index 224769fdf..6af548437 100644 + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) -+ -+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for -+ # masked positions, this operation will create a tensor which is 0.0 for -+ # positions we want to attend and the dtype's smallest value for masked positions. -+ # Since we are adding it to the raw scores before the softmax, this is -+ # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility -+ #extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min + extended_attention_mask = (1.0 - extended_attention_mask) * -1000 + return extended_attention_mask -+ -+ + @add_start_docstrings( - "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", -@@ -1967,7 +2793,6 @@ class T5EncoderModel(T5PreTrainedModel): - >>> last_hidden_states = outputs.last_hidden_state - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict -- - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, + "The bare MT5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", -- Gitee From 8076e460d04572e199c4b835d812d617e6162fde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 12 Sep 2024 04:32:49 +0000 Subject: [PATCH 075/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_t5.patch | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch index d0c6a08f48..e4bd899bde 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch @@ -1,5 +1,5 @@ diff --git a/modeling_t5.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py -index 224769fdf..6af548437 100644 +index 224769fdf..1c2d8d185 100644 --- a/modeling_t5.py +++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py @@ -19,22 +19,26 @@ import math @@ -464,7 +464,7 @@ index 224769fdf..6af548437 100644 self.block = nn.ModuleList( [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -966,20 +1119,64 @@ class T5Stack(T5PreTrainedModel): +@@ -966,20 +1119,63 @@ class T5Stack(T5PreTrainedModel): def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings @@ -530,7 +530,7 @@ index 224769fdf..6af548437 100644 ): # Model parallel if self.model_parallel: -@@ -998,8 +1195,10 @@ class T5Stack(T5PreTrainedModel): +@@ -998,8 +1194,10 @@ class T5Stack(T5PreTrainedModel): f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: @@ -541,7 +541,7 @@ index 224769fdf..6af548437 100644 elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: -@@ -1012,18 +1211,19 @@ class T5Stack(T5PreTrainedModel): +@@ -1012,18 +1210,19 @@ class T5Stack(T5PreTrainedModel): inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = input_shape @@ -566,7 +566,7 @@ index 224769fdf..6af548437 100644 if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) -@@ -1054,7 +1254,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1054,7 +1253,8 @@ class T5Stack(T5PreTrainedModel): # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) @@ -576,7 +576,7 @@ index 224769fdf..6af548437 100644 all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1062,8 +1263,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1062,8 +1262,8 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) @@ -587,7 +587,7 @@ index 224769fdf..6af548437 100644 layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] # Model parallel -@@ -1112,7 +1313,10 @@ class T5Stack(T5PreTrainedModel): +@@ -1112,7 +1312,10 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, @@ -599,7 +599,7 @@ index 224769fdf..6af548437 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -1120,19 +1324,20 @@ class T5Stack(T5PreTrainedModel): +@@ -1120,19 +1323,20 @@ class T5Stack(T5PreTrainedModel): # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention position bias), 
(self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: @@ -625,7 +625,7 @@ index 224769fdf..6af548437 100644 if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) -@@ -1146,7 +1351,7 @@ class T5Stack(T5PreTrainedModel): +@@ -1146,7 +1350,7 @@ class T5Stack(T5PreTrainedModel): hidden_states = hidden_states.to("cuda:" + str(k + 1)) hidden_states = self.final_layer_norm(hidden_states) @@ -634,7 +634,7 @@ index 224769fdf..6af548437 100644 # Add last layer if output_hidden_states: -@@ -1164,13 +1369,216 @@ class T5Stack(T5PreTrainedModel): +@@ -1164,13 +1368,216 @@ class T5Stack(T5PreTrainedModel): ] if v is not None ) @@ -857,7 +857,7 @@ index 224769fdf..6af548437 100644 T5_START_DOCSTRING = r""" -@@ -1541,6 +1949,38 @@ class T5Model(T5PreTrainedModel): +@@ -1541,6 +1948,38 @@ class T5Model(T5PreTrainedModel): ) @@ -896,7 +896,7 @@ index 224769fdf..6af548437 100644 @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) class T5ForConditionalGeneration(T5PreTrainedModel): _keys_to_ignore_on_load_unexpected = [ -@@ -1548,28 +1988,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1548,28 +1987,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] @@ -967,7 +967,7 @@ index 224769fdf..6af548437 100644 # Model parallel self.model_parallel = False -@@ -1637,25 +2100,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1637,25 +2099,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) @@ -994,7 +994,7 @@ index 224769fdf..6af548437 100644 r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., -@@ -1687,113 +2132,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1687,113 +2131,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) >>> # studies have shown that owning a dog is good for you. 
```""" @@ -1133,7 +1133,7 @@ index 224769fdf..6af548437 100644 attention_mask=None, head_mask=None, decoder_head_mask=None, -@@ -1804,8 +2173,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1804,8 +2172,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): **kwargs, ): # cut decoder_input_ids if past_key_values is used @@ -1144,7 +1144,7 @@ index 224769fdf..6af548437 100644 # Some generation methods already pass only the last input ID if input_ids.shape[1] > past_length: -@@ -1813,12 +2182,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1813,12 +2181,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): else: # Default to old behavior: keep only final ID remove_prefix_length = input_ids.shape[1] - 1 @@ -1161,7 +1161,7 @@ index 224769fdf..6af548437 100644 "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "head_mask": head_mask, -@@ -1826,6 +2197,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1826,6 +2196,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): "decoder_attention_mask": decoder_attention_mask, "cross_attn_head_mask": cross_attn_head_mask, "use_cache": use_cache, @@ -1169,7 +1169,7 @@ index 224769fdf..6af548437 100644 } def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): -@@ -1861,6 +2233,460 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1861,6 +2232,460 @@ class T5ForConditionalGeneration(T5PreTrainedModel): reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) return reordered_decoder_past @@ -1630,7 +1630,7 @@ index 224769fdf..6af548437 100644 @add_start_docstrings( "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", -@@ -1967,7 +2793,6 @@ class T5EncoderModel(T5PreTrainedModel): +@@ -1967,7 +2792,6 @@ class T5EncoderModel(T5PreTrainedModel): >>> last_hidden_states = outputs.last_hidden_state ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict -- Gitee From c99410d6e6c9018eeb9cc7975291fd242a343315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 12 Sep 2024 07:08:13 +0000 Subject: [PATCH 076/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/export=5Fmt5.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/T5/export_mt5.py | 181 ------------------ 1 file changed, 181 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/export_mt5.py diff --git a/MindIE/MindIE-Torch/built-in/T5/export_mt5.py b/MindIE/MindIE-Torch/built-in/T5/export_mt5.py deleted file mode 100644 index dc8308e362..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/export_mt5.py +++ /dev/null @@ -1,181 +0,0 @@ - -import torch -import torch_npu -import argparse -import os -import mindietorch -from transformers import MT5ForConditionalGeneration - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--output_dir", - type=str, - default="./models", - help="save dir" - ) - parser.add_argument( - "--model_path", - type=str, - default="./T5-Small", - help="T5 model path" - ) - parser.add_argument( - "--max_batchsize", - type=int, - default=1, - help="max batchsize when running" - ) - - parser.add_argument( - "--max_input_seq_len", - type=int, - default=256, - help="max input_sequence length when running" - ) - - - parser.add_argument( - "--device_id", - type=int, - default=0, - help="npu 
device id" - ) - return parser.parse_args() - - -class TextEncoderExport(torch.nn.Module): - def __init__(self, textencoder_model): - super(TextEncoderExport, self).__init__() - self.textencoder_model = textencoder_model - - def forward(self, input_ids): - return self.textencoder_model(input_ids=input_ids) - -class TextDecoderExport(torch.nn.Module): - def __init__(self, textdecoder_model): - super(TextDecoderExport, self).__init__() - self.textdecoder_model = textdecoder_model - - def forward(self, - input_ids, - encoder_hidden_states, - encoder_attention_mask, - past_key_values, - past_cross_key_values): - return self.textdecoder_model(input_ids=input_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - past_cross_key_values=past_cross_key_values, - return_dict=True) - -def export_textencoder(args, model, save_dir, batch_size): - encoder_path = os.path.join(save_dir, "encoder") - if not os.path.exists(encoder_path): - os.makedirs(encoder_path, mode=0o640) - traced_path = os.path.join(encoder_path, "encoder.pt") - compiled_path = os.path.join(encoder_path, "encoder_compiled.pt") - if not os.path.exists(traced_path): - text_encoder = model.encoder - dummy_input = ( - torch.ones([1, 128], dtype=torch.int64).npu() - ) - encoder = TextEncoderExport(text_encoder) - encoder.eval() - torch.jit.trace(encoder, dummy_input, strict=False).save(traced_path) - if not os.path.exists(compiled_path): - traced_model = torch.jit.load(traced_path).eval() - - inputs0 = [] - # inputs1 = [] - inputs0.append(mindietorch.Input(min_shape = (1,1), max_shape= (args.max_batchsize, args.max_input_seq_len), dtype=torch.int64)) - print("compiling encoder") - compiled_model = mindietorch.compile( - traced_model, - inputs=inputs0, - allow_tensor_replace_int=True, - require_full_compilation=False, - truncate_long_and_double=True, - precision_policy=mindietorch.PrecisionPolicy.FP16, - soc_version="Ascend910B4", - optimization_level=0 - ) - compiled_model.save(compiled_path) - -def export_textdecoder(args, model, save_dir, batch_size): - decoder_path = os.path.join(save_dir, "decoder") - if not os.path.exists(decoder_path): - os.makedirs(decoder_path, mode=0o640) - traced_path = os.path.join(decoder_path, "decoder.pt") - compiled_path = os.path.join(decoder_path, "decoder_compiled.pt") - model_path = args.model_path - max_lenth = 120 - if not os.path.exists(traced_path): - text_decoder = model.decoder - dummy_input = ( - torch.ones([1, 1], dtype=torch.int64).npu(), - torch.randn(1, 16, model.config.d_model).to(torch.float16).npu(), - torch.ones(1,16).npu(), - torch.randn(model.config.num_layers, 2, 1, model.config.num_heads, 1, model.config.d_kv).to(torch.float16).npu(), - torch.randn(model.config.num_layers, 2, 1, 16, model.config.d_kv*model.config.num_heads).to(torch.float16).npu() - ) - decoder = TextDecoderExport(text_decoder).npu() - decoder.eval() - torch.jit.trace(decoder, dummy_input,strict=False).save(traced_path) - if not os.path.exists(compiled_path): - traced_model = torch.jit.load(traced_path).eval() - print("compiling decoder") - compiled_model = mindietorch.compile( - traced_model, - inputs=[mindietorch.Input(min_shape =(1, 1), - max_shape = (args.max_batchsize,1), - dtype=mindietorch.dtype.INT64), - - mindietorch.Input(min_shape =(1, 1, model.config.d_model), - max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), - dtype=mindietorch.dtype.FLOAT16), - - mindietorch.Input(min_shape = (1,1), - max_shape 
=(args.max_batchsize,args.max_input_seq_len), - dtype=mindietorch.dtype.INT64), - mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, model.config.num_heads, 0, model.config.d_kv), - max_shape = (model.config.num_layers, 2, args.max_batchsize, model.config.num_heads, args.max_input_seq_len, model.config.d_kv), - dtype=mindietorch.dtype.FLOAT16), - - mindietorch.Input(min_shape = (model.config.num_layers, 2, 1, 1, model.config.d_kv*model.config.num_heads), - max_shape = (model.config.num_layers, 2, args.max_batchsize, args.max_input_seq_len,model.config.d_kv*model.config.num_heads), - dtype=mindietorch.dtype.FLOAT16)], - allow_tensor_replace_int=True, - require_full_compilation=False, - truncate_long_and_double=True, - precision_policy=mindietorch.PrecisionPolicy.FP16, - soc_version="Ascend910B4", - optimization_level=0 - ) - compiled_model.save(compiled_path) - -def main(): - args = parse_arguments() - device_id = args.device_id - save_dir = args.output_dir - torch.npu.set_device(device_id) - batch_size = 1 - model = MT5ForConditionalGeneration.from_pretrained(args.model_path, torch_dtype=torch.float).npu() - encoder_path = os.path.join(save_dir, "encoder") - compiled_path = os.path.join(encoder_path, "encoder_compiled.pt") - if not os.path.exists(compiled_path): - export_textencoder(args, model, save_dir, batch_size) - print("export encoder_model done!") - - decoder_path = os.path.join(save_dir, "decoder") - compiled_path = os.path.join(decoder_path, "decoder_compiled.pt") - if not os.path.exists(compiled_path): - export_textdecoder(args, model, save_dir, batch_size) - print("export decoder_model done!") - - - - -if __name__ == "__main__": - main() -- Gitee From 216d9ee41b56e57fea983cb995f0367f8b4c99fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 12 Sep 2024 07:08:30 +0000 Subject: [PATCH 077/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/test=5Fmt5.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/T5/test_mt5.py | 54 --------------------- 1 file changed, 54 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/test_mt5.py diff --git a/MindIE/MindIE-Torch/built-in/T5/test_mt5.py b/MindIE/MindIE-Torch/built-in/T5/test_mt5.py deleted file mode 100644 index af441392d4..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/test_mt5.py +++ /dev/null @@ -1,54 +0,0 @@ -import torch -import time -import argparse -import torch_npu -from transformers import MT5ForConditionalGeneration, AutoTokenizer, MT5Config - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--hf_model_path", type=str, required=True) - - parser.add_argument("--encoder_aie_path", type=str, required=True) - parser.add_argument("--decoder_aie_path", type=str, required=True) - - parser.add_argument("--device_id", type=int, help="NPU device id", default=0) - - args = parser.parse_args() - return args - -def main(): - args = parse_args() - torch.npu.set_device(args.device_id) - tokenizer = AutoTokenizer.from_pretrained(args.hf_model_path) - text = [ - "translate English to German: The house is wonderful.", - "summarize: I am a high-performance inference optimizer and runtime.", - "During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world", - ] - model = MT5ForConditionalGeneration.from_pretrained(args.hf_model_path, torch_dtype=torch.float16).npu() - 
encoder = model.encoder - decoder = model.decoder - encoder_input = torch.randint(0,2000,(8,10), dtype=torch.int64).npu() - t5_config = MT5Config.from_pretrained(args.hf_model_path) - - encoder_output = encoder(encoder_input)[0] - model = MT5ForConditionalGeneration(config=t5_config, - encoder_path=args.encoder_aie_path, - decoder_path=args.decoder_aie_path, - device_id=args.device_id).half().npu() - - encoder_mindie = model.encoder_mindie - decoder_mindie = model.decoder_mindie - mindie_stream = model.stream - with torch.npu.stream(mindie_stream): # set stream - mindie_encoder_output = encoder_mindie(encoder_input)[0] - mindie_stream.synchronize() # synchronize - if (torch.cosine_similarity(encoder_output.cpu().flatten(), mindie_encoder_output.cpu().flatten(),dim=0)) < 0.99: - print("encoder precision failed") - else: - print("test OK") - - -if __name__ == "__main__": - main() - -- Gitee From e3358cabfb41bbd3b2c4d9a9fcaeb2ceb01394ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 23 Sep 2024 11:05:31 +0000 Subject: [PATCH 078/110] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?= =?UTF-8?q?MindIE/MindIE-Torch/built-in/T5/perf=5Ftest=5Faie.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../MindIE-Torch/built-in/T5/perf_test_aie.py | 115 ------------------ 1 file changed, 115 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py diff --git a/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py b/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py deleted file mode 100644 index 97c02916fe..0000000000 --- a/MindIE/MindIE-Torch/built-in/T5/perf_test_aie.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import time -import argparse -import json - -import numpy as np -import torch -import torch_npu -import mindietorch -from tqdm import tqdm - -def test_encoder(aie_path, args, device_id = 0): - batch_size = args.batch_size - device_id = args.device_id - seq_len = args.seq_len - device = f'npu:{device_id}' - stream = torch.npu.Stream(f"npu:{device_id}") - print("Start loading ts module...") - ts = torch.jit.load(aie_path) - print("Ts module loaded.") - ts.eval() - dummy_input = (torch.ones([batch_size, seq_len], dtype=torch.int64).npu()) - print("Start infering...") - # warmup - for _ in range(10): - with torch.npu.stream(stream): - ts(dummy_input) - stream.synchronize() - - # performance test - num_infer = 100 - - start = time.time() - for _ in tqdm(range(num_infer)): - with torch.npu.stream(stream): - ts(dummy_input) - stream.synchronize() - end = time.time() - print(f"Encoder latency: {(end - start) / num_infer * 1000:.2f} ms") - print(f"Encoder throughput: {num_infer * batch_size / (end - start):.2f} fps") - - -def test_decoder(aie_path, args): - batch_size = args.batch_size - device_id = args.device_id - seq_len = args.seq_len - device = f'npu:{device_id}' - stream = torch.npu.Stream(f"npu:{device_id}") - print("Start loading ts module...") - ts = torch.jit.load(aie_path) - print("Ts module loaded.") - ts.eval() - dummy_input = ( - torch.ones([batch_size, 1], dtype=torch.int64).npu(), - torch.randn(batch_size,seq_len,512).to(torch.float16).npu(), - torch.ones(batch_size,seq_len, dtype=torch.int64).npu(), - torch.randn(6,2,batch_size,8,1,64).to(torch.float16).npu(), - torch.randn(6,2,batch_size,8,24,64).to(torch.float16).npu() - ) - - # warmup - for _ in range(10): - with torch.npu.stream(stream): - ts.forward(dummy_input[0],dummy_input[1],dummy_input[2],dummy_input[3],dummy_input[4]) - stream.synchronize() - - # performance test - num_infer = 100 - start = time.time() - for _ in tqdm(range(num_infer)): - with torch.npu.stream(stream): - ts.forward(dummy_input[0],dummy_input[1],dummy_input[2],dummy_input[3],dummy_input[4]) - stream.synchronize() - end = time.time() - - print(f"Decoder latency: {(end - start) / num_infer * 1000:.2f} ms") - print(f"Decoder throughput: {num_infer * batch_size / (end - start):.2f} fps") - -def parse_args(): - parser = argparse.ArgumentParser() - - parser.add_argument("--encoder_aie_path", type=str, required=True) - parser.add_argument("--decoder_aie_path", type=str, required=True) - parser.add_argument("--batch_size", type=int, help="NPU device id", default=1) - parser.add_argument("--seq_len", type=int, help="NPU device id", default=128) - - parser.add_argument("--device_id", type=int, help="NPU device id", default=0) - - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - torch.npu.set_device(args.device_id) - test_encoder(args.encoder_aie_path, args) - test_decoder(args.decoder_aie_path, args) - - -if __name__ == "__main__": - main() -- Gitee From d26ac1dd029258a464c01bd5f967881596a6149e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 23 Sep 2024 11:14:20 +0000 Subject: [PATCH 079/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_t5.patch | 115 +++++++++--------- 1 file changed, 55 insertions(+), 60 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch index e4bd899bde..74bda2bb86 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch @@ -1,8 +1,8 @@ -diff --git a/modeling_t5.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py -index 224769fdf..1c2d8d185 100644 ---- a/modeling_t5.py +diff --git a/modeling_t5_origin.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py +index 224769fdf..cfa27e8c6 100644 +--- a/modeling_t5_origin.py +++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py -@@ -19,22 +19,26 @@ import math +@@ -19,7 +19,7 @@ import math import os import warnings from typing import List, Optional, Tuple, Union @@ -11,13 +11,7 @@ index 224769fdf..1c2d8d185 100644 import torch from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -+# import torch_npu -+import mindietorch -+ -+ -+ - - from ...activations import ACT2FN +@@ -28,13 +28,12 @@ from ...activations import ACT2FN from ...modeling_outputs import ( BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, @@ -32,7 +26,7 @@ index 224769fdf..1c2d8d185 100644 from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( DUMMY_INPUTS, -@@ -47,8 +51,44 @@ from ...utils import ( +@@ -47,7 +46,43 @@ from ...utils import ( ) from ...utils.model_parallel_utils import assert_device_map, get_device_map from .configuration_t5 import T5Config @@ -40,13 +34,13 @@ index 224769fdf..1c2d8d185 100644 +from transformers.generation.stopping_criteria import StoppingCriteriaList +from transformers.generation.configuration_utils import GenerationMode +from transformers.utils.generic import ModelOutput - - ++ ++ +@dataclass +class Seq2SeqLMOutput(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. -+ + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
@@ -73,11 +67,10 @@ index 224769fdf..1c2d8d185 100644 + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None -+ + logger = logging.get_logger(__name__) - _CONFIG_FOR_DOC = "T5Config" -@@ -448,7 +488,10 @@ class T5Attention(nn.Module): +@@ -448,7 +483,10 @@ class T5Attention(nn.Module): mask=None, key_value_states=None, position_bias=None, @@ -89,7 +82,7 @@ index 224769fdf..1c2d8d185 100644 layer_head_mask=None, query_length=None, use_cache=False, -@@ -464,12 +507,8 @@ class T5Attention(nn.Module): +@@ -464,12 +502,8 @@ class T5Attention(nn.Module): real_seq_length = seq_length @@ -104,7 +97,7 @@ index 224769fdf..1c2d8d185 100644 key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] -@@ -493,16 +532,17 @@ class T5Attention(nn.Module): +@@ -493,16 +527,17 @@ class T5Attention(nn.Module): hidden_states = shape(proj_layer(key_value_states)) if past_key_value is not None: @@ -128,7 +121,7 @@ index 224769fdf..1c2d8d185 100644 else: # cross-attn hidden_states = past_key_value -@@ -513,17 +553,16 @@ class T5Attention(nn.Module): +@@ -513,17 +548,16 @@ class T5Attention(nn.Module): # get key/value states key_states = project( @@ -149,7 +142,7 @@ index 224769fdf..1c2d8d185 100644 if position_bias is None: if not self.has_relative_attention_bias: position_bias = torch.zeros( -@@ -536,7 +575,7 @@ class T5Attention(nn.Module): +@@ -536,7 +570,7 @@ class T5Attention(nn.Module): # if key and values are already calculated # we want only the last query position bias @@ -158,7 +151,7 @@ index 224769fdf..1c2d8d185 100644 position_bias = position_bias[:, :, -hidden_states.size(1) :, :] if mask is not None: -@@ -548,7 +587,6 @@ class T5Attention(nn.Module): +@@ -548,7 +582,6 @@ class T5Attention(nn.Module): position_bias_masked = position_bias[:, mask.bool()] else: position_bias_masked = position_bias @@ -166,7 +159,7 @@ index 224769fdf..1c2d8d185 100644 scores += position_bias_masked attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( scores -@@ -564,18 +602,131 @@ class T5Attention(nn.Module): +@@ -564,18 +597,131 @@ class T5Attention(nn.Module): attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) attn_output = self.o(attn_output) @@ -224,7 +217,7 @@ index 224769fdf..1c2d8d185 100644 + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(hidden_states)) -+ + + if past_key_value is not None: + hidden_states = shape(proj_layer(hidden_states)) + hidden_states = torch.cat([past_key_value, hidden_states], dim=2) @@ -281,7 +274,7 @@ index 224769fdf..1c2d8d185 100644 + + attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) - ++ + # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None + present_value_state = (value_states.half(), ) if (self.is_decoder and use_cache) else None @@ -301,7 +294,7 @@ index 224769fdf..1c2d8d185 100644 self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) -@@ -585,7 +736,8 @@ class T5LayerSelfAttention(nn.Module): +@@ -585,7 +731,8 @@ class T5LayerSelfAttention(nn.Module): attention_mask=None, position_bias=None, 
layer_head_mask=None, @@ -311,7 +304,7 @@ index 224769fdf..1c2d8d185 100644 use_cache=False, output_attentions=False, ): -@@ -595,7 +747,8 @@ class T5LayerSelfAttention(nn.Module): +@@ -595,7 +742,8 @@ class T5LayerSelfAttention(nn.Module): mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, @@ -321,7 +314,7 @@ index 224769fdf..1c2d8d185 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -618,7 +771,8 @@ class T5LayerCrossAttention(nn.Module): +@@ -618,7 +766,8 @@ class T5LayerCrossAttention(nn.Module): attention_mask=None, position_bias=None, layer_head_mask=None, @@ -331,7 +324,7 @@ index 224769fdf..1c2d8d185 100644 use_cache=False, query_length=None, output_attentions=False, -@@ -630,7 +784,8 @@ class T5LayerCrossAttention(nn.Module): +@@ -630,7 +779,8 @@ class T5LayerCrossAttention(nn.Module): key_value_states=key_value_states, position_bias=position_bias, layer_head_mask=layer_head_mask, @@ -341,7 +334,7 @@ index 224769fdf..1c2d8d185 100644 use_cache=use_cache, query_length=query_length, output_attentions=output_attentions, -@@ -661,39 +816,34 @@ class T5Block(nn.Module): +@@ -661,39 +811,34 @@ class T5Block(nn.Module): encoder_decoder_position_bias=None, layer_head_mask=None, cross_attn_layer_head_mask=None, @@ -395,7 +388,7 @@ index 224769fdf..1c2d8d185 100644 # clamp inf values to enable fp16 training if hidden_states.dtype == torch.float16: -@@ -706,22 +856,23 @@ class T5Block(nn.Module): +@@ -706,22 +851,23 @@ class T5Block(nn.Module): do_cross_attention = self.is_decoder and encoder_hidden_states is not None if do_cross_attention: @@ -424,7 +417,7 @@ index 224769fdf..1c2d8d185 100644 output_attentions=output_attentions, ) hidden_states = cross_attention_outputs[0] -@@ -736,11 +887,9 @@ class T5Block(nn.Module): +@@ -736,11 +882,9 @@ class T5Block(nn.Module): hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) # Combine self attn and cross attn key value states @@ -438,7 +431,7 @@ index 224769fdf..1c2d8d185 100644 # Apply Feed Forward layer hidden_states = self.layer[-1](hidden_states) -@@ -757,7 +906,7 @@ class T5Block(nn.Module): +@@ -757,7 +901,7 @@ class T5Block(nn.Module): outputs = (hidden_states,) if use_cache: @@ -447,7 +440,7 @@ index 224769fdf..1c2d8d185 100644 else: outputs = outputs + attention_outputs -@@ -897,11 +1046,15 @@ class T5PreTrainedModel(PreTrainedModel): +@@ -897,11 +1041,15 @@ class T5PreTrainedModel(PreTrainedModel): class T5Stack(T5PreTrainedModel): @@ -464,7 +457,7 @@ index 224769fdf..1c2d8d185 100644 self.block = nn.ModuleList( [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -966,20 +1119,63 @@ class T5Stack(T5PreTrainedModel): +@@ -966,20 +1114,63 @@ class T5Stack(T5PreTrainedModel): def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings @@ -530,7 +523,7 @@ index 224769fdf..1c2d8d185 100644 ): # Model parallel if self.model_parallel: -@@ -998,8 +1194,10 @@ class T5Stack(T5PreTrainedModel): +@@ -998,8 +1189,10 @@ class T5Stack(T5PreTrainedModel): f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: @@ -541,7 +534,7 @@ index 224769fdf..1c2d8d185 100644 elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: -@@ -1012,18 +1210,19 @@ class T5Stack(T5PreTrainedModel): +@@ -1012,18 +1205,19 @@ class T5Stack(T5PreTrainedModel): inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = 
input_shape @@ -566,7 +559,7 @@ index 224769fdf..1c2d8d185 100644 if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) -@@ -1054,7 +1253,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1054,7 +1248,8 @@ class T5Stack(T5PreTrainedModel): # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) @@ -576,7 +569,7 @@ index 224769fdf..1c2d8d185 100644 all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1062,8 +1262,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1062,8 +1257,8 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) @@ -587,7 +580,7 @@ index 224769fdf..1c2d8d185 100644 layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] # Model parallel -@@ -1112,7 +1312,10 @@ class T5Stack(T5PreTrainedModel): +@@ -1112,7 +1307,10 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, @@ -599,7 +592,7 @@ index 224769fdf..1c2d8d185 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -1120,19 +1323,20 @@ class T5Stack(T5PreTrainedModel): +@@ -1120,19 +1318,20 @@ class T5Stack(T5PreTrainedModel): # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: @@ -625,7 +618,7 @@ index 224769fdf..1c2d8d185 100644 if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) -@@ -1146,7 +1350,7 @@ class T5Stack(T5PreTrainedModel): +@@ -1146,7 +1345,7 @@ class T5Stack(T5PreTrainedModel): hidden_states = hidden_states.to("cuda:" + str(k + 1)) hidden_states = self.final_layer_norm(hidden_states) @@ -634,7 +627,7 @@ index 224769fdf..1c2d8d185 100644 # Add last layer if output_hidden_states: -@@ -1164,13 +1368,216 @@ class T5Stack(T5PreTrainedModel): +@@ -1164,13 +1363,216 @@ class T5Stack(T5PreTrainedModel): ] if v is not None ) @@ -857,7 +850,7 @@ index 224769fdf..1c2d8d185 100644 T5_START_DOCSTRING = r""" -@@ -1541,6 +1948,38 @@ class T5Model(T5PreTrainedModel): +@@ -1541,6 +1943,38 @@ class T5Model(T5PreTrainedModel): ) @@ -896,7 +889,7 @@ index 224769fdf..1c2d8d185 100644 @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) class T5ForConditionalGeneration(T5PreTrainedModel): _keys_to_ignore_on_load_unexpected = [ -@@ -1548,28 +1987,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1548,28 +1982,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] @@ -967,7 +960,7 @@ index 224769fdf..1c2d8d185 100644 # Model parallel self.model_parallel = False -@@ -1637,25 +2099,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1637,25 +2094,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) @@ -994,7 +987,7 @@ index 224769fdf..1c2d8d185 100644 r""" labels (`torch.LongTensor` of 
shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., -@@ -1687,113 +2131,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1687,113 +2126,40 @@ class T5ForConditionalGeneration(T5PreTrainedModel): >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) >>> # studies have shown that owning a dog is good for you. ```""" @@ -1077,6 +1070,9 @@ index 224769fdf..1c2d8d185 100644 - - lm_logits = self.lm_head(sequence_output) + if self.is_mindie: ++ print("aaaaaaaaaaaaaaaa") ++ # import pdb ++ # pdb.set_trace() + with torch.npu.stream(self.stream): # set stream + decoder_outputs = self.decoder_mindie.forward(*args) + self.stream.synchronize() # synchronize @@ -1133,7 +1129,7 @@ index 224769fdf..1c2d8d185 100644 attention_mask=None, head_mask=None, decoder_head_mask=None, -@@ -1804,8 +2172,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1804,8 +2170,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): **kwargs, ): # cut decoder_input_ids if past_key_values is used @@ -1144,7 +1140,7 @@ index 224769fdf..1c2d8d185 100644 # Some generation methods already pass only the last input ID if input_ids.shape[1] > past_length: -@@ -1813,12 +2181,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1813,12 +2179,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): else: # Default to old behavior: keep only final ID remove_prefix_length = input_ids.shape[1] - 1 @@ -1161,7 +1157,7 @@ index 224769fdf..1c2d8d185 100644 "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "head_mask": head_mask, -@@ -1826,6 +2196,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1826,6 +2194,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): "decoder_attention_mask": decoder_attention_mask, "cross_attn_head_mask": cross_attn_head_mask, "use_cache": use_cache, @@ -1169,7 +1165,7 @@ index 224769fdf..1c2d8d185 100644 } def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): -@@ -1861,6 +2232,460 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1861,6 +2230,459 @@ class T5ForConditionalGeneration(T5PreTrainedModel): reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) return reordered_decoder_past @@ -1191,11 +1187,12 @@ index 224769fdf..1c2d8d185 100644 + model_input_name = model_input_name if model_input_name is not None else self.main_input_name + encoder_kwargs["return_dict"] = True + encoder_kwargs[model_input_name] = inputs_tensor -+ import time -+ start_time = time.time() -+ with torch.npu.stream(self.stream): # set stream -+ encoder_outputs=self.encoder_mindie.forward(encoder_kwargs["input_ids"]) -+ self.stream.synchronize() # synchronize ++ if self.is_mindie: ++ with torch.npu.stream(self.stream): # set stream ++ encoder_outputs=self.encoder_mindie.forward(encoder_kwargs["input_ids"]) ++ self.stream.synchronize() # synchronize ++ else: ++ encoder_outputs=self.encoder.forward(**encoder_kwargs) + model_kwargs["encoder_outputs"]={"last_hidden_state":encoder_outputs[0]} + model_kwargs["past_cross_keys"] = encoder_outputs[1] + model_kwargs["past_cross_values"] =encoder_outputs[2] @@ -1243,8 +1240,6 @@ index 224769fdf..1c2d8d185 100644 + **kwargs, + ): + # 1. 
Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call -+ import time -+ start_time = time.time() + self._validate_model_class() + tokenizer = kwargs.pop("tokenizer", None) # Pull this out first, we only use it for stopping criteria + generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs) @@ -1630,7 +1625,7 @@ index 224769fdf..1c2d8d185 100644 @add_start_docstrings( "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", -@@ -1967,7 +2792,6 @@ class T5EncoderModel(T5PreTrainedModel): +@@ -1967,7 +2789,6 @@ class T5EncoderModel(T5PreTrainedModel): >>> last_hidden_states = outputs.last_hidden_state ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict -- Gitee From 82ae72a9cf69f468af23edacd67b1c699dcb57ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 23 Sep 2024 11:14:39 +0000 Subject: [PATCH 080/110] update MindIE/MindIE-Torch/built-in/T5/main.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/main.py b/MindIE/MindIE-Torch/built-in/T5/main.py index ccad949c44..28d85df24a 100644 --- a/MindIE/MindIE-Torch/built-in/T5/main.py +++ b/MindIE/MindIE-Torch/built-in/T5/main.py @@ -1,4 +1,6 @@ import torch +import torch_npu +import mindietorch import time import argparse from transformers import T5ForConditionalGeneration, AutoTokenizer, T5Config -- Gitee From 1eb296aea57aec90db68e42e18f046a7c2396470 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 23 Sep 2024 12:26:03 +0000 Subject: [PATCH 081/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../MindIE-Torch/built-in/T5/modeling_t5.patch | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch index 74bda2bb86..26b0ce5e87 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch @@ -1,5 +1,5 @@ diff --git a/modeling_t5_origin.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py -index 224769fdf..cfa27e8c6 100644 +index 224769fdf..65c058e6e 100644 --- a/modeling_t5_origin.py +++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py @@ -19,7 +19,7 @@ import math @@ -987,7 +987,7 @@ index 224769fdf..cfa27e8c6 100644 r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., -@@ -1687,113 +2126,40 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1687,113 +2126,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) >>> # studies have shown that owning a dog is good for you. 
```""" @@ -1070,9 +1070,6 @@ index 224769fdf..cfa27e8c6 100644 - - lm_logits = self.lm_head(sequence_output) + if self.is_mindie: -+ print("aaaaaaaaaaaaaaaa") -+ # import pdb -+ # pdb.set_trace() + with torch.npu.stream(self.stream): # set stream + decoder_outputs = self.decoder_mindie.forward(*args) + self.stream.synchronize() # synchronize @@ -1129,7 +1126,7 @@ index 224769fdf..cfa27e8c6 100644 attention_mask=None, head_mask=None, decoder_head_mask=None, -@@ -1804,8 +2170,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1804,8 +2167,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): **kwargs, ): # cut decoder_input_ids if past_key_values is used @@ -1140,7 +1137,7 @@ index 224769fdf..cfa27e8c6 100644 # Some generation methods already pass only the last input ID if input_ids.shape[1] > past_length: -@@ -1813,12 +2179,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1813,12 +2176,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): else: # Default to old behavior: keep only final ID remove_prefix_length = input_ids.shape[1] - 1 @@ -1157,7 +1154,7 @@ index 224769fdf..cfa27e8c6 100644 "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "head_mask": head_mask, -@@ -1826,6 +2194,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1826,6 +2191,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): "decoder_attention_mask": decoder_attention_mask, "cross_attn_head_mask": cross_attn_head_mask, "use_cache": use_cache, @@ -1165,7 +1162,7 @@ index 224769fdf..cfa27e8c6 100644 } def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): -@@ -1861,6 +2230,459 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1861,6 +2227,459 @@ class T5ForConditionalGeneration(T5PreTrainedModel): reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) return reordered_decoder_past @@ -1625,7 +1622,7 @@ index 224769fdf..cfa27e8c6 100644 @add_start_docstrings( "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", -@@ -1967,7 +2789,6 @@ class T5EncoderModel(T5PreTrainedModel): +@@ -1967,7 +2786,6 @@ class T5EncoderModel(T5PreTrainedModel): >>> last_hidden_states = outputs.last_hidden_state ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict -- Gitee From 25e9d1ce45045cef7a624a136174002cb532097e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 03:31:32 +0000 Subject: [PATCH 082/110] add MindIE/MindIE-Torch/built-in/T5/export_t5_800IA2.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/export_t5_800IA2.py | 202 ++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/T5/export_t5_800IA2.py diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5_800IA2.py b/MindIE/MindIE-Torch/built-in/T5/export_t5_800IA2.py new file mode 100644 index 0000000000..e150e8e93a --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/T5/export_t5_800IA2.py @@ -0,0 +1,202 @@ + +import torch +import torch_npu +import argparse +import os +import math +import mindietorch +from transformers import T5ForConditionalGeneration + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--output_dir", + type=str, + default="./models", + help="save dir" + ) + parser.add_argument( + "--model_path", + type=str, + default="./T5-Small", + help="T5 model path" + ) + parser.add_argument( + "--max_batchsize", + type=int, + default=1, + help="max batchsize when running" + ) + + parser.add_argument( + "--max_input_seq_len", + type=int, + default=256, + help="max input_sequence length when running" + ) + + + parser.add_argument( + "--device_id", + type=int, + default=0, + help="npu device id" + ) + return parser.parse_args() + + +class TextEncoderExport(torch.nn.Module): + def __init__(self, textencoder_model): + super(TextEncoderExport, self).__init__() + self.textencoder_model = textencoder_model + + def forward(self, input_ids,attention_mask): + return self.textencoder_model(input_ids=input_ids,attention_mask=attention_mask) + +class TextDecoderExport(torch.nn.Module): + def __init__(self, textdecoder_model): + super(TextDecoderExport, self).__init__() + self.textdecoder_model = textdecoder_model + + def forward(self, + *args): + return self.textdecoder_model(*args) + +def export_textencoder(args, model, save_dir, batch_size): + encoder_path = os.path.join(save_dir, "encoder") + if not os.path.exists(encoder_path): + os.makedirs(encoder_path, mode=0o640) + traced_path = os.path.join(encoder_path, "encoder.pt") + compiled_path = os.path.join(encoder_path, "encoder_compiled.pt") + if not os.path.exists(traced_path): + text_encoder = model.encoder + dummy_input = ( + torch.ones([1, 128], dtype=torch.int64).npu(), + torch.ones([1, 1,128,128], dtype=torch.bool).npu() + ) + encoder = TextEncoderExport(text_encoder) + encoder.eval() + torch.jit.trace(encoder, dummy_input, strict=False).save(traced_path) + if not os.path.exists(compiled_path): + traced_model = torch.jit.load(traced_path).eval() + + inputs0 = [] + inputs0.append(mindietorch.Input(min_shape = (1,1), max_shape= (args.max_batchsize, args.max_input_seq_len), dtype=torch.int64)) + inputs0.append(mindietorch.Input(min_shape = (1,1,1,1), max_shape= (args.max_batchsize, 1,args.max_input_seq_len,args.max_input_seq_len), dtype=torch.bool)) + print("compiling encoder") + compiled_model = mindietorch.compile( + traced_model, + inputs=inputs0, + allow_tensor_replace_int=True, + require_full_compilation=False, + truncate_long_and_double=True, + precision_policy=mindietorch.PrecisionPolicy.FP16, + soc_version="Ascend910B4", + optimization_level=0 + ) + compiled_model.save(compiled_path) + +def export_textdecoder(args, model, save_dir, batch_size): + decoder_path = os.path.join(save_dir, "decoder") + if not os.path.exists(decoder_path): + os.makedirs(decoder_path, mode=0o640) + traced_path = os.path.join(decoder_path, "decoder.pt") + compiled_path = os.path.join(decoder_path, 
"decoder_compiled.pt") + model_path = args.model_path + max_lenth = 120 + if not os.path.exists(traced_path): + text_decoder = model + all_past_keys = [torch.randn([1, 1, model.config.d_kv*model.config.num_heads]).to(torch.float16).npu()] * model.config.num_layers + all_past_values = [torch.randn([1, 1, model.config.d_kv*model.config.num_heads]).to(torch.float16).npu()] * model.config.num_layers + all_past_cross_keys = [torch.randn([1, 16, model.config.d_kv*model.config.num_heads]).to(torch.float16).npu()] * model.config.num_layers + all_past_cross_values = [torch.randn([1, 16, model.config.d_kv*model.config.num_heads]).to(torch.float16).npu()] * model.config.num_layers + dummy_input = [torch.randn(1, 16, model.config.d_kv*model.config.num_heads).to(torch.float16).npu()] + dummy_input.extend(all_past_cross_keys) + dummy_input.extend(all_past_cross_values) + dummy_input.extend(all_past_keys) + dummy_input.extend(all_past_values) + # encoder_attention_mask + dummy_input.append(torch.ones((1,1,16,16),dtype=torch.bool).npu()) + # decoder_input_ids + dummy_input.append(torch.ones([1, 1], dtype=torch.int64).npu()) + dummy_input.append(torch.ones([1, 1, 1, 1], dtype=torch.bool).npu()) + # decoder_attention_mask + decoder = TextDecoderExport(text_decoder).npu() + decoder.eval() + torch.jit.trace(decoder, dummy_input,strict=False).save(traced_path) + if not os.path.exists(compiled_path): + traced_model = torch.jit.load(traced_path).eval() + print("compiling decoder") + input_info = [mindietorch.Input(min_shape =(1, 1, model.config.d_model), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_model), + dtype=mindietorch.dtype.FLOAT16)] + past_cross_key_infos = [mindietorch.Input(min_shape =(1, 1, model.config.num_heads*model.config.d_kv), + max_shape=(args.max_batchsize,args.max_input_seq_len, model.config.num_heads*model.config.d_kv), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_cross_value_infos = [mindietorch.Input(min_shape =(1, 1, model.config.d_kv*model.config.num_heads), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_kv*model.config.num_heads), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_key_infos = [mindietorch.Input(min_shape =(1, 0, model.config.d_kv*model.config.num_heads), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_kv*model.config.num_heads), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + past_value_infos = [mindietorch.Input(min_shape =(1, 0, model.config.d_kv*model.config.num_heads), + max_shape=(args.max_batchsize, args.max_input_seq_len, model.config.d_kv*model.config.num_heads), + dtype=mindietorch.dtype.FLOAT16)] * model.config.num_layers + decoder_input_ids_info = [mindietorch.Input(min_shape =(1, 1), + max_shape = (args.max_batchsize,1), + dtype=mindietorch.dtype.INT64)] + encoder_attention_mask_info = [mindietorch.Input(min_shape =(1, 1,1, 1), + max_shape = (args.max_batchsize, 1, args.max_input_seq_len,args.max_input_seq_len), + dtype=mindietorch.dtype.BOOL)] + decoder_attention_mask_info = [mindietorch.Input(min_shape =(1, 1,1,1), + max_shape = (args.max_batchsize,1,args.max_input_seq_len,args.max_input_seq_len), + dtype=mindietorch.dtype.BOOL)] + input_info.extend(past_cross_key_infos) + input_info.extend(past_cross_value_infos) + input_info.extend(past_key_infos) + input_info.extend(past_value_infos) + input_info.extend(encoder_attention_mask_info) + input_info.extend(decoder_input_ids_info) + 
input_info.extend(decoder_attention_mask_info) + buffer = [] + for _ in range(2*model.config.num_layers): + buffer.append(math.ceil((args.max_batchsize * args.max_input_seq_len * model.config.d_model * 2) / 1024 / 1024)) + buffer_size0 = math.ceil((args.max_batchsize * 1 * model.config.vocab_size * 4) / 1024 / 1024) + buffer.append(buffer_size0) + print("buffer=",buffer) + compiled_model = mindietorch.compile( + traced_model, + inputs=input_info, + allow_tensor_replace_int=True, + require_full_compilation=False, + truncate_long_and_double=True, + precision_policy=mindietorch.PrecisionPolicy.FP16, + soc_version="Ascend910B4", + default_buffer_size_vec=buffer, + optimization_level=0 + ) + compiled_model.save(compiled_path) + + +def main(): + args = parse_arguments() + device_id = args.device_id + save_dir = args.output_dir + torch.npu.set_device(device_id) + batch_size = 1 + model = T5ForConditionalGeneration.from_pretrained(args.model_path, torch_dtype=torch.float).npu() + encoder_path = os.path.join(save_dir, "encoder") + compiled_path = os.path.join(encoder_path, "encoder_compiled.pt") + if not os.path.exists(compiled_path): + export_textencoder(args, model, save_dir, batch_size) + print("export encoder_model done!") + + decoder_path = os.path.join(save_dir, "decoder") + compiled_path = os.path.join(decoder_path, "decoder_compiled.pt") + if not os.path.exists(compiled_path): + export_textdecoder(args, model, save_dir, batch_size) + print("export decoder_model done!") + + +if __name__ == "__main__": + main() -- Gitee From c00de3f5a57bb3d2d02b2d6b2043370d20863cab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 03:32:36 +0000 Subject: [PATCH 083/110] update MindIE/MindIE-Torch/built-in/T5/export_t5.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/export_t5.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5.py b/MindIE/MindIE-Torch/built-in/T5/export_t5.py index e152265ae9..995274d0bd 100644 --- a/MindIE/MindIE-Torch/built-in/T5/export_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/export_t5.py @@ -89,7 +89,7 @@ def export_textencoder(args, model, save_dir, batch_size): require_full_compilation=False, truncate_long_and_double=True, precision_policy=mindietorch.PrecisionPolicy.FP16, - soc_version="Ascend910B4", + soc_version="Ascend310P3", optimization_level=0 ) compiled_model.save(compiled_path) @@ -161,7 +161,7 @@ def export_textdecoder(args, model, save_dir, batch_size): require_full_compilation=False, truncate_long_and_double=True, precision_policy=mindietorch.PrecisionPolicy.FP16, - soc_version="Ascend910B4", + soc_version="Ascend310P3", default_buffer_size_vec=buffer, optimization_level=0 ) -- Gitee From fc525501a51d52301798d67e0d4f9a81b133ee43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 06:03:48 +0000 Subject: [PATCH 084/110] add MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 538d828573d04f265b703086cc221c0f08988814 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 06:04:12 +0000 Subject: [PATCH 085/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_t5_800IA2.patch | 1594 +++++++++++++++++ 1 file changed, 1594 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch index e69de29bb2..664b4359ce 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5_800IA2.patch @@ -0,0 +1,1594 @@ +diff --git a/modeling_t5_origin.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py +index 224769fdf..4f9ffd74f 100644 +--- a/modeling_t5_origin.py ++++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py +@@ -19,22 +19,26 @@ import math + import os + import warnings + from typing import List, Optional, Tuple, Union +- ++from dataclasses import dataclass + import torch + from torch import nn + from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss ++# import torch_npu ++# import mindietorch ++ ++ ++ + + from ...activations import ACT2FN + from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, +- Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, + TokenClassifierOutput, + ) +-from ...modeling_utils import PreTrainedModel ++from ...modeling_utils import PreTrainedModel,ModuleUtilsMixin + from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer + from ...utils import ( + DUMMY_INPUTS, +@@ -47,7 +51,43 @@ from ...utils import ( + ) + from ...utils.model_parallel_utils import assert_device_map, get_device_map + from .configuration_t5 import T5Config ++from transformers.generation.logits_process import LogitsProcessorList ++from transformers.generation.stopping_criteria import StoppingCriteriaList ++from transformers.generation.configuration_utils import GenerationMode ++from transformers.utils.generic import ModelOutput ++ ++ ++@dataclass ++class Seq2SeqLMOutput(ModelOutput): ++ """ ++ Base class for model's outputs, with potential hidden states and attentions. + ++ Args: ++ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): ++ Sequence of hidden-states at the output of the last layer of the model. ++ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): ++ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + ++ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. 
++ ++ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. ++ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): ++ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, ++ sequence_length)`. ++ ++ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention ++ heads. ++ """ ++ loss: Optional[torch.FloatTensor] = None ++ logits: torch.FloatTensor = None ++ past_keys: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ past_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ++ decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None ++ decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None ++ cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None ++ encoder_last_hidden_state: Optional[torch.FloatTensor] = None ++ encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None ++ encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + logger = logging.get_logger(__name__) + +@@ -360,6 +400,7 @@ class T5Attention(nn.Module): + self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) + self.pruned_heads = set() + self.gradient_checkpointing = False ++ self.lay_out = "BSH" + + def prune_heads(self, heads): + if len(heads) == 0: +@@ -448,7 +489,10 @@ class T5Attention(nn.Module): + mask=None, + key_value_states=None, + position_bias=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, ++ past_cross_key=None, ++ past_cross_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, +@@ -464,81 +508,86 @@ class T5Attention(nn.Module): + + real_seq_length = seq_length + +- if past_key_value is not None: +- if len(past_key_value) != 2: +- raise ValueError( +- f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" +- ) +- real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length ++ if past_key is not None: ++ real_seq_length += past_key.shape[1] if query_length is None else query_length + + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] ++ # BSH ++ query_states = self.q(hidden_states) ++ key_states = past_key ++ value_states = past_value ++ attn_output = torch.ops.aie.flash_attention(query_states,key_states,value_states,self.n_heads,attn_mask=mask) ++ # mask = mask.expand(3,1,16,mask.shape[3]).bool() ++ # attn_output = torch_npu.npu_prompt_flash_attention(query_states,key_states,value_states,atten_mask=mask,num_heads=self.n_heads,input_layout="BSH") ++ attn_output = self.o(attn_output) ++ present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None ++ present_value_state = (value_states.half(),) if (self.is_decoder and use_cache) else None ++ outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) ++ return outputs ++ ++ ++class T5SelfAttention(T5Attention): ++ def __init__(self, config: T5Config, has_relative_attention_bias=False): ++ super().__init__(config, has_relative_attention_bias) + +- def shape(states): +- """projection""" +- return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) ++ def forward( ++ self, ++ hidden_states, ++ mask=None, ++ position_bias=None, ++ past_key=None, ++ past_value=None, ++ layer_head_mask=None, ++ use_cache=False, ++ output_attentions=False, ++ ): ++ """ ++ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). ++ """ ++ # Input is (batch_size, seq_length, dim) ++ # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) ++ # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) ++ batch_size, seq_length = hidden_states.shape[:2] + +- def unshape(states): +- """reshape""" +- return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) ++ real_seq_length = seq_length ++ ++ if past_key is not None: ++ real_seq_length += past_key.shape[1] ++ key_length = real_seq_length + +- def project(hidden_states, proj_layer, key_value_states, past_key_value): ++ def project(hidden_states, proj_layer, past_key_value): + """projects hidden states correctly to key/query states""" +- if key_value_states is None: +- # self-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(hidden_states)) +- elif past_key_value is None: +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(key_value_states)) ++ if past_key_value is None: ++ hidden_states = proj_layer(hidden_states) + + if past_key_value is not None: +- if key_value_states is None: +- # self-attn +- # (batch_size, n_heads, key_length, dim_per_head) +- hidden_states = torch.cat([past_key_value, hidden_states], dim=2) +- elif past_key_value.shape[2] != key_value_states.shape[1]: +- # checking that the `sequence_length` of the `past_key_value` is the same as +- # the provided `key_value_states` to support prefix tuning +- # cross-attn +- # (batch_size, n_heads, seq_length, dim_per_head) +- hidden_states = shape(proj_layer(key_value_states)) +- else: +- # cross-attn +- hidden_states = past_key_value ++ hidden_states = proj_layer(hidden_states) ++ hidden_states = torch.cat([past_key_value, hidden_states], dim=1) + return 
hidden_states + + # get query states +- query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) +- ++ query_states = self.q(hidden_states) + # get key/value states + key_states = project( +- hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None ++ hidden_states, self.k, past_key if past_key is not None else None + ) + value_states = project( +- hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None ++ hidden_states, self.v, past_value if past_value is not None else None + ) +- +- # compute scores +- scores = torch.matmul( +- query_states, key_states.transpose(3, 2) +- ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 +- + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = torch.zeros( +- (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype ++ (1, self.n_heads, real_seq_length, key_length), device=query_states.device, dtype=query_states.dtype + ) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: +- position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) ++ position_bias = self.compute_bias(real_seq_length, key_length, device=query_states.device) + + # if key and values are already calculated + # we want only the last query position bias +- if past_key_value is not None: ++ if past_key is not None: + position_bias = position_bias[:, :, -hidden_states.size(1) :, :] +- + if mask is not None: + position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + +@@ -548,34 +597,26 @@ class T5Attention(nn.Module): + position_bias_masked = position_bias[:, mask.bool()] + else: + position_bias_masked = position_bias +- +- scores += position_bias_masked +- attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( +- scores +- ) # (batch_size, n_heads, seq_length, key_length) +- attn_weights = nn.functional.dropout( +- attn_weights, p=self.dropout, training=self.training +- ) # (batch_size, n_heads, seq_length, key_length) +- +- # Mask heads if we want to +- if layer_head_mask is not None: +- attn_weights = attn_weights * layer_head_mask +- +- attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) ++ # scores += position_bias_masked ++ # attn_output = torch.ops.aie.flash_attention(query_states,key_states,value_states,self.n_heads,pse=position_bias_masked) ++ attn_output = torch.ops.aie.flash_attention(query_states,key_states,value_states,self.n_heads,pse=position_bias_masked,attn_mask=mask) ++ # print("mask=",mask,mask.shape) ++ # mask = mask.expand(3,1,16,mask.shape[3]).bool() ++ # attn_output = torch_npu.npu_prompt_flash_attention(query_states,key_states,value_states,pse_shift=position_bias_masked, atten_mask=mask,num_heads=self.n_heads,input_layout="BSH") + attn_output = self.o(attn_output) ++ # present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None ++ present_key_state = (key_states.half(), ) if (self.is_decoder and use_cache) else None ++ present_value_state = (value_states.half(), ) if (self.is_decoder and use_cache) else None ++ outputs = (attn_output,) + (present_key_state,) + (present_value_state,) + (position_bias,) ++ return outputs + +- present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None +- outputs = 
(attn_output,) + (present_key_value_state,) + (position_bias,) + +- if output_attentions: +- outputs = outputs + (attn_weights,) +- return outputs + + + class T5LayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() +- self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) ++ self.SelfAttention = T5SelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + +@@ -585,7 +626,8 @@ class T5LayerSelfAttention(nn.Module): + attention_mask=None, + position_bias=None, + layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, + use_cache=False, + output_attentions=False, + ): +@@ -595,7 +637,8 @@ class T5LayerSelfAttention(nn.Module): + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -618,7 +661,8 @@ class T5LayerCrossAttention(nn.Module): + attention_mask=None, + position_bias=None, + layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, + use_cache=False, + query_length=None, + output_attentions=False, +@@ -630,7 +674,8 @@ class T5LayerCrossAttention(nn.Module): + key_value_states=key_value_states, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, + use_cache=use_cache, + query_length=query_length, + output_attentions=output_attentions, +@@ -661,39 +706,34 @@ class T5Block(nn.Module): + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, +- past_key_value=None, ++ past_key=None, ++ past_value=None, ++ past_cross_key=None, ++ past_cross_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): +- if past_key_value is not None: +- if not self.is_decoder: +- logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") +- expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 +- +- if len(past_key_value) != expected_num_past_key_values: +- raise ValueError( +- f"There should be {expected_num_past_key_values} past states. " +- f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" +- f"Got {len(past_key_value)} past key / value states" +- ) +- +- self_attn_past_key_value = past_key_value[:2] +- cross_attn_past_key_value = past_key_value[2:] ++ if past_key is not None: ++ self_attn_past_key = past_key ++ self_attn_past_value = past_value ++ cross_attn_past_key = past_cross_key ++ cross_attn_past_value = past_cross_value + else: +- self_attn_past_key_value, cross_attn_past_key_value = None, None ++ self_attn_past_key, self_attn_past_value, cross_attn_past_key, cross_attn_past_value = None, None, None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, +- past_key_value=self_attn_past_key_value, ++ past_key=self_attn_past_key, ++ past_value=self_attn_past_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +- hidden_states, present_key_value_state = self_attention_outputs[:2] +- attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights ++ hidden_states, present_key_state, present_value_state = self_attention_outputs[:3] ++ attention_outputs = self_attention_outputs[3:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: +@@ -706,22 +746,23 @@ class T5Block(nn.Module): + + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: ++ + # the actual query length is unknown for cross attention + # if using past key value states. Need to inject it here +- if present_key_value_state is not None: +- query_length = present_key_value_state[0].shape[2] ++ if present_key_state is not None: ++ query_length = present_key_state[0].shape[1] + else: + query_length = None +- + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, +- past_key_value=cross_attn_past_key_value, ++ past_key=cross_attn_past_key, ++ past_value=cross_attn_past_value, + query_length=query_length, +- use_cache=use_cache, ++ use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] +@@ -736,11 +777,9 @@ class T5Block(nn.Module): + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states +- if present_key_value_state is not None: +- present_key_value_state = present_key_value_state + cross_attention_outputs[1] +- ++ # cross_attn_past_key_values = cross_attention_outputs[1] + # Keep cross-attention outputs and relative position weights +- attention_outputs = attention_outputs + cross_attention_outputs[2:] ++ attention_outputs = attention_outputs + cross_attention_outputs[3:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) +@@ -757,7 +796,7 @@ class T5Block(nn.Module): + outputs = (hidden_states,) + + if use_cache: +- outputs = outputs + (present_key_value_state,) + attention_outputs ++ outputs = outputs + (present_key_state,) +(present_value_state,)+ attention_outputs + else: + outputs = outputs + attention_outputs + +@@ -897,11 +936,15 @@ class T5PreTrainedModel(PreTrainedModel): + + + class T5Stack(T5PreTrainedModel): +- def __init__(self, config, embed_tokens=None): ++ def __init__(self, config, 
embed_tokens=None,lm_head=None, encodecrosskey=None, encodecrossvalue=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder ++ self.lm_head=lm_head ++ self.encodecrosskey = encodecrosskey ++ self.encodecrossvalue = encodecrossvalue ++ self.model_dim = config.d_model + + self.block = nn.ModuleList( + [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] +@@ -966,16 +1009,48 @@ class T5Stack(T5PreTrainedModel): + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + ++ ++ def get_extended_attention_mask( ++ self, attention_mask, input_shape, device=None, dtype=None ++ ): ++ if dtype is None: ++ dtype = self.dtype ++ ++ if not (attention_mask.dim() == 2 and self.config.is_decoder): ++ if device is not None: ++ warnings.warn( ++ "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning ++ ) ++ if attention_mask.dim() == 3: ++ extended_attention_mask = attention_mask[:, None, :, :] ++ elif attention_mask.dim() == 2: ++ if self.config.is_decoder: ++ extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( ++ input_shape, attention_mask, device ++ ) ++ else: ++ extended_attention_mask = attention_mask[:, None, None, :] ++ else: ++ raise ValueError( ++ f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ++ ) ++ extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility ++ extended_attention_mask = (1.0 - extended_attention_mask) * -1000 ++ return extended_attention_mask ++ + def forward( + self, + input_ids=None, +- attention_mask=None, + encoder_hidden_states=None, ++ past_keys=None, ++ past_values=None, ++ past_cross_keys=None, ++ past_cross_values=None, + encoder_attention_mask=None, ++ attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, +- past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, +@@ -998,8 +1073,10 @@ class T5Stack(T5PreTrainedModel): + f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" + ) + elif input_ids is not None: ++ + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) ++ input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: +@@ -1012,25 +1089,29 @@ class T5Stack(T5PreTrainedModel): + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape +- + # required mask seq length can be calculated via length of past +- mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length ++ mask_seq_length = past_keys[0].shape[1] + seq_length if past_keys is not None else seq_length + + if use_cache is True: + if not self.is_decoder: + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") + + # initialize past_key_values with `None` if past does not exist +- if past_key_values is None: +- past_key_values = [None] * len(self.block) +- ++ if not self.is_decoder: ++ past_keys = [None] * len(self.block) ++ past_values = [None] * len(self.block) ++ past_cross_keys = [None] * len(self.block) ++ past_cross_values = [None] * len(self.block) + if attention_mask is None: +- attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) ++ print("aaaaaaaaaaaaaaaaa") ++ 
attention_mask = torch.zeros(batch_size, mask_seq_length, device=inputs_embeds.device) ++ attention_mask = attention_mask[:,None,None,:].expand(batch_size,1,mask_seq_length,mask_seq_length).bool() + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. +- extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) +- ++ # extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) ++ extended_attention_mask = attention_mask ++ # print("extended_attention_mask=",extended_attention_mask) + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.is_decoder and encoder_hidden_states is not None: +@@ -1040,7 +1121,7 @@ class T5Stack(T5PreTrainedModel): + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long + ) +- encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) ++ encoder_extended_attention_mask = encoder_attention_mask + else: + encoder_extended_attention_mask = None + +@@ -1054,7 +1135,8 @@ class T5Stack(T5PreTrainedModel): + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) +- present_key_value_states = () if use_cache else None ++ present_key_states = () if use_cache else None ++ present_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None +@@ -1062,8 +1144,8 @@ class T5Stack(T5PreTrainedModel): + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) +- +- for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): ++ # for i, layer_module in enumerate(self.block): ++ for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel +@@ -1112,7 +1194,10 @@ class T5Stack(T5PreTrainedModel): + encoder_decoder_position_bias=encoder_decoder_position_bias, + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, +- past_key_value=past_key_value, ++ past_key=past_key, ++ past_value=past_value, ++ past_cross_key=past_cross_key, ++ past_cross_value=past_cross_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) +@@ -1120,19 +1205,20 @@ class T5Stack(T5PreTrainedModel): + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: +- layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] ++ layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] + +- hidden_states, present_key_value_state = layer_outputs[:2] ++ hidden_states, present_key_state, present_value_state = layer_outputs[:3] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states 
(self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) +- position_bias = layer_outputs[2] ++ position_bias = layer_outputs[3] + if self.is_decoder and encoder_hidden_states is not None: +- encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] ++ encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 4] + # append next layer key value states + if use_cache: +- present_key_value_states = present_key_value_states + (present_key_value_state,) ++ present_key_states = present_key_states + present_key_state ++ present_value_states = present_value_states + present_value_state + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) +@@ -1146,31 +1232,158 @@ class T5Stack(T5PreTrainedModel): + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) +- hidden_states = self.dropout(hidden_states) ++ hidden_states = self.dropout(hidden_states).half() + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) ++ if self.config.tie_word_embeddings: ++ hidden_states = hidden_states * (self.model_dim ** -0.5) ++ lm_logits = self.lm_head(hidden_states) ++ return tuple((lm_logits, present_key_states, present_value_states)) + +- if not return_dict: +- return tuple( +- v +- for v in [ +- hidden_states, +- present_key_value_states, +- all_hidden_states, +- all_attentions, +- all_cross_attentions, +- ] +- if v is not None +- ) +- return BaseModelOutputWithPastAndCrossAttentions( +- last_hidden_state=hidden_states, +- past_key_values=present_key_value_states, +- hidden_states=all_hidden_states, +- attentions=all_attentions, +- cross_attentions=all_cross_attentions, ++ ++class T5Stack_Encoder(T5PreTrainedModel): ++ def __init__(self, config, embed_tokens=None, encodecrosskey=None, encodecrossvalue=None): ++ super().__init__(config) ++ self.embed_tokens = embed_tokens ++ self.is_decoder = config.is_decoder ++ self.encodecrosskey = encodecrosskey ++ self.encodecrossvalue = encodecrossvalue ++ self.model_dim = config.d_model ++ ++ self.block = nn.ModuleList( ++ [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] ++ ) ++ self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) ++ self.dropout = nn.Dropout(config.dropout_rate) ++ ++ # Initialize weights and apply final processing ++ self.post_init() ++ # Model parallel ++ self.model_parallel = False ++ self.device_map = None ++ self.gradient_checkpointing = False ++ ++ def get_input_embeddings(self): ++ return self.embed_tokens ++ ++ def set_input_embeddings(self, new_embeddings): ++ self.embed_tokens = new_embeddings ++ ++ ++ ++ def get_extended_attention_mask( ++ self, attention_mask, input_shape, device=None, dtype=None ++ ): ++ extended_attention_mask = attention_mask[:,None,None,:].expand(input_shape[0],1,input_shape[1],input_shape[1]).bool() ++ extended_attention_mask = ~extended_attention_mask ++ return extended_attention_mask ++ ++ def forward( ++ self, ++ input_ids=None, ++ attention_mask=None, ++ head_mask=None, ++ cross_attn_head_mask=None, ++ use_cache=None, ++ output_attentions=None, ++ output_hidden_states=None, ++ return_dict=None, ++ ): ++ # Model parallel ++ use_cache = use_cache if use_cache is not None else self.config.use_cache ++ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions ++ 
output_hidden_states = ( ++ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) ++ return_dict = return_dict if return_dict is not None else self.config.use_return_dict ++ ++ input_shape = input_ids.size() ++ input_ids = input_ids.view(-1, input_shape[-1]) ++ ++ inputs_embeds = self.embed_tokens(input_ids) ++ ++ batch_size, seq_length = input_shape ++ # required mask seq length can be calculated via length of past ++ mask_seq_length = seq_length ++ ++ # initialize past_key_values with `None` if past does not exist ++ past_keys = [None] * len(self.block) ++ past_values = [None] * len(self.block) ++ past_cross_keys = [None] * len(self.block) ++ past_cross_values = [None] * len(self.block) ++ # print("attention_mask=",attention_mask) ++ if attention_mask is None: ++ attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) ++ encoder_extended_attention_mask = None ++ # Prepare head mask if needed ++ head_mask = self.get_head_mask(head_mask, self.config.num_layers) ++ cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) ++ present_key_states = () if use_cache else None ++ present_value_states = () if use_cache else None ++ all_hidden_states = () if output_hidden_states else None ++ all_attentions = () if output_attentions else None ++ all_cross_attentions = () if (output_attentions and self.is_decoder) else None ++ position_bias = None ++ encoder_decoder_position_bias = None ++ ++ hidden_states = self.dropout(inputs_embeds) ++ for i, (layer_module, past_key, past_value, past_cross_key, past_cross_value) in enumerate(zip(self.block, past_keys, past_values, past_cross_keys, past_cross_values)): ++ layer_head_mask = head_mask[i] ++ cross_attn_layer_head_mask = cross_attn_head_mask[i] ++ if output_hidden_states: ++ all_hidden_states = all_hidden_states + (hidden_states,) ++ layer_outputs = layer_module( ++ hidden_states, ++ attention_mask=attention_mask, ++ position_bias=position_bias, ++ encoder_hidden_states=None, ++ encoder_attention_mask=encoder_extended_attention_mask, ++ encoder_decoder_position_bias=encoder_decoder_position_bias, ++ layer_head_mask=layer_head_mask, ++ cross_attn_layer_head_mask=cross_attn_layer_head_mask, ++ past_key=past_key, ++ past_value=past_value, ++ past_cross_key=past_cross_key, ++ past_cross_value=past_cross_value, ++ use_cache=use_cache, ++ output_attentions=output_attentions, ++ ) ++ ++ # layer_outputs is a tuple with: ++ # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) ++ if use_cache is False: ++ layer_outputs = layer_outputs[:1] + (None,) +(None,) + layer_outputs[1:] ++ ++ hidden_states, present_key_state, present_value_state = layer_outputs[:3] ++ ++ # We share the position biases between the layers - the first layer store them ++ # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), ++ # (cross-attention position bias), (cross-attention weights) ++ position_bias = layer_outputs[3] ++ # append next layer key value states ++ if use_cache: ++ present_key_states = present_key_states + present_key_state ++ present_value_states = present_value_states + present_value_state ++ ++ if output_attentions: ++ all_attentions = all_attentions + (layer_outputs[3],) ++ if self.is_decoder: ++ all_cross_attentions = all_cross_attentions + (layer_outputs[5],) ++ ++ hidden_states = self.final_layer_norm(hidden_states) 
++ hidden_states = self.dropout(hidden_states).half() ++ ++ # Add last layer ++ if output_hidden_states: ++ all_hidden_states = all_hidden_states + (hidden_states,) ++ ++ if self.encodecrosskey: ++ cross_keys = self.encodecrosskey(hidden_states) ++ if self.encodecrossvalue: ++ cross_values = self.encodecrossvalue(hidden_states) ++ return tuple((hidden_states, cross_keys, cross_values)) + + + T5_START_DOCSTRING = r""" +@@ -1541,6 +1754,41 @@ class T5Model(T5PreTrainedModel): + ) + + ++class EncoderToCrossKey(nn.Module): ++ def __init__(self, cross_key, num_heads, d_kv): ++ super().__init__() ++ self.cross_key = cross_key ++ self.num_heads = num_heads ++ self.d_kv = d_kv ++ ++ ++ def forward(self, hidden_states): ++ batch_size = hidden_states.shape[0] ++ past_cross_keys = () ++ for i in range(len(self.cross_key)): ++ # past_cross_keys += (self.cross_key[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1,2),) ++ past_cross_keys += (self.cross_key[i](hidden_states),) ++ return past_cross_keys ++ ++ ++class EncoderToCrossValue(nn.Module): ++ def __init__(self, cross_value, num_heads, d_kv): ++ super().__init__() ++ self.cross_value = cross_value ++ self.num_heads = num_heads ++ self.d_kv = d_kv ++ ++ ++ def forward(self, hidden_states): ++ batch_size = hidden_states.shape[0] ++ past_cross_values = () ++ for i in range(len(self.cross_value)): ++ # past_cross_values += (self.cross_value[i](hidden_states).view(batch_size, -1, self.num_heads, self.d_kv).transpose(1,2),) ++ past_cross_values += (self.cross_value[i](hidden_states),) ++ # print("aaa",past_cross_values[0].shape) ++ return past_cross_values ++ ++ + @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) + class T5ForConditionalGeneration(T5PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [ +@@ -1548,28 +1796,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + ] + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + +- def __init__(self, config: T5Config): ++ def __init__(self, config: T5Config, encoder_path=None, decoder_path=None, device_id=0): + super().__init__(config) +- self.model_dim = config.d_model +- +- self.shared = nn.Embedding(config.vocab_size, config.d_model) +- +- encoder_config = copy.deepcopy(config) +- encoder_config.is_decoder = False +- encoder_config.use_cache = False +- encoder_config.is_encoder_decoder = False +- self.encoder = T5Stack(encoder_config, self.shared) +- +- decoder_config = copy.deepcopy(config) +- decoder_config.is_decoder = True +- decoder_config.is_encoder_decoder = False +- decoder_config.num_layers = config.num_decoder_layers +- self.decoder = T5Stack(decoder_config, self.shared) +- +- self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) ++ self.encoder_path = encoder_path ++ self.decoder_path = decoder_path ++ self.is_mindie = False ++ if not self.encoder_path or not self.decoder_path: ++ self.model_dim = config.d_model ++ ++ self.shared = nn.Embedding(config.vocab_size, config.d_model) ++ ++ decoder_config = copy.deepcopy(config) ++ decoder_config.is_decoder = True ++ decoder_config.is_encoder_decoder = False ++ decoder_config.num_layers = config.num_decoder_layers ++ ++ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) ++ self.decoder = T5Stack(decoder_config, self.shared, self.lm_head) ++ ++ cross_key = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.k for i in range(config.num_decoder_layers)) ++ 
cross_value = nn.ModuleList(self.decoder.block[i].layer[1].EncDecAttention.v for i in range(config.num_decoder_layers)) ++ encodecrosskey = EncoderToCrossKey(cross_key, config.num_heads, config.d_kv) ++ encodecrossvalue = EncoderToCrossValue(cross_value, config.num_heads, config.d_kv) ++ ++ encoder_config = copy.deepcopy(config) ++ encoder_config.is_decoder = False ++ encoder_config.use_cache = False ++ encoder_config.is_encoder_decoder = False ++ self.encoder = T5Stack_Encoder(encoder_config, self.shared, encodecrosskey=encodecrosskey, encodecrossvalue=encodecrossvalue) ++ self.encoder_mindie = None ++ self.decoder_mindie = None ++ if self.encoder_path: ++ self.encoder_mindie = torch.jit.load(self.encoder_path) ++ self.is_mindie = True ++ if self.decoder_path: ++ self.decoder_mindie = torch.jit.load(self.decoder_path) ++ ++ self.stream = torch.npu.Stream(f"npu:{device_id}") ++ self.device_id = device_id ++ ++ ++ def get_device(self): ++ return f"npu:{self.device_id}" + + # Initialize weights and apply final processing +- self.post_init() ++ # self.post_init() + + # Model parallel + self.model_parallel = False +@@ -1637,25 +1908,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + + @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) +- def forward( +- self, +- input_ids: Optional[torch.LongTensor] = None, +- attention_mask: Optional[torch.FloatTensor] = None, +- decoder_input_ids: Optional[torch.LongTensor] = None, +- decoder_attention_mask: Optional[torch.BoolTensor] = None, +- head_mask: Optional[torch.FloatTensor] = None, +- decoder_head_mask: Optional[torch.FloatTensor] = None, +- cross_attn_head_mask: Optional[torch.Tensor] = None, +- encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, +- inputs_embeds: Optional[torch.FloatTensor] = None, +- decoder_inputs_embeds: Optional[torch.FloatTensor] = None, +- labels: Optional[torch.LongTensor] = None, +- use_cache: Optional[bool] = None, +- output_attentions: Optional[bool] = None, +- output_hidden_states: Optional[bool] = None, +- return_dict: Optional[bool] = None, +- ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: ++ def forward(self,*args) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., +@@ -1687,113 +1940,36 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) + >>> # studies have shown that owning a dog is good for you. 
+ ```""" +- use_cache = use_cache if use_cache is not None else self.config.use_cache +- return_dict = return_dict if return_dict is not None else self.config.use_return_dict +- +- # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +- if head_mask is not None and decoder_head_mask is None: +- if self.config.num_layers == self.config.num_decoder_layers: +- warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) +- decoder_head_mask = head_mask +- +- # Encode if needed (training, first prediction pass) +- if encoder_outputs is None: +- # Convert encoder inputs in embeddings if needed +- encoder_outputs = self.encoder( +- input_ids=input_ids, +- attention_mask=attention_mask, +- inputs_embeds=inputs_embeds, +- head_mask=head_mask, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): +- encoder_outputs = BaseModelOutput( +- last_hidden_state=encoder_outputs[0], +- hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, +- attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, +- ) +- +- hidden_states = encoder_outputs[0] +- +- if self.model_parallel: +- torch.cuda.set_device(self.decoder.first_device) +- +- if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: +- # get decoder inputs from shifting lm labels to the right +- decoder_input_ids = self._shift_right(labels) +- +- # Set device for model parallelism +- if self.model_parallel: +- torch.cuda.set_device(self.decoder.first_device) +- hidden_states = hidden_states.to(self.decoder.first_device) +- if decoder_input_ids is not None: +- decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) +- if attention_mask is not None: +- attention_mask = attention_mask.to(self.decoder.first_device) +- if decoder_attention_mask is not None: +- decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) +- +- # Decode +- decoder_outputs = self.decoder( +- input_ids=decoder_input_ids, +- attention_mask=decoder_attention_mask, +- inputs_embeds=decoder_inputs_embeds, +- past_key_values=past_key_values, +- encoder_hidden_states=hidden_states, +- encoder_attention_mask=attention_mask, +- head_mask=decoder_head_mask, +- cross_attn_head_mask=cross_attn_head_mask, +- use_cache=use_cache, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) +- +- sequence_output = decoder_outputs[0] +- +- # Set device for model parallelism +- if self.model_parallel: +- torch.cuda.set_device(self.encoder.first_device) +- self.lm_head = self.lm_head.to(self.encoder.first_device) +- sequence_output = sequence_output.to(self.lm_head.weight.device) +- +- if self.config.tie_word_embeddings: +- # Rescale output before projecting on vocab +- # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 +- sequence_output = sequence_output * (self.model_dim**-0.5) +- +- lm_logits = self.lm_head(sequence_output) +- +- loss = None +- if labels is not None: +- loss_fct = CrossEntropyLoss(ignore_index=-100) +- # move labels to correct device to enable PP +- labels = labels.to(lm_logits.device) +- loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) +- # TODO(thom): Add z_loss 
https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 +- +- if not return_dict: +- output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs +- return ((loss,) + output) if loss is not None else output +- +- return Seq2SeqLMOutput( +- loss=loss, +- logits=lm_logits, +- past_key_values=decoder_outputs.past_key_values, +- decoder_hidden_states=decoder_outputs.hidden_states, +- decoder_attentions=decoder_outputs.attentions, +- cross_attentions=decoder_outputs.cross_attentions, +- encoder_last_hidden_state=encoder_outputs.last_hidden_state, +- encoder_hidden_states=encoder_outputs.hidden_states, +- encoder_attentions=encoder_outputs.attentions, +- ) ++ if self.is_mindie: ++ with torch.npu.stream(self.stream): # set stream ++ decoder_outputs = self.decoder_mindie.forward(*args) ++ self.stream.synchronize() # synchronize ++ else: ++ hidden_states = args[0] ++ past_cross_keys = args[1:self.config.num_decoder_layers+1] ++ past_cross_values = args[self.config.num_decoder_layers+1:2*self.config.num_decoder_layers+1] ++ past_keys= args[2*self.config.num_decoder_layers+1:3*self.config.num_decoder_layers+1] ++ past_values= args[3*self.config.num_decoder_layers+1:4*self.config.num_decoder_layers+1] ++ encoder_attention_mask = args[-3] ++ decoder_input_ids = args[-2] ++ decoder_attention_mask = args[-1] ++ decoder_outputs = self.decoder(input_ids=decoder_input_ids, ++ encoder_hidden_states=hidden_states, ++ past_keys=past_keys, ++ past_values=past_values, ++ past_cross_keys=past_cross_keys, ++ past_cross_values=past_cross_values, ++ encoder_attention_mask=encoder_attention_mask, ++ attention_mask=decoder_attention_mask) ++ return (decoder_outputs[0],decoder_outputs[1],decoder_outputs[2]) + + def prepare_inputs_for_generation( + self, + input_ids, +- past_key_values=None, ++ past_cross_keys=None, ++ past_cross_values=None, ++ past_keys=None, ++ past_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, +@@ -1804,8 +1980,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + **kwargs, + ): + # cut decoder_input_ids if past_key_values is used +- if past_key_values is not None: +- past_length = past_key_values[0][0].shape[2] ++ if past_keys is not None: ++ past_length = past_keys[0].shape[1] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: +@@ -1813,12 +1989,19 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 +- + input_ids = input_ids[:, remove_prefix_length:] + ++ batch_size, seq_length = input_ids.shape ++ # required mask seq length can be calculated via length of past ++ mask_seq_length = past_keys[0].shape[1] + seq_length if past_keys is not None else seq_length ++ decoder_attention_mask = torch.zeros(batch_size, mask_seq_length, device=input_ids.device) ++ decoder_attention_mask = decoder_attention_mask[:,None,None,:].expand(batch_size,1,mask_seq_length,mask_seq_length).bool() + return { + "decoder_input_ids": input_ids, +- "past_key_values": past_key_values, ++ "past_cross_keys":past_cross_keys, ++ "past_cross_values":past_cross_values, ++ "past_keys":past_keys, ++ "past_values":past_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, +@@ -1826,6 +2009,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + "decoder_attention_mask": decoder_attention_mask, + "cross_attn_head_mask": 
cross_attn_head_mask, + "use_cache": use_cache, ++ + } + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): +@@ -1861,6 +2045,440 @@ class T5ForConditionalGeneration(T5PreTrainedModel): + reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) + return reordered_decoder_past + ++ def _prepare_encoder_decoder_kwargs_for_generation( ++ self, ++ inputs_tensor: torch.Tensor, ++ model_kwargs, ++ model_input_name, ++ generation_config, ++ ): ++ irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"] ++ encoder_kwargs = { ++ argument: value ++ for argument, value in model_kwargs.items() ++ if not any(argument.startswith(p) for p in irrelevant_prefix) ++ } ++ encoder_kwargs["output_attentions"] = generation_config.output_attentions ++ encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states ++ model_input_name = model_input_name if model_input_name is not None else self.main_input_name ++ encoder_kwargs["return_dict"] = True ++ encoder_kwargs[model_input_name] = inputs_tensor ++ encoder_outputs = None ++ if self.is_mindie: ++ with torch.npu.stream(self.stream): # set stream ++ encoder_outputs=self.encoder_mindie.forward(encoder_kwargs["input_ids"],encoder_kwargs["attention_mask"]) ++ self.stream.synchronize() # synchronize ++ else: ++ encoder_outputs=self.encoder.forward(**encoder_kwargs) ++ model_kwargs["encoder_outputs"]={"last_hidden_state":encoder_outputs[0]} ++ model_kwargs["past_cross_keys"] = encoder_outputs[1] ++ model_kwargs["past_cross_values"] =encoder_outputs[2] ++ # print("model_kwargs=",model_kwargs) ++ return model_kwargs ++ ++ def _update_model_kwargs_for_generation( ++ self, ++ outputs, ++ model_kwargs, ++ is_encoder_decoder = False, ++ standardize_cache_format = False, ++ num_new_tokens = 1, ++ ): ++ # update past_key_values keeping its naming used in model code ++ cache_name, cache = self._extract_past_from_model_output( ++ outputs, standardize_cache_format=standardize_cache_format ++ ) ++ model_kwargs[cache_name] = cache ++ if "past_keys" in outputs: ++ past_keys = outputs.past_keys ++ model_kwargs["past_keys"] = past_keys ++ if "past_values" in outputs: ++ past_values = outputs.past_values ++ model_kwargs["past_values"] = past_values ++ # update decoder attention mask ++ if "decoder_attention_mask" in model_kwargs: ++ decoder_attention_mask = model_kwargs["decoder_attention_mask"] ++ model_kwargs["decoder_attention_mask"] = torch.cat( ++ [decoder_attention_mask, decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1))], ++ dim=-1, ++ ) ++ return model_kwargs ++ ++ @torch.no_grad() ++ def generate( ++ self, ++ inputs = None, ++ generation_config = None, ++ logits_processor = None, ++ stopping_criteria = None, ++ prefix_allowed_tokens_fn = None, ++ assistant_model = None, ++ negative_prompt_ids = None, ++ negative_prompt_attention_mask = None, ++ **kwargs, ++ ): ++ # 1. 
Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call ++ self._validate_model_class() ++ tokenizer = kwargs.pop("tokenizer", None) # Pull this out first, we only use it for stopping criteria ++ generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs) ++ self._validate_model_kwargs(model_kwargs.copy()) ++ ++ ++ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() ++ stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() ++ ++ accepts_attention_mask = True ++ requires_attention_mask = "encoder_outputs" not in model_kwargs ++ kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None ++ ++ # 3. Define model inputs ++ inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs( ++ inputs, generation_config.bos_token_id, model_kwargs ++ ) ++ batch_size = inputs_tensor.shape[0] ++ seq_len = inputs_tensor.shape[1] ++ device = inputs_tensor.device ++ self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=device) ++ ++ # 4. Define other model kwargs ++ # decoder-only models with inputs_embeds forwarding must use caching (otherwise we can't detect whether we are ++ # generating the first new token or not, and we only want to use the embeddings for the first new token) ++ if not self.config.is_encoder_decoder and model_input_name == "inputs_embeds": ++ model_kwargs["use_cache"] = True ++ else: ++ model_kwargs["use_cache"] = generation_config.use_cache ++ if not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask: ++ model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( ++ inputs_tensor, generation_config.pad_token_id, generation_config.eos_token_id ++ ) ++ attention_mask = model_kwargs["attention_mask"] ++ attention_mask = attention_mask[:,None,None,:].expand(batch_size,1,seq_len,seq_len).bool() ++ model_kwargs["attention_mask"] = ~attention_mask ++ if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs: ++ # if model is encoder decoder encoder_outputs are created and added to `model_kwargs` ++ model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation( ++ inputs_tensor, model_kwargs, model_input_name, generation_config ++ ) ++ ++ # 5. Prepare `input_ids` which will be used for auto-regressive generation ++ if self.config.is_encoder_decoder: ++ input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation( ++ batch_size=batch_size, ++ model_input_name=model_input_name, ++ model_kwargs=model_kwargs, ++ decoder_start_token_id=generation_config.decoder_start_token_id, ++ device=inputs_tensor.device, ++ ) ++ else: ++ input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids") ++ ++ if generation_config.token_healing: ++ input_ids = self.heal_tokens(input_ids, tokenizer) ++ ++ # 6. Prepare `max_length` depending on other stopping criteria. 
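The patched `generate()` above replaces the usual additive mask with an inverted 4-D boolean mask before handing the inputs to the MindIE-compiled encoder. A minimal sketch of that transformation, assuming plain PyTorch tensors on CPU with illustrative shapes and values:

```python
import torch

# 2-D padding mask as produced by the tokenizer: 1 = real token, 0 = padding
pad_mask = torch.tensor([[1, 1, 1, 0, 0],
                         [1, 1, 1, 1, 1]])
batch_size, seq_len = pad_mask.shape

# Broadcast to [batch, 1, seq_len, seq_len] and invert: True marks positions to ignore
mask_4d = pad_mask[:, None, None, :].expand(batch_size, 1, seq_len, seq_len).bool()
inverted = ~mask_4d
print(inverted.shape)   # torch.Size([2, 1, 5, 5])
print(inverted[0, 0])   # last two columns are True (masked) for the padded sequence
```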
++ input_ids_length = input_ids.shape[-1] ++ has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None ++ has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None ++ generation_config = self._prepare_generated_length( ++ generation_config=generation_config, ++ has_default_max_length=has_default_max_length, ++ has_default_min_length=has_default_min_length, ++ model_input_name=model_input_name, ++ inputs_tensor=inputs_tensor, ++ input_ids_length=input_ids_length, ++ ) ++ ++ use_dynamic_cache_by_default = False ++ if generation_config.cache_implementation is not None and model_kwargs.get("past_key_values") is not None: ++ raise ValueError( ++ "Passing both `cache_implementation` (used to initialize certain caches) and `past_key_values` (a " ++ "Cache object) is unsupported. Please use only one of the two." ++ ) ++ elif generation_config.cache_implementation is not None: ++ if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING: ++ if generation_config.cache_implementation == "static" and not self._supports_static_cache: ++ raise ValueError( ++ "This model does not support `cache_implementation='static'`. Please check the following " ++ "issue: https://github.com/huggingface/transformers/issues/28981" ++ ) ++ model_kwargs["past_key_values"] = self._get_cache( ++ generation_config.cache_implementation, ++ getattr(generation_config, "num_beams", 1) * batch_size, ++ generation_config.max_length, ++ ) ++ elif generation_config.cache_implementation == "quantized": ++ if not self._supports_quantized_cache: ++ raise ValueError( ++ "This model does not support the quantized cache. If you want your model to support quantized " ++ "cache, please open an issue." ++ ) ++ ++ cache_config = ( ++ generation_config.cache_config ++ if generation_config.cache_config is not None ++ else QuantizedCacheConfig() ++ ) ++ cache_class = QUANT_BACKEND_CLASSES_MAPPING[cache_config.backend] ++ ++ if cache_config.backend == "quanto" and not is_quanto_available(): ++ raise ImportError( ++ "You need to install `quanto` in order to use KV cache quantization with quanto backend. " ++ "Please install it via with `pip install quanto`" ++ ) ++ elif cache_config.backend == "HQQ" and not is_hqq_available(): ++ raise ImportError( ++ "You need to install `HQQ` in order to use KV cache quantization with HQQ backend. " ++ "Please install it via with `pip install hqq`" ++ ) ++ ++ model_kwargs["past_key_values"] = cache_class(cache_config) ++ # Use DynamicCache() instance by default. This will avoid back and forth from legacy format that ++ # keeps copying the cache thus using much more memory ++ elif generation_config.cache_implementation is None and self._supports_default_dynamic_cache(): ++ past = model_kwargs.get("past_key_values", None) ++ if past is None: ++ model_kwargs["past_key_values"] = DynamicCache() ++ use_dynamic_cache_by_default = True ++ elif isinstance(past, tuple): ++ model_kwargs["past_key_values"] = DynamicCache.from_legacy_cache(past) ++ use_dynamic_cache_by_default = True ++ ++ self._validate_generated_length(generation_config, input_ids_length, has_default_max_length) ++ ++ # 7. determine generation mode ++ generation_mode = generation_config.get_generation_mode(assistant_model) ++ # 8. 
prepare distribution pre_processing samplers ++ prepared_logits_processor = self._get_logits_processor( ++ generation_config=generation_config, ++ input_ids_seq_length=input_ids_length, ++ encoder_input_ids=inputs_tensor, ++ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, ++ logits_processor=logits_processor, ++ device=inputs_tensor.device, ++ model_kwargs=model_kwargs, ++ negative_prompt_ids=negative_prompt_ids, ++ negative_prompt_attention_mask=negative_prompt_attention_mask, ++ ) ++ ++ # 9. prepare stopping criteria ++ prepared_stopping_criteria = self._get_stopping_criteria( ++ generation_config=generation_config, stopping_criteria=stopping_criteria, tokenizer=tokenizer, **kwargs ++ ) ++ ++ if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): ++ # 11. prepare logits warper ++ prepared_logits_warper = ( ++ self._get_logits_warper(generation_config, device=input_ids.device) ++ if generation_config.do_sample ++ else None ++ ) ++ ++ # 12. expand input_ids with `num_return_sequences` additional sequences per batch ++ input_ids, model_kwargs = self._expand_inputs_for_generation( ++ input_ids=input_ids, ++ expand_size=generation_config.num_return_sequences, ++ is_encoder_decoder=self.config.is_encoder_decoder, ++ **model_kwargs, ++ ) ++ # 13. run sample (it degenerates to greedy search when `generation_config.do_sample=False`) ++ result = self._sample( ++ input_ids, ++ logits_processor=prepared_logits_processor, ++ logits_warper=prepared_logits_warper, ++ stopping_criteria=prepared_stopping_criteria, ++ generation_config=generation_config, ++ **model_kwargs, ++ ) ++ return result ++ ++ def _sample( ++ self, ++ input_ids, ++ logits_processor, ++ stopping_criteria, ++ generation_config, ++ logits_warper = None, ++ **model_kwargs, ++ ): ++ # init values ++ pad_token_id = generation_config.pad_token_id ++ output_attentions = generation_config.output_attentions ++ output_hidden_states = generation_config.output_hidden_states ++ output_scores = generation_config.output_scores ++ output_logits = generation_config.output_logits ++ return_dict_in_generate = generation_config.return_dict_in_generate ++ has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria) ++ do_sample = generation_config.do_sample ++ if do_sample is True and not isinstance(logits_warper, LogitsProcessorList): ++ raise ValueError( ++ "`do_sample` is set to `True`, `logits_warper` must be a `LogitsProcessorList` instance (it is " ++ f"{logits_warper})." 
++ ) ++ ++ # init attention / hidden states / scores tuples ++ scores = () if (return_dict_in_generate and output_scores) else None ++ raw_logits = () if (return_dict_in_generate and output_logits) else None ++ decoder_attentions = () if (return_dict_in_generate and output_attentions) else None ++ cross_attentions = () if (return_dict_in_generate and output_attentions) else None ++ decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None ++ ++ # if model is an encoder-decoder, retrieve encoder attention weights and hidden states ++ if return_dict_in_generate and self.config.is_encoder_decoder: ++ encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None ++ encoder_hidden_states = ( ++ model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None ++ ) ++ ++ this_peer_finished = False ++ batch_size = input_ids.shape[0] ++ unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) ++ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) ++ ++ # keep track of which sequences are already finished ++ if self.is_mindie or self.config.architectures[0]=="T5ForConditionalGeneration": ++ num_layers = self.config.num_layers ++ num_heads = self.config.num_heads ++ d_kv = self.config.d_kv ++ model_kwargs["past_keys"] = [torch.randn(batch_size, 0, num_heads*d_kv).half().npu() for _ in range(num_layers)] ++ model_kwargs["past_values"] = [torch.randn(batch_size, 0, num_heads*d_kv).half().npu() for _ in range(num_layers)] ++ ++ ++ while self._has_unfinished_sequences(this_peer_finished, False, device=input_ids.device): ++ # prepare model inputs ++ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) ++ model_args = [model_kwargs["encoder_outputs"]["last_hidden_state"]] ++ model_args.extend(model_kwargs["past_cross_keys"]) ++ model_args.extend(model_kwargs["past_cross_values"]) ++ model_args.extend(model_inputs["past_keys"]) ++ model_args.extend(model_inputs["past_values"]) ++ model_args.append(model_inputs["attention_mask"]) ++ model_args.append(model_inputs["decoder_input_ids"]) ++ model_args.append(model_inputs["decoder_attention_mask"]) ++ ++ # forward pass to get next token ++ outputs = self(*model_args) ++ outputs = Seq2SeqLMOutput(logits=outputs[0], ++ past_keys=outputs[1], ++ past_values=outputs[2]) ++ ++ # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration ++ # (the clone itself is always small) ++ next_token_logits = outputs.logits[:, -1, :].clone() ++ ++ # pre-process distribution ++ next_token_scores = logits_processor(input_ids, next_token_logits) ++ if do_sample: ++ next_token_scores = logits_warper(input_ids, next_token_scores) ++ ++ # Store scores, attentions and hidden_states when required ++ if return_dict_in_generate: ++ if output_scores: ++ scores += (next_token_scores,) ++ if output_logits: ++ raw_logits += (next_token_logits,) ++ if output_attentions: ++ decoder_attentions += ( ++ (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) ++ ) ++ if self.config.is_encoder_decoder: ++ cross_attentions += (outputs.cross_attentions,) ++ ++ if output_hidden_states: ++ decoder_hidden_states += ( ++ (outputs.decoder_hidden_states,) ++ if self.config.is_encoder_decoder ++ else (outputs.hidden_states,) ++ ) ++ ++ # token selection ++ if do_sample: ++ probs = nn.functional.softmax(next_token_scores, dim=-1) ++ next_tokens = 
torch.multinomial(probs, num_samples=1).squeeze(1) ++ else: ++ next_tokens = torch.argmax(next_token_scores, dim=-1) ++ ++ # finished sentences should have their next token be a padding token ++ if has_eos_stopping_criteria: ++ next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) ++ ++ # update generated ids, model inputs, and length for next step ++ input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) ++ model_kwargs = self._update_model_kwargs_for_generation( ++ outputs, ++ model_kwargs, ++ is_encoder_decoder=self.config.is_encoder_decoder, ++ ) ++ unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) ++ this_peer_finished = unfinished_sequences.max() == 0 ++ # This is needed to properly delete outputs.logits which may be very large for first iteration ++ # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration ++ del outputs ++ return input_ids ++ ++ ++ @property ++ def device(self) -> torch.device: ++ """ ++ `torch.device`: The device on which the module is (assuming that all the module parameters are on the same ++ device). ++ """ ++ return self.get_device() ++ ++ def get_extended_attention_mask( ++ self, attention_mask, input_shape, devic=None, dtype=None ++ ): ++ """ ++ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. ++ ++ Arguments: ++ attention_mask (`torch.Tensor`): ++ Mask with ones indicating tokens to attend to, zeros for tokens to ignore. ++ input_shape (`Tuple[int]`): ++ The shape of the input to the model. ++ ++ Returns: ++ `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. ++ """ ++ if dtype is None: ++ dtype = self.dtype ++ ++ if not (attention_mask.dim() == 2 and self.config.is_decoder): ++ # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder` ++ if device is not None: ++ warnings.warn( ++ "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning ++ ) ++ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] ++ # ourselves in which case we just need to make it broadcastable to all heads. ++ if attention_mask.dim() == 3: ++ extended_attention_mask = attention_mask[:, None, :, :] ++ elif attention_mask.dim() == 2: ++ # Provided a padding mask of dimensions [batch_size, seq_length] ++ # - if the model is a decoder, apply a causal mask in addition to the padding mask ++ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] ++ if self.config.is_decoder: ++ extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( ++ input_shape, attention_mask, device ++ ) ++ else: ++ extended_attention_mask = attention_mask[:, None, None, :] ++ else: ++ raise ValueError( ++ f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ++ ) ++ ++ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for ++ # masked positions, this operation will create a tensor which is 0.0 for ++ # positions we want to attend and the dtype's smallest value for masked positions. ++ # Since we are adding it to the raw scores before the softmax, this is ++ # effectively the same as removing these entirely. 
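The additive-mask trick described in this comment can be seen in isolation below. Note that the patch uses a fixed -1000 instead of `torch.finfo(dtype).min`, presumably to keep fp16 scores well away from overflow; the snippet is a standalone sketch, not part of the patch:

```python
import torch

mask = torch.tensor([[1., 1., 0.]])       # 1 = attend, 0 = masked padding
additive = (1.0 - mask) * -1000           # 0 where we attend, -1000 where we don't
scores = torch.tensor([[0.3, 0.1, 0.9]])  # raw attention scores
weights = torch.softmax(scores + additive, dim=-1)
print(weights)                            # the masked position receives ~0 probability
```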
++ extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility ++ #extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min ++ extended_attention_mask = (1.0 - extended_attention_mask) * -1000 ++ return extended_attention_mask ++ ++ ++ + + @add_start_docstrings( + "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", +@@ -1878,6 +2496,9 @@ class T5EncoderModel(T5PreTrainedModel): + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) ++ self.decoder_mindie = torch.jit.load("encoder_model_path") ++ ++ self.stream = torch.npu.Stream(f"npu:{2}") + + # Initialize weights and apply final processing + self.post_init() +@@ -1966,17 +2587,21 @@ class T5EncoderModel(T5PreTrainedModel): + >>> outputs = model(input_ids=input_ids) + >>> last_hidden_states = outputs.last_hidden_state + ```""" +- return_dict = return_dict if return_dict is not None else self.config.use_return_dict +- +- encoder_outputs = self.encoder( +- input_ids=input_ids, +- attention_mask=attention_mask, +- inputs_embeds=inputs_embeds, +- head_mask=head_mask, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- return_dict=return_dict, +- ) ++ # return_dict = return_dict if return_dict is not None else self.config.use_return_dict ++ # encoder_outputs = self.encoder( ++ # input_ids=input_ids, ++ # attention_mask=attention_mask, ++ # inputs_embeds=inputs_embeds, ++ # head_mask=head_mask, ++ # output_attentions=output_attentions, ++ # output_hidden_states=output_hidden_states, ++ # return_dict=return_dict, ++ # ) ++ attention_mask = attention_mask[:,None,None,:].expand(attention_mask.shape[0],1,attention_mask.shape[1],attention_mask.shape[1]).bool() ++ attention_mask = ~attention_mask ++ with torch.npu.stream(self.stream): # set stream ++ encoder_outputs = self.decoder_mindie.forward(input_ids,attention_mask) ++ self.stream.synchronize() # synchronize + + return encoder_outputs + -- Gitee From 1e09a5b7aa8d4b41ad09bd0e613439006ef0b10d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 06:07:53 +0000 Subject: [PATCH 086/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. 
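Before the SoC-specific patch selection below, here is a minimal usage sketch of the stream pattern the patch relies on for the compiled encoder: the module is loaded with `torch.jit.load`, executed on a dedicated NPU stream, and synchronized before its outputs are read. The file path, shapes, and device id are placeholders, and `import mindietorch` is assumed to be required so the compiled ops are registered at load time:

```python
import torch
import torch_npu
import mindietorch  # assumption: needed to register MindIE-Torch ops before torch.jit.load

device_id = 0
torch.npu.set_device(device_id)
stream = torch.npu.Stream(f"npu:{device_id}")

compiled_encoder = torch.jit.load("./models/encoder/encoder_compiled.pt")  # placeholder path

input_ids = torch.ones(1, 16, dtype=torch.int64).npu()
# inverted 4-D boolean mask, as built in generate(): True = masked position
attention_mask = torch.zeros(1, 1, 16, 16, dtype=torch.bool).npu()

with torch.npu.stream(stream):        # enqueue the compiled forward on the NPU stream
    encoder_outputs = compiled_encoder.forward(input_ids, attention_mask)
stream.synchronize()                  # block until the NPU has produced the outputs

# per the patched T5Stack_Encoder, outputs are (hidden_states, cross_keys, cross_values)
hidden_states = encoder_outputs[0]
```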
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/T5_modeling_t5_patch.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py index e304f4f9f2..21678e06d2 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -16,13 +16,23 @@ import os import transformers -def main(): +def main(args): transformers_path = transformers.__path__ transformers_version = transformers.__version__ assert transformers_version =='4.42.0', "expectation transformers==4.42.0" - os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') + if args.ascend_soc == "Ascend910B4": + os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5_800IA2.py modeling_t5.patch') + elif args.ascend_soc == "Ascend310P3": + os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--ascend_soc", type=str, default="Ascend910B4",required=True) + return args if __name__ == '__main__': - main() + args = parse_args() + main(args) -- Gitee From 0e8fe3e1672c54480b1715ab812f29b2b1ab13f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 06:15:59 +0000 Subject: [PATCH 087/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py index 21678e06d2..4753b3ee0c 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -14,6 +14,7 @@ import os import transformers +import argparse def main(args): -- Gitee From fdc9df000188876398c910aa025f125330b7bfcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 06:18:06 +0000 Subject: [PATCH 088/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py index 4753b3ee0c..43d0caf25e 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -31,6 +31,7 @@ def main(args): def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--ascend_soc", type=str, default="Ascend910B4",required=True) + args = parser.parse_args() return args -- Gitee From 69b484910a92f239c40de01a7ed2494517e95f72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 06:19:49 +0000 Subject: [PATCH 089/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py index 43d0caf25e..0e3c076ca6 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -23,7 +23,7 @@ def main(args): assert transformers_version =='4.42.0', "expectation transformers==4.42.0" if args.ascend_soc == "Ascend910B4": - os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5_800IA2.py modeling_t5.patch') + os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5_800IA2.patch modeling_t5.patch') elif args.ascend_soc == "Ascend310P3": os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') -- Gitee From 79ac0bf58bc66d01c0efd7e8800d74af540f34ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 06:23:12 +0000 Subject: [PATCH 090/110] update MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py index 0e3c076ca6..c6733e6904 100644 --- a/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py +++ b/MindIE/MindIE-Torch/built-in/T5/T5_modeling_t5_patch.py @@ -23,7 +23,7 @@ def main(args): assert transformers_version =='4.42.0', "expectation transformers==4.42.0" if args.ascend_soc == "Ascend910B4": - os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5_800IA2.patch modeling_t5.patch') + os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5_800IA2.patch') elif args.ascend_soc == "Ascend310P3": os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch') -- Gitee From 457414d33d921b087149ef2005ebe39c320cf7e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 24 Sep 2024 06:29:37 +0000 Subject: [PATCH 091/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. 
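Taken together, the fixes above (the `argparse` import, the missing `parser.parse_args()` call, and the corrected patch targets in the two follow-up commits) leave `T5_modeling_t5_patch.py` looking roughly like the sketch below, reconstructed from the diffs; whitespace may differ slightly from the actual file:

```python
import os
import transformers
import argparse


def main(args):
    transformers_path = transformers.__path__
    transformers_version = transformers.__version__
    assert transformers_version == '4.42.0', "expectation transformers==4.42.0"

    if args.ascend_soc == "Ascend910B4":
        os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5_800IA2.patch')
    elif args.ascend_soc == "Ascend310P3":
        os.system(f'patch -p0 {transformers_path[0]}/models/t5/modeling_t5.py modeling_t5.patch')


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--ascend_soc", type=str, default="Ascend910B4", required=True)
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()
    main(args)
```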
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index b677c10796..45792728bd 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -68,11 +68,15 @@ 执行命令: ```bash - python T5_modeling_t5_patch.py + python T5_modeling_t5_patch.py --ascend_soc {Ascend910B4 or Ascend310P3} ``` 4.导出mindietorch模型 +300IDUO卡环境下: ```bash python export_t5.py --output_dir {output_path} --model_path {model_path} --max_batchsize {max_batchsize} --max_input_seq_len {max_input_seq_len} --device_id {device_id} +800IA2卡环境下: + ```bash + python export_t5_800IA2.py --output_dir {output_path} --model_path {model_path} --max_batchsize {max_batchsize} --max_input_seq_len {max_input_seq_len} --device_id {device_id} ``` 参数说明: {output_path}是输出的目录 -- Gitee From 88d86bfb27fafdd590228a064aa5b9bd1eccb172 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Fri, 27 Sep 2024 11:43:13 +0000 Subject: [PATCH 092/110] update MindIE/MindIE-Torch/built-in/T5/export_t5.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/export_t5.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/export_t5.py b/MindIE/MindIE-Torch/built-in/T5/export_t5.py index 995274d0bd..9c67b7c7ef 100644 --- a/MindIE/MindIE-Torch/built-in/T5/export_t5.py +++ b/MindIE/MindIE-Torch/built-in/T5/export_t5.py @@ -50,8 +50,8 @@ class TextEncoderExport(torch.nn.Module): super(TextEncoderExport, self).__init__() self.textencoder_model = textencoder_model - def forward(self, input_ids): - return self.textencoder_model(input_ids=input_ids) + def forward(self, input_ids,attention_mask): + return self.textencoder_model(input_ids=input_ids, attention_mask=attention_mask) class TextDecoderExport(torch.nn.Module): def __init__(self, textdecoder_model): @@ -71,6 +71,7 @@ def export_textencoder(args, model, save_dir, batch_size): if not os.path.exists(traced_path): text_encoder = model.encoder dummy_input = ( + torch.ones([1, 128], dtype=torch.int64).npu(), torch.ones([1, 128], dtype=torch.int64).npu() ) encoder = TextEncoderExport(text_encoder) @@ -81,6 +82,7 @@ def export_textencoder(args, model, save_dir, batch_size): inputs0 = [] inputs0.append(mindietorch.Input(min_shape = (1,1), max_shape= (args.max_batchsize, args.max_input_seq_len), dtype=torch.int64)) + inputs0.append(mindietorch.Input(min_shape = (1,1), max_shape= (args.max_batchsize, args.max_input_seq_len), dtype=torch.int64)) print("compiling encoder") compiled_model = mindietorch.compile( traced_model, -- Gitee From 2db6e6fc1e69aeb9b13e634bbb9be436b5aa4878 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Sun, 29 Sep 2024 01:18:49 +0000 Subject: [PATCH 093/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch index 26b0ce5e87..8923d7b3d4 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch @@ -1,5 +1,5 @@ diff --git a/modeling_t5_origin.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py -index 224769fdf..65c058e6e 100644 +index 224769fdf..8a8f9a23a 100644 --- a/modeling_t5_origin.py +++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py @@ -19,7 +19,7 @@ import math @@ -1186,7 +1186,7 @@ index 224769fdf..65c058e6e 100644 + encoder_kwargs[model_input_name] = inputs_tensor + if self.is_mindie: + with torch.npu.stream(self.stream): # set stream -+ encoder_outputs=self.encoder_mindie.forward(encoder_kwargs["input_ids"]) ++ encoder_outputs=self.encoder_mindie.forward(encoder_kwargs["input_ids"],encoder_kwargs["attention_mask"]) + self.stream.synchronize() # synchronize + else: + encoder_outputs=self.encoder.forward(**encoder_kwargs) -- Gitee From 8296f4f76be970e9d521eb87d5d5141ed29e3753 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Sat, 12 Oct 2024 08:05:19 +0000 Subject: [PATCH 094/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 50 ++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index 45792728bd..95a550f302 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -74,6 +74,7 @@ 300IDUO卡环境下: ```bash python export_t5.py --output_dir {output_path} --model_path {model_path} --max_batchsize {max_batchsize} --max_input_seq_len {max_input_seq_len} --device_id {device_id} + ``` 800IA2卡环境下: ```bash python export_t5_800IA2.py --output_dir {output_path} --model_path {model_path} --max_batchsize {max_batchsize} --max_input_seq_len {max_input_seq_len} --device_id {device_id} @@ -96,4 +97,51 @@ python main.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path {model_path}模型所在目录 {encoder_aie_path}优化后的encoder的模型路径,要具体到.pt文件 {decoder_aie_path}优化后的decoder的模型路径,要具体到.pt文件 -{device_id} 用哪个npu device \ No newline at end of file +{device_id} 用哪个npu device + +6.精度测试 + +6.1 精度验收标准 +数据集:https://github.com/embeddings-benchmark/mteb(英文数据集选一种测试),精度和GPU推理结果对比误差小于1% +6.2 精度测试方法 + +6.2.1安装mteb + + ```bash +pip install mteb +``` +6.2.2 下载mteb数据集(如果机器可以连接外部网络可以跳过这步) +下载链接:https://github.com/embeddings-benchmark/mteb + +6.2.3 修改metb的读取数据集的路径地址(如果机器可以连接外部网络可以跳过这步) +例如如果下载的是Banking77Classification数据集,修改mteb python包里的文件路径,例如 +D:\python3.9\Lib\site-packages\mteb\tasks\Classification\eng\Banking77Classification.py文件里的path路径为6.2.2下载的数据集的路径 + +6.2.4 修改代码 +800IA2卡环境下: +修改transfoermers包下modeling_t5.py下的T5EncoderModel类,将self.decoder_mindie加载路径修改为编译好的encoder的路径 + +300IDUO卡环境下: +修改transfoermers包下modeling_t5.py下的T5EncoderModel类,增加一行,self.decoder_mindie = torch.jit.load("encoder_model_path"),其中encoder_model_path为编译好的encoder的路径,再修改forward接口为 +```bash +with torch.npu.stream(self.stream): # set 
stream + encoder_outputs = self.decoder_mindie.forward(input_ids,attention_mask) +self.stream.synchronize() # synchronize +return encoder_outputs +``` +6.2.5测试代码 + +```bash +import torch + +import mteb +from sentence_transformers import SentenceTransformer + +model_name = "D:\downloads\T5-v2" +model = SentenceTransformer(model_name,model_kwargs={"torch_dtype":torch.float16}) +tasks = mteb.get_tasks(tasks=["CLSClusteringP2P"]) +evaluation = mteb.MTEB(tasks=tasks) +results = evaluation.run(model, output_folder=f"./{model_name}") +``` +6.2.6 结果输出 +会在当前目录输出结果文件 -- Gitee From c1773fd718a61c4355b4f1868aee2889b9727427 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Sat, 12 Oct 2024 08:06:19 +0000 Subject: [PATCH 095/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index 95a550f302..8e8dceb2fe 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -118,6 +118,7 @@ pip install mteb D:\python3.9\Lib\site-packages\mteb\tasks\Classification\eng\Banking77Classification.py文件里的path路径为6.2.2下载的数据集的路径 6.2.4 修改代码 + 800IA2卡环境下: 修改transfoermers包下modeling_t5.py下的T5EncoderModel类,将self.decoder_mindie加载路径修改为编译好的encoder的路径 -- Gitee From 4736ea28dd106afe2c4aa4ceede7810e1a9447f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 14 Oct 2024 10:38:01 +0000 Subject: [PATCH 096/110] update MindIE/MindIE-Torch/built-in/T5/main.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/main.py | 27 ++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/main.py b/MindIE/MindIE-Torch/built-in/T5/main.py index 28d85df24a..6e20f1e05e 100644 --- a/MindIE/MindIE-Torch/built-in/T5/main.py +++ b/MindIE/MindIE-Torch/built-in/T5/main.py @@ -14,29 +14,38 @@ def parse_args(): parser.add_argument("--device_id", type=int, help="NPU device id", default=0) + parser.add_argument("--performance", action="store_true") + args = parser.parse_args() return args def main(): args = parse_args() + torch.npu.set_device(args.device_id) tokenizer = AutoTokenizer.from_pretrained(args.hf_model_path) - text = ["贵州毛台现在多少钱一瓶啊,想买两瓶尝尝味道。", - "能不能帮我买点淇淋,好久没吃了", - "脑子有点胡涂了,这道题冥冥学过还没有做出来"] + text = ["今年2月26日,阿富汗塔里班的最高领秀下令销毁全国范围内所有“非伊斯兰“的古文化遗产,其中包括矗立于巴米扬的世高(大界最约58米)的立式佛像。"] t5_config = T5Config.from_pretrained(args.hf_model_path) + # model = T5ForConditionalGeneration.from_pretrained(args.hf_model_path).half().npu() model = T5ForConditionalGeneration(config=t5_config, encoder_path=args.encoder_aie_path, decoder_path=args.decoder_aie_path, device_id=args.device_id).half().npu() input_ids = tokenizer(text, return_tensors = "pt", padding=True).input_ids - outputs = model.generate(input_ids.npu(),max_new_tokens=24) + if args.performance: + input_ids = torch.randint(0,32000,(1,512)) + outputs = model.generate(input_ids.npu(),max_new_tokens=512) + print("token length : ", input_ids.shape) start_time = time.time() - outputs = model.generate(input_ids.npu(),max_new_tokens=24) - print("time_cost=", time.time()-start_time) - print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) + + outputs = 
model.generate(input_ids.npu(),max_new_tokens=512) + inference_time = time.time()-start_time + print("time_cost=", inference_time) + print("output token length : ", outputs[0].shape[0]) + print("throught output is : ", outputs[0].shape[0] / inference_time) + if not args.performance: + print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) if __name__ == "__main__": - main() - + main() \ No newline at end of file -- Gitee From bc9588c1f908c1e34e6f2753618552e942239900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 14 Oct 2024 10:40:43 +0000 Subject: [PATCH 097/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index 8e8dceb2fe..2be95322d9 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -88,11 +88,16 @@ 运行该命令后会自动生成encoder和decoder优化后的模型 -5.运行 +5.运行与性能测试 +导入环境变量:export TORCH_AIE_NPU_CACHE_MAX_SIZE=32 ```bash python main.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path} --decoder_aie_path {decoder_aie_path} --device_id 2 ``` - +性能测试: + ```bash +python main.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path} --decoder_aie_path {decoder_aie_path} --device_id 2 --performance +``` +打屏可以看到输入长度为512,输出长度为512单batch下的吞吐 参数说明: {model_path}模型所在目录 {encoder_aie_path}优化后的encoder的模型路径,要具体到.pt文件 -- Gitee From 893b55ac9144820e01f6bd6b8a98283b77e4fc59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 28 Oct 2024 05:58:52 +0000 Subject: [PATCH 098/110] update MindIE/MindIE-Torch/built-in/MT5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/MT5/readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/MT5/readme.md b/MindIE/MindIE-Torch/built-in/MT5/readme.md index 3ffa911ed6..f9c6f0ca65 100644 --- a/MindIE/MindIE-Torch/built-in/MT5/readme.md +++ b/MindIE/MindIE-Torch/built-in/MT5/readme.md @@ -16,6 +16,7 @@ # 概述 T5全称是Text-to-Text Transfer Transformer,是一种模型架构或者说是一种解决NLP任务的一种范式。把所有任务,如分类、相似度计算、文本生成都用一个Text-to-text(文本到文本)的框架里进行解决。 + 权重下载:https://huggingface.co/collections/google/t5-release-65005e7c520f8d7b4d037918 ## 输入输出数据 -- Gitee From 7806612346d3b0a0b7f44bd6f477416b9fa195b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 28 Oct 2024 05:59:26 +0000 Subject: [PATCH 099/110] update MindIE/MindIE-Torch/built-in/MT5/readme.md. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/MT5/readme.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/MT5/readme.md b/MindIE/MindIE-Torch/built-in/MT5/readme.md index f9c6f0ca65..96f0c1cb00 100644 --- a/MindIE/MindIE-Torch/built-in/MT5/readme.md +++ b/MindIE/MindIE-Torch/built-in/MT5/readme.md @@ -16,8 +16,7 @@ # 概述 T5全称是Text-to-Text Transfer Transformer,是一种模型架构或者说是一种解决NLP任务的一种范式。把所有任务,如分类、相似度计算、文本生成都用一个Text-to-text(文本到文本)的框架里进行解决。 - 权重下载:https://huggingface.co/collections/google/t5-release-65005e7c520f8d7b4d037918 - + 权重下载:https://huggingface.co/collections/google/mt5-release-65005f1a520f8d7b4d039509 ## 输入输出数据 -- Gitee From 8f4fc16e19840f81019c632371e99a34fb3b2fb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 28 Oct 2024 06:00:06 +0000 Subject: [PATCH 100/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index 2be95322d9..8170a54c3f 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -16,6 +16,7 @@ # 概述 T5的全称为Text to Text Transfer Transformer,是谷歌提出的预训练语言模型领域的通用模型,该模型将所有自然语言问题都转化成文本到文本的形式,并用一个统一的模型解决.T5最核心的理念是:使用前缀任务声明及文本答案生成,统一所有自然语言处理任务的输入和输出。在此之前的几乎所有预训练语言模型,在下游任务微调过程中都需要添加非线性层,将模型的输出转化为任务指定的输出格式。T5不需要对模型做任何改动,只需要提供下游任务的微调数据;不需要添加任何非线性层,唯一需要做的就是在输入数据前加上任务声明前缀.T5将自然语言处理任务都转化成几乎一致的格式,即输入是带有任务前缀声明的文本序列,输出的文本序列是相应任务的结果 +权重下载:https://huggingface.co/collections/google/t5-release-65005e7c520f8d7b4d037918 ## 输入输出数据 -- Gitee From 0d683e3f6791172aaf05b285d7809e44b3e6cc64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Mon, 28 Oct 2024 12:31:32 +0000 Subject: [PATCH 101/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index 8170a54c3f..44feeb7415 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -129,10 +129,10 @@ D:\python3.9\Lib\site-packages\mteb\tasks\Classification\eng\Banking77Classifica 修改transfoermers包下modeling_t5.py下的T5EncoderModel类,将self.decoder_mindie加载路径修改为编译好的encoder的路径 300IDUO卡环境下: -修改transfoermers包下modeling_t5.py下的T5EncoderModel类,增加一行,self.decoder_mindie = torch.jit.load("encoder_model_path"),其中encoder_model_path为编译好的encoder的路径,再修改forward接口为 +修改transfoermers包下modeling_t5.py下的T5EncoderModel类,增加一行,self.encoder_mindie = torch.jit.load("encoder_model_path"),其中encoder_model_path为编译好的encoder的路径,再修改forward接口为 ```bash with torch.npu.stream(self.stream): # set stream - encoder_outputs = self.decoder_mindie.forward(input_ids,attention_mask) + encoder_outputs = self.encoder_mindie.forward(input_ids,attention_mask) self.stream.synchronize() # synchronize return encoder_outputs ``` @@ -140,7 +140,7 @@ return encoder_outputs ```bash import torch - +import mindietorch import mteb from sentence_transformers import SentenceTransformer -- Gitee From 55d4b5867adb9455c6ec8c8cd734393acfad2e42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 29 Oct 2024 06:44:42 +0000 Subject: [PATCH 102/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index 44feeb7415..d7ef3160c2 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -108,7 +108,8 @@ python main.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path 6.精度测试 6.1 精度验收标准 -数据集:https://github.com/embeddings-benchmark/mteb(英文数据集选一种测试),精度和GPU推理结果对比误差小于1% +数据集:(英文数据集选一种测试),精度和GPU推理结果对比误差小于1% + 6.2 精度测试方法 6.2.1安装mteb @@ -129,10 +130,15 @@ D:\python3.9\Lib\site-packages\mteb\tasks\Classification\eng\Banking77Classifica 修改transfoermers包下modeling_t5.py下的T5EncoderModel类,将self.decoder_mindie加载路径修改为编译好的encoder的路径 300IDUO卡环境下: -修改transfoermers包下modeling_t5.py下的T5EncoderModel类,增加一行,self.encoder_mindie = torch.jit.load("encoder_model_path"),其中encoder_model_path为编译好的encoder的路径,再修改forward接口为 +修改transfoermers包下modeling_t5.py下的T5EncoderModel类,增加2行, +```bash +self.encoder_mindie = torch.jit.load("encoder_model_path") +self.stream = torch.npu.Stream(f"npu:{device_id}") +``` +其中encoder_model_path为编译好的encoder的路径,device_id为当前设置的npu卡号,再修改forward接口为 ```bash with torch.npu.stream(self.stream): # set stream - encoder_outputs = self.encoder_mindie.forward(input_ids,attention_mask) + encoder_outputs = self.encoder_mindie.forward(input_ids,attention_mask)[0] self.stream.synchronize() # synchronize return encoder_outputs ``` -- Gitee From d049285be09fac63608dba6f5d92d427cc527a01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 29 Oct 2024 07:40:37 +0000 Subject: [PATCH 103/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index d7ef3160c2..f8b822ef23 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -140,7 +140,7 @@ self.stream = torch.npu.Stream(f"npu:{device_id}") with torch.npu.stream(self.stream): # set stream encoder_outputs = self.encoder_mindie.forward(input_ids,attention_mask)[0] self.stream.synchronize() # synchronize -return encoder_outputs +return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=encoder_outputs) ``` 6.2.5测试代码 -- Gitee From 730e775598128eb71b360e0f9ec35c50ac59887d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Wed, 30 Oct 2024 01:29:50 +0000 Subject: [PATCH 104/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index f8b822ef23..d2c038770d 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -138,9 +138,9 @@ self.stream = torch.npu.Stream(f"npu:{device_id}") 其中encoder_model_path为编译好的encoder的路径,device_id为当前设置的npu卡号,再修改forward接口为 ```bash with torch.npu.stream(self.stream): # set stream - encoder_outputs = self.encoder_mindie.forward(input_ids,attention_mask)[0] + encoder_outputs = self.encoder_mindie.forward(input_ids,attention_mask) self.stream.synchronize() # synchronize -return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=encoder_outputs) +return encoder_outputs ``` 6.2.5测试代码 -- Gitee From 8e7c39921e83718866965117c4ad3f207be91da5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 31 Oct 2024 12:18:49 +0000 Subject: [PATCH 105/110] update MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- .../built-in/T5/modeling_t5.patch | 83 ++++++++++--------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch index 8923d7b3d4..15f81df2a4 100644 --- a/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch +++ b/MindIE/MindIE-Torch/built-in/T5/modeling_t5.patch @@ -1,17 +1,20 @@ -diff --git a/modeling_t5_origin.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py -index 224769fdf..8a8f9a23a 100644 ---- a/modeling_t5_origin.py +diff --git a/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5_origin.py b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py +index 224769f..24f868b 100644 +--- a/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5_origin.py +++ b/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py -@@ -19,7 +19,7 @@ import math +@@ -19,8 +19,10 @@ import math import os import warnings from typing import List, Optional, Tuple, Union - +from dataclasses import dataclass import torch ++import torch_npu ++import mindietorch from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -@@ -28,13 +28,12 @@ from ...activations import ACT2FN + +@@ -28,13 +30,12 @@ from ...activations import ACT2FN from ...modeling_outputs import ( BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, @@ -26,7 +29,7 @@ index 224769fdf..8a8f9a23a 100644 from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( DUMMY_INPUTS, -@@ -47,7 +46,43 @@ from ...utils import ( +@@ -47,7 +48,43 @@ from ...utils import ( ) from ...utils.model_parallel_utils import assert_device_map, get_device_map from .configuration_t5 import T5Config @@ -70,7 +73,7 @@ index 224769fdf..8a8f9a23a 100644 logger = logging.get_logger(__name__) -@@ -448,7 +483,10 @@ class T5Attention(nn.Module): +@@ -448,7 +485,10 @@ class T5Attention(nn.Module): mask=None, key_value_states=None, position_bias=None, @@ -82,7 +85,7 @@ index 224769fdf..8a8f9a23a 100644 layer_head_mask=None, query_length=None, use_cache=False, -@@ -464,12 +502,8 @@ class T5Attention(nn.Module): +@@ -464,12 +504,8 @@ class T5Attention(nn.Module): real_seq_length = seq_length @@ -97,7 +100,7 @@ index 224769fdf..8a8f9a23a 100644 key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] -@@ -493,16 +527,17 @@ class T5Attention(nn.Module): +@@ -493,16 +529,17 @@ class T5Attention(nn.Module): hidden_states = shape(proj_layer(key_value_states)) if past_key_value is not None: @@ -121,7 +124,7 @@ index 224769fdf..8a8f9a23a 100644 else: # cross-attn hidden_states = past_key_value -@@ -513,17 +548,16 @@ class T5Attention(nn.Module): +@@ -513,17 +550,16 @@ class T5Attention(nn.Module): # get key/value states key_states = project( @@ -142,7 +145,7 @@ index 224769fdf..8a8f9a23a 100644 if position_bias is None: if not self.has_relative_attention_bias: position_bias = torch.zeros( -@@ -536,7 +570,7 @@ class T5Attention(nn.Module): +@@ -536,7 +572,7 @@ class T5Attention(nn.Module): # if key and values are already calculated # we want only the last query position bias @@ -151,7 +154,7 @@ index 224769fdf..8a8f9a23a 100644 position_bias = position_bias[:, :, 
-hidden_states.size(1) :, :] if mask is not None: -@@ -548,7 +582,6 @@ class T5Attention(nn.Module): +@@ -548,7 +584,6 @@ class T5Attention(nn.Module): position_bias_masked = position_bias[:, mask.bool()] else: position_bias_masked = position_bias @@ -159,7 +162,7 @@ index 224769fdf..8a8f9a23a 100644 scores += position_bias_masked attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( scores -@@ -564,18 +597,131 @@ class T5Attention(nn.Module): +@@ -564,18 +599,131 @@ class T5Attention(nn.Module): attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) attn_output = self.o(attn_output) @@ -294,7 +297,7 @@ index 224769fdf..8a8f9a23a 100644 self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) -@@ -585,7 +731,8 @@ class T5LayerSelfAttention(nn.Module): +@@ -585,7 +733,8 @@ class T5LayerSelfAttention(nn.Module): attention_mask=None, position_bias=None, layer_head_mask=None, @@ -304,7 +307,7 @@ index 224769fdf..8a8f9a23a 100644 use_cache=False, output_attentions=False, ): -@@ -595,7 +742,8 @@ class T5LayerSelfAttention(nn.Module): +@@ -595,7 +744,8 @@ class T5LayerSelfAttention(nn.Module): mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, @@ -314,7 +317,7 @@ index 224769fdf..8a8f9a23a 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -618,7 +766,8 @@ class T5LayerCrossAttention(nn.Module): +@@ -618,7 +768,8 @@ class T5LayerCrossAttention(nn.Module): attention_mask=None, position_bias=None, layer_head_mask=None, @@ -324,7 +327,7 @@ index 224769fdf..8a8f9a23a 100644 use_cache=False, query_length=None, output_attentions=False, -@@ -630,7 +779,8 @@ class T5LayerCrossAttention(nn.Module): +@@ -630,7 +781,8 @@ class T5LayerCrossAttention(nn.Module): key_value_states=key_value_states, position_bias=position_bias, layer_head_mask=layer_head_mask, @@ -334,7 +337,7 @@ index 224769fdf..8a8f9a23a 100644 use_cache=use_cache, query_length=query_length, output_attentions=output_attentions, -@@ -661,39 +811,34 @@ class T5Block(nn.Module): +@@ -661,39 +813,34 @@ class T5Block(nn.Module): encoder_decoder_position_bias=None, layer_head_mask=None, cross_attn_layer_head_mask=None, @@ -388,7 +391,7 @@ index 224769fdf..8a8f9a23a 100644 # clamp inf values to enable fp16 training if hidden_states.dtype == torch.float16: -@@ -706,22 +851,23 @@ class T5Block(nn.Module): +@@ -706,22 +853,23 @@ class T5Block(nn.Module): do_cross_attention = self.is_decoder and encoder_hidden_states is not None if do_cross_attention: @@ -417,7 +420,7 @@ index 224769fdf..8a8f9a23a 100644 output_attentions=output_attentions, ) hidden_states = cross_attention_outputs[0] -@@ -736,11 +882,9 @@ class T5Block(nn.Module): +@@ -736,11 +884,9 @@ class T5Block(nn.Module): hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) # Combine self attn and cross attn key value states @@ -431,7 +434,7 @@ index 224769fdf..8a8f9a23a 100644 # Apply Feed Forward layer hidden_states = self.layer[-1](hidden_states) -@@ -757,7 +901,7 @@ class T5Block(nn.Module): +@@ -757,7 +903,7 @@ class T5Block(nn.Module): outputs = (hidden_states,) if use_cache: @@ -440,7 +443,7 @@ index 224769fdf..8a8f9a23a 100644 else: outputs = outputs + attention_outputs -@@ -897,11 +1041,15 @@ class T5PreTrainedModel(PreTrainedModel): +@@ -897,11 +1043,15 @@ class T5PreTrainedModel(PreTrainedModel): class T5Stack(T5PreTrainedModel): @@ -457,7 +460,7 @@ index 224769fdf..8a8f9a23a 100644 
self.block = nn.ModuleList( [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] -@@ -966,20 +1114,63 @@ class T5Stack(T5PreTrainedModel): +@@ -966,20 +1116,63 @@ class T5Stack(T5PreTrainedModel): def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings @@ -523,7 +526,7 @@ index 224769fdf..8a8f9a23a 100644 ): # Model parallel if self.model_parallel: -@@ -998,8 +1189,10 @@ class T5Stack(T5PreTrainedModel): +@@ -998,8 +1191,10 @@ class T5Stack(T5PreTrainedModel): f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: @@ -534,7 +537,7 @@ index 224769fdf..8a8f9a23a 100644 elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: -@@ -1012,18 +1205,19 @@ class T5Stack(T5PreTrainedModel): +@@ -1012,18 +1207,19 @@ class T5Stack(T5PreTrainedModel): inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = input_shape @@ -559,7 +562,7 @@ index 224769fdf..8a8f9a23a 100644 if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) -@@ -1054,7 +1248,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1054,7 +1250,8 @@ class T5Stack(T5PreTrainedModel): # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) @@ -569,7 +572,7 @@ index 224769fdf..8a8f9a23a 100644 all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None -@@ -1062,8 +1257,8 @@ class T5Stack(T5PreTrainedModel): +@@ -1062,8 +1259,8 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) @@ -580,7 +583,7 @@ index 224769fdf..8a8f9a23a 100644 layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] # Model parallel -@@ -1112,7 +1307,10 @@ class T5Stack(T5PreTrainedModel): +@@ -1112,7 +1309,10 @@ class T5Stack(T5PreTrainedModel): encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, @@ -592,7 +595,7 @@ index 224769fdf..8a8f9a23a 100644 use_cache=use_cache, output_attentions=output_attentions, ) -@@ -1120,19 +1318,20 @@ class T5Stack(T5PreTrainedModel): +@@ -1120,19 +1320,20 @@ class T5Stack(T5PreTrainedModel): # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: @@ -618,7 +621,7 @@ index 224769fdf..8a8f9a23a 100644 if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) -@@ -1146,7 +1345,7 @@ class T5Stack(T5PreTrainedModel): +@@ -1146,7 +1347,7 @@ class T5Stack(T5PreTrainedModel): hidden_states = hidden_states.to("cuda:" + str(k + 1)) hidden_states = self.final_layer_norm(hidden_states) @@ -627,7 +630,7 @@ index 224769fdf..8a8f9a23a 100644 # Add last layer if output_hidden_states: -@@ -1164,13 +1363,216 @@ class T5Stack(T5PreTrainedModel): +@@ -1164,13 +1365,216 @@ class T5Stack(T5PreTrainedModel): ] if v is not None ) @@ -850,7 +853,7 @@ index 224769fdf..8a8f9a23a 100644 T5_START_DOCSTRING = r""" -@@ -1541,6 +1943,38 @@ class T5Model(T5PreTrainedModel): +@@ -1541,6 +1945,38 @@ class T5Model(T5PreTrainedModel): ) @@ 
-889,7 +892,7 @@ index 224769fdf..8a8f9a23a 100644 @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) class T5ForConditionalGeneration(T5PreTrainedModel): _keys_to_ignore_on_load_unexpected = [ -@@ -1548,28 +1982,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1548,28 +1984,51 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] @@ -960,7 +963,7 @@ index 224769fdf..8a8f9a23a 100644 # Model parallel self.model_parallel = False -@@ -1637,25 +2094,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1637,25 +2096,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) @@ -987,7 +990,7 @@ index 224769fdf..8a8f9a23a 100644 r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., -@@ -1687,113 +2126,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1687,113 +2128,37 @@ class T5ForConditionalGeneration(T5PreTrainedModel): >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) >>> # studies have shown that owning a dog is good for you. ```""" @@ -1126,7 +1129,7 @@ index 224769fdf..8a8f9a23a 100644 attention_mask=None, head_mask=None, decoder_head_mask=None, -@@ -1804,8 +2167,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1804,8 +2169,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): **kwargs, ): # cut decoder_input_ids if past_key_values is used @@ -1137,7 +1140,7 @@ index 224769fdf..8a8f9a23a 100644 # Some generation methods already pass only the last input ID if input_ids.shape[1] > past_length: -@@ -1813,12 +2176,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1813,12 +2178,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): else: # Default to old behavior: keep only final ID remove_prefix_length = input_ids.shape[1] - 1 @@ -1154,7 +1157,7 @@ index 224769fdf..8a8f9a23a 100644 "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "head_mask": head_mask, -@@ -1826,6 +2191,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1826,6 +2193,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): "decoder_attention_mask": decoder_attention_mask, "cross_attn_head_mask": cross_attn_head_mask, "use_cache": use_cache, @@ -1162,7 +1165,7 @@ index 224769fdf..8a8f9a23a 100644 } def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): -@@ -1861,6 +2227,459 @@ class T5ForConditionalGeneration(T5PreTrainedModel): +@@ -1861,6 +2229,459 @@ class T5ForConditionalGeneration(T5PreTrainedModel): reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) return reordered_decoder_past @@ -1622,7 +1625,7 @@ index 224769fdf..8a8f9a23a 100644 @add_start_docstrings( "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", -@@ -1967,7 +2786,6 @@ class T5EncoderModel(T5PreTrainedModel): +@@ -1967,7 +2788,6 @@ class T5EncoderModel(T5PreTrainedModel): >>> last_hidden_states = outputs.last_hidden_state ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict -- Gitee From a06d09d9f5f23604a72340a31b1d048fd2dc8895 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 31 Oct 2024 12:20:27 +0000 Subject: [PATCH 106/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index d2c038770d..49bd28021c 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -146,6 +146,7 @@ return encoder_outputs ```bash import torch +import torch_npu import mindietorch import mteb from sentence_transformers import SentenceTransformer -- Gitee From c69001ff85670380dc7da2d882c40c1a48a1e92a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 31 Oct 2024 13:05:50 +0000 Subject: [PATCH 107/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index 49bd28021c..3423c7c155 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -150,7 +150,7 @@ import torch_npu import mindietorch import mteb from sentence_transformers import SentenceTransformer - +torch.npu.set_device(0) model_name = "D:\downloads\T5-v2" model = SentenceTransformer(model_name,model_kwargs={"torch_dtype":torch.float16}) tasks = mteb.get_tasks(tasks=["CLSClusteringP2P"]) -- Gitee From 3df3f5bd0dec297e8b768e99d2b6ab690b408cfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Thu, 5 Dec 2024 09:29:52 +0000 Subject: [PATCH 108/110] update MindIE/MindIE-Torch/built-in/MT5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/MT5/readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/MT5/readme.md b/MindIE/MindIE-Torch/built-in/MT5/readme.md index 96f0c1cb00..3d5e155a81 100644 --- a/MindIE/MindIE-Torch/built-in/MT5/readme.md +++ b/MindIE/MindIE-Torch/built-in/MT5/readme.md @@ -84,6 +84,7 @@ 运行该命令后会自动生成encoder和decoder优化后的模型 5.精度测试 +sentense-transformers版本必须是3.1.1 ```bash python test_mt5.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path} --decoder_aie_path {decoder_aie_path} --device_id device_id ``` -- Gitee From 3fa2f9067a94978f7e484c03400f68b15072ba63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 10 Dec 2024 06:17:48 +0000 Subject: [PATCH 109/110] update MindIE/MindIE-Torch/built-in/MT5/readme.md. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/MT5/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/MT5/readme.md b/MindIE/MindIE-Torch/built-in/MT5/readme.md index 3d5e155a81..4862823913 100644 --- a/MindIE/MindIE-Torch/built-in/MT5/readme.md +++ b/MindIE/MindIE-Torch/built-in/MT5/readme.md @@ -84,7 +84,7 @@ 运行该命令后会自动生成encoder和decoder优化后的模型 5.精度测试 -sentense-transformers版本必须是3.1.1 + ```bash python test_mt5.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path} --decoder_aie_path {decoder_aie_path} --device_id device_id ``` -- Gitee From 2354286269b76e811e9422137267499cf0480909 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=84=E6=96=87=E5=A5=87?= Date: Tue, 10 Dec 2024 06:19:48 +0000 Subject: [PATCH 110/110] update MindIE/MindIE-Torch/built-in/T5/readme.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 甄文奇 --- MindIE/MindIE-Torch/built-in/T5/readme.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/T5/readme.md b/MindIE/MindIE-Torch/built-in/T5/readme.md index 3423c7c155..a8cd519940 100644 --- a/MindIE/MindIE-Torch/built-in/T5/readme.md +++ b/MindIE/MindIE-Torch/built-in/T5/readme.md @@ -112,9 +112,10 @@ python main.py --hf_model_path {model_path} --encoder_aie_path {encoder_aie_path 6.2 精度测试方法 -6.2.1安装mteb +6.2.1安装mteb和sentence-transformers ```bash +pip install sentence-transformers==3.1.1 pip install mteb ``` 6.2.2 下载mteb数据集(如果机器可以连接外部网络可以跳过这步) -- Gitee
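For reference, the accuracy-test snippet that the readme patches above assemble step by step (the torch_npu/mindietorch imports, the torch.npu.set_device call, and the sentence-transformers 3.1.1 pin) can be collected into one script. This is a minimal sketch only: the checkpoint path, NPU device id, MTEB task name, and output folder are placeholders, and it assumes the modeling_t5.py edit from step 6.2.4 has already been applied so that T5EncoderModel routes its forward pass through the compiled MindIE encoder.

```python
# Consolidated accuracy-test sketch.
# Assumptions: checkpoint path, NPU id, task name and output folder are placeholders;
# mteb and sentence-transformers==3.1.1 are installed; modeling_t5.py has been
# modified per step 6.2.4 so T5EncoderModel calls the compiled MindIE encoder.
import torch
import torch_npu      # registers the Ascend NPU backend with torch
import mindietorch    # needed so the compiled MindIE-Torch encoder can be deserialized
import mteb
from sentence_transformers import SentenceTransformer

torch.npu.set_device(0)  # NPU card id; match the device used when compiling the encoder

model_path = "/path/to/t5-checkpoint"  # placeholder: local model directory
model = SentenceTransformer(model_path, model_kwargs={"torch_dtype": torch.float16})

# Any task from the MTEB benchmark can be substituted here; the readme example uses CLSClusteringP2P.
tasks = mteb.get_tasks(tasks=["CLSClusteringP2P"])
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder="./mteb_results")  # result files are written to this folder
```

As described in step 6.2.6, the evaluation writes its result files to the output_folder passed to evaluation.run, and those scores can then be compared against the GPU baseline under the 1% tolerance from section 6.1.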