From 56a289314972b92ee146dc24454ff38aaf598390 Mon Sep 17 00:00:00 2001 From: Clark Chin Date: Fri, 29 Mar 2024 14:08:05 +0800 Subject: [PATCH 01/25] add gaudi modeling support in itrex Signed-off-by: Clark Chin --- .../multi_modal/eval/mmmu_eval/run_llava.py | 2 +- .../neural_chat/models/model_utils.py | 2 +- .../modeling/modeling_gaudi/__init__.py | 1 + .../modeling_gaudi/generation/__init__.py | 6 + .../generation/configuration_utils.py | 54 + .../generation/stopping_criteria.py | 46 + .../modeling_gaudi/generation/utils.py | 3301 +++++++++++++++++ .../modeling/modeling_gaudi/modeling_utils.py | 365 ++ .../modeling_gaudi/models/__init__.py | 149 + .../modeling_gaudi/models/albert/__init__.py | 1 + .../models/albert/modeling_albert.py | 101 + .../modeling_gaudi/models/bart/__init__.py | 11 + .../models/bart/modeling_bart.py | 801 ++++ .../modeling_gaudi/models/blip/__init__.py | 10 + .../models/blip/modeling_blip.py | 124 + .../models/blip/modeling_blip_text.py | 538 +++ .../modeling_gaudi/models/bloom/__init__.py | 9 + .../models/bloom/modeling_bloom.py | 612 +++ .../modeling_gaudi/models/codegen/__init__.py | 6 + .../models/codegen/modeling_codegen.py | 420 +++ .../modeling_gaudi/models/esm/__init__.py | 6 + .../models/esm/modeling_esmfold.py | 347 ++ .../modeling_gaudi/models/falcon/__init__.py | 7 + .../models/falcon/modeling_falcon.py | 679 ++++ .../modeling_gaudi/models/gpt2/__init__.py | 1 + .../models/gpt2/modeling_gpt2.py | 573 +++ .../models/gpt_bigcode/__init__.py | 6 + .../gpt_bigcode/modeling_gpt_bigcode.py | 494 +++ .../models/gpt_neox/__init__.py | 7 + .../models/gpt_neox/modeling_gpt_neox.py | 429 +++ .../modeling_gaudi/models/gptj/__init__.py | 6 + .../models/gptj/modeling_gptj.py | 530 +++ .../modeling_gaudi/models/llama/__init__.py | 11 + .../models/llama/modeling_llama.py | 1029 +++++ .../modeling_gaudi/models/mistral/__init__.py | 7 + .../models/mistral/modeling_mistral.py | 687 ++++ .../modeling_gaudi/models/mixtral/__init__.py | 8 + .../models/mixtral/modeling_mixtral.py | 718 ++++ .../models/modeling_all_models.py | 171 + .../models/modeling_attn_mask_utils.py | 106 + .../modeling_gaudi/models/mpt/__init__.py | 6 + .../modeling_gaudi/models/mpt/modeling_mpt.py | 384 ++ .../modeling_gaudi/models/opt/__init__.py | 8 + .../modeling_gaudi/models/opt/modeling_opt.py | 537 +++ .../modeling_gaudi/models/phi/__init__.py | 6 + .../modeling_gaudi/models/phi/modeling_phi.py | 475 +++ .../models/speecht5/__init__.py | 7 + .../models/speecht5/modeling_speecht5.py | 552 +++ .../modeling_gaudi/models/swin/__init__.py | 1 + .../models/swin/modeling_swin.py | 52 + .../modeling_gaudi/models/t5/__init__.py | 9 + .../modeling_gaudi/models/t5/modeling_t5.py | 636 ++++ .../modeling_gaudi/models/vit/__init__.py | 1 + .../modeling_gaudi/models/vit/modeling_vit.py | 61 + .../models/wav2vec2/__init__.py | 8 + .../models/wav2vec2/modeling_wav2vec2.py | 376 ++ 56 files changed, 15498 insertions(+), 2 deletions(-) create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/stopping_criteria.py create mode 100755 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py create mode 100644 
intel_extension_for_transformers/transformers/modeling/modeling_gaudi/modeling_utils.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/albert/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/albert/modeling_albert.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/modeling_bart.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip_text.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/modeling_codegen.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/modeling_esmfold.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/modeling_gpt2.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/modeling_gpt_neox.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/__init__.py create mode 100755 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/__init__.py create mode 100644 
intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_all_models.py create mode 100755 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_attn_mask_utils.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/modeling_mpt.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/modeling_opt.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/modeling_phi.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/modeling_swin.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/vit/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/vit/modeling_vit.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/__init__.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py diff --git a/intel_extension_for_transformers/neural_chat/examples/finetuning/multi_modal/eval/mmmu_eval/run_llava.py b/intel_extension_for_transformers/neural_chat/examples/finetuning/multi_modal/eval/mmmu_eval/run_llava.py index cb4080b7457..cf84987a809 100644 --- a/intel_extension_for_transformers/neural_chat/examples/finetuning/multi_modal/eval/mmmu_eval/run_llava.py +++ b/intel_extension_for_transformers/neural_chat/examples/finetuning/multi_modal/eval/mmmu_eval/run_llava.py @@ -17,7 +17,7 @@ from optimum.habana.transformers.generation.utils import MODELS_OPTIMIZED_WITH_STATIC_SHAPES if "llava" not in MODELS_OPTIMIZED_WITH_STATIC_SHAPES: MODELS_OPTIMIZED_WITH_STATIC_SHAPES.append("llava") -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi +from intel_extension_for_transformers.transformers.modeling.modeling_gaudi import adapt_transformers_to_gaudi adapt_transformers_to_gaudi() import torch diff --git a/intel_extension_for_transformers/neural_chat/models/model_utils.py b/intel_extension_for_transformers/neural_chat/models/model_utils.py index 38005429e23..a55d4da8b25 100644 --- a/intel_extension_for_transformers/neural_chat/models/model_utils.py +++ b/intel_extension_for_transformers/neural_chat/models/model_utils.py @@ -458,7 +458,7 @@ def load_model( # Tweak generation so that it runs faster 
on Gaudi
         # pylint: disable=E0401
         # pylint: disable=E0611
-        from optimum.habana.transformers.modeling_utils import (
+        from intel_extension_for_transformers.transformers.modeling.modeling_gaudi import (
             adapt_transformers_to_gaudi,
         )
diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/__init__.py
new file mode 100644
index 00000000000..d6539dee1f5
--- /dev/null
+++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/__init__.py
@@ -0,0 +1 @@
+from .modeling_utils import adapt_transformers_to_gaudi
diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/__init__.py
new file mode 100644
index 00000000000..15f567b0be4
--- /dev/null
+++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/__init__.py
@@ -0,0 +1,6 @@
+from .configuration_utils import GaudiGenerationConfig
+from .stopping_criteria import (
+    gaudi_MaxLengthCriteria_call,
+    gaudi_MaxNewTokensCriteria_call,
+)
+from .utils import MODELS_OPTIMIZED_WITH_STATIC_SHAPES, GaudiGenerationMixin
diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py
new file mode 100644
index 00000000000..e75e48a7c7f
--- /dev/null
+++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py
@@ -0,0 +1,54 @@
+from transformers.generation import GenerationConfig
+
+
+class GaudiGenerationConfig(GenerationConfig):
+    """
+    This class extends [`transformers.generation.GenerationConfig`](https://github.com/huggingface/transformers/blob/main/src/transformers/generation/configuration_utils.py)
+    to add HPU-specific arguments for generation.
+
+    Args:
+        trim_logits (`bool`, *optional*):
+            Calculate logits only for the last token to save memory in the first step.
+        static_shapes (`bool`, *optional*):
+            Whether to use static shapes for generation or not. It will run faster on HPUs with static shapes
+            but not all models support it. If not specified, it will automatically be set to `True` if the given
+            model supports it.
+        ignore_eos (`bool`, *optional*):
+            Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode).
+            If not specified, it will automatically be set to `True` if lazy mode is on.
+        attn_softmax_bf16 (`bool`, *optional*):
+            Whether to run attention softmax layer in lower precision provided that the model supports it and
+            is also running in lower precision.
+        limit_hpu_graphs (`bool`, *optional*):
+            Skip HPU Graph usage for first token to save memory.
+        reuse_cache (`bool`, *optional*):
+            Whether to reuse key/value cache for decoding. It should save memory.
+        bucket_size (`int`, *optional*):
+            If negative (default=-1), pad to max if `static_shapes` is set. Else start with
+            `shape = bucket_size * ceil(prompt_len/bucket_size)` and then grow space by `bucket_size` when needed.
+            Only active if `static_shapes` is used. Can't be used with `reuse_cache`.
+        bucket_internal (`bool`, *optional*):
+            Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large.
+ kv_cache_fp8 (`bool`, *optional*): + Store kv-cache in float8 when kv-cache is used + use_flash_attention (`bool`, *optional*): + Whether to use flash attention optimization. + flash_attention_recompute (`bool`, *optional*): + Whether to enable recompute if use Habana flash attention. + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.trim_logits = kwargs.get("trim_logits", None) + self.static_shapes = kwargs.get("static_shapes", None) + self.ignore_eos = kwargs.get("ignore_eos", None) + self.attn_softmax_bf16 = kwargs.get("attn_softmax_bf16", None) + self.limit_hpu_graphs = kwargs.get("limit_hpu_graphs", None) + self.reuse_cache = kwargs.get("reuse_cache", None) + self.bucket_size = kwargs.get("bucket_size", -1) + self.bucket_internal = kwargs.get("bucket_internal", None) + self.reduce_recompile = kwargs.get("reduce_recompile", None) + self.kv_cache_fp8 = kwargs.get("kv_cache_fp8", None) + self.use_flash_attention = kwargs.get("use_flash_attention", None) + self.flash_attention_recompute = kwargs.get("flash_attention_recompute", None) + self.use_fused_rope = kwargs.get("use_fused_rope", None) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/stopping_criteria.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/stopping_criteria.py new file mode 100644 index 00000000000..4c6eedae615 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/stopping_criteria.py @@ -0,0 +1,46 @@ +# coding=utf-8 +# Copyright 2022 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from optimum.utils import logging + + +logger = logging.get_logger(__name__) + + +def gaudi_MaxLengthCriteria_call(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: + token_idx = kwargs.get("token_idx", None) + if token_idx is not None: + return token_idx >= self.max_length + else: + cur_len = input_ids.shape[-1] + is_done = cur_len >= self.max_length + if self.max_position_embeddings is not None and not is_done and cur_len >= self.max_position_embeddings: + logger.warning_once( + "This is a friendly reminder - the current text generation call will exceed the model's predefined " + f"maximum length ({self.max_position_embeddings}). Depending on the model, you may observe " + "exceptions, performance degradation, or nothing at all." 
+ ) + return is_done + + +def gaudi_MaxNewTokensCriteria_call(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: + token_idx = kwargs.get("token_idx", None) + if token_idx is not None: + return token_idx >= self.max_length + else: + return input_ids.shape[-1] >= self.max_length diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py new file mode 100755 index 00000000000..faec4696cd7 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py @@ -0,0 +1,3301 @@ +# coding=utf-8 +# Copyright 2022 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import inspect +import math +import warnings +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +import torch.distributed as dist +from transformers.generation.beam_constraints import DisjunctiveConstraint, PhrasalConstraint +from transformers.generation.beam_search import BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer +from transformers.generation.candidate_generator import CandidateGenerator +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import ( + StoppingCriteriaList, + validate_stopping_criteria, +) +from transformers.generation.utils import ( + NEED_SETUP_CACHE_CLASSES_MAPPING, + GenerateBeamDecoderOnlyOutput, + GenerateBeamEncoderDecoderOutput, + GenerateBeamOutput, + GenerateDecoderOnlyOutput, + GenerateEncoderDecoderOutput, + GenerateNonBeamOutput, + GenerateOutput, + GenerationMixin, + GenerationMode, + _split_model_inputs, + stack_model_outputs, +) +from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +from transformers.utils import ModelOutput + +from optimum.utils import logging + +from optimum.habana.utils import HabanaProfile +from optimum.habana.transformers.integrations.deepspeed import unwrap_deepspeed_model +from .configuration_utils import GaudiGenerationConfig + + +if TYPE_CHECKING: + from transformers import PreTrainedModel + + from .streamers import BaseStreamer + + +MODELS_OPTIMIZED_WITH_STATIC_SHAPES = [ + "bloom", + "gpt2", + "opt", + "gptj", + "gpt_neox", + "llama", + "falcon", + "codegen", + "gpt_bigcode", + "bart", + "mpt", + "t5", + "mistral", + "phi", + "mixtral", + "blip_text_model", +] + + +logger = logging.get_logger(__name__) + + +def incrementor(bucket_size, prompt_len): + assert bucket_size > 0 + passnum = -1 + while True: + passnum += 1 + if passnum == 0: + token_idx = prompt_len + allocated_space = int(math.ceil(prompt_len / bucket_size) * bucket_size) + if prompt_len % bucket_size == 0: + allocated_space += bucket_size + need_expansion = True + else: + token_idx += 1 + 
need_expansion = token_idx >= allocated_space + if need_expansion: + assert (allocated_space - token_idx) <= bucket_size + allocated_space += bucket_size + yield { + "allocated_space": allocated_space, + "passnum": passnum, + "token_idx": token_idx, + "need_expansion": need_expansion, + } + + +class GaudiGenerationMixin(GenerationMixin): + """ + This class enables to perform fast generation in lazy mode and with HPU graphs. + The only difference with GenerationMixin is that the various generation + methods will generate sequences whose size is max_length. Having constant + sizes allows to make the most of lazy mode and HPU graphs. + """ + + @staticmethod + def _expand_inputs_for_generation( + expand_size: int = 1, + is_encoder_decoder: bool = False, + input_ids: Optional[torch.LongTensor] = None, + **model_kwargs, + ) -> Tuple[torch.LongTensor, Dict[str, Any]]: + """ + Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]. + + Copied from Transformers: https://github.com/huggingface/transformers/blob/527ab894e59b6582578008e3b47648a65063f73d/src/transformers/generation/utils.py#L704 + The tensor `token_idx` is not expanded. + """ + + def _expand_dict_for_generation(dict_to_expand): + for key in dict_to_expand: + if ( + dict_to_expand[key] is not None + and key != "token_idx" + and key != "decoder_input_ids" + and isinstance(dict_to_expand[key], torch.Tensor) + ): + dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0) + return dict_to_expand + + if input_ids is not None: + input_ids = input_ids.repeat_interleave(expand_size, dim=0) + + model_kwargs = _expand_dict_for_generation(model_kwargs) + + if is_encoder_decoder: + if model_kwargs.get("encoder_outputs") is None: + raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.") + model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"]) + + return input_ids, model_kwargs + + def _get_hpu_graphs_kwargs(self, model_kwargs): + hpu_graphs_kwargs = {} + if model_kwargs["limit_hpu_graphs"]: + hpu_graphs_kwargs.update({"bypass_hpu_graphs": False}) + if "first_token" not in model_kwargs.keys(): + model_kwargs["first_token"] = True + hpu_graphs_kwargs.update({"bypass_hpu_graphs": True}) + return hpu_graphs_kwargs + + def _prepare_decoder_attention_mask( + self, + max_steps: int, # current stopping criteria + batch_size: int, + pad_token_id: int, + device: str, + dtype: str = bool, + ) -> torch.Tensor: + x = torch.zeros((batch_size, max_steps), device=device, dtype=dtype) + return x.index_fill(1, torch.tensor([0]), 1) # First the position with pad_token_id + + def _prepare_decoder_input_ids_for_generation( + self, + batch_size: int, + model_input_name: str, + model_kwargs: Dict[str, torch.Tensor], + decoder_start_token_id: Union[int, List[int]] = None, + bos_token_id: int = None, + device: torch.device = None, + max_new_tokens: int = None, + ) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]: + """Prepares `decoder_input_ids` for generation with encoder-decoder models""" + # 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming, + # we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input. 
+ + if model_kwargs is not None and "decoder_input_ids" in model_kwargs: + decoder_input_ids = model_kwargs.pop("decoder_input_ids") + elif "input_ids" in model_kwargs and model_input_name != "input_ids": + decoder_input_ids = model_kwargs.pop("input_ids") + else: + decoder_input_ids = None + + token_idx = model_kwargs.get("token_idx", None) + + # 2. Encoder-decoder models expect the `decoder_input_ids` to start with a special token. Let's ensure that. + decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id) + if device is None: + device = self.device + if token_idx is None: + if isinstance(decoder_start_token_id, list): + if len(decoder_start_token_id) != batch_size: + raise ValueError( + f"`decoder_start_token_id` expcted to have length {batch_size} but got {len(decoder_start_token_id)}" + ) + decoder_input_ids_start = torch.tensor(decoder_start_token_id, dtype=torch.long, device=device) + decoder_input_ids_start = decoder_input_ids_start.view(-1, 1) + else: + decoder_input_ids_start = ( + torch.ones((batch_size, 1), dtype=torch.long, device=device) * decoder_start_token_id + ) + else: + # creating padded decoder_input_ids to achieve static shapes. Later new tokens once generated are copied in to decoder_input_ids based on token_idx + max_length = max_new_tokens + 1 if max_new_tokens is not None else self.generation_config.max_length + decoder_input_ids_start = ( + torch.ones((batch_size, max_length), dtype=torch.long, device=device) * decoder_start_token_id + ) + + # no user input -> use decoder_start_token_id as decoder_input_ids + if decoder_input_ids is None: + decoder_input_ids = decoder_input_ids_start + # exception: Donut checkpoints have task-specific decoder starts and don't expect a BOS token + elif self.config.model_type == "vision-encoder-decoder" and "donut" in self.name_or_path.lower(): + pass + elif self.config.model_type in ["whisper"]: + pass + # user input but doesn't start with decoder_start_token_id -> prepend decoder_start_token_id (and adjust + # decoder_attention_mask if provided) + elif ( + isinstance(decoder_start_token_id, int) + and (decoder_input_ids[:, 0] != decoder_start_token_id).all().item() + ) or ( + isinstance(decoder_start_token_id, torch.Tensor) + and (decoder_input_ids[:, 0] != decoder_start_token_id[:, 0]).all().item() + ): + decoder_input_ids = torch.cat([decoder_input_ids_start, decoder_input_ids], dim=-1) + if "decoder_attention_mask" in model_kwargs: + decoder_attention_mask = model_kwargs["decoder_attention_mask"] + decoder_attention_mask = torch.cat( + (torch.ones_like(decoder_attention_mask)[:, :1], decoder_attention_mask), + dim=-1, + ) + model_kwargs["decoder_attention_mask"] = decoder_attention_mask + return decoder_input_ids, model_kwargs + + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + ) -> Dict[str, Any]: + """ + Copied from Transformers: https://github.com/huggingface/transformers/blob/527ab894e59b6582578008e3b47648a65063f73d/src/transformers/generation/utils.py#L745 + + Adds support for `token_idx`, which is necessary for using static shapes. 
+ """ + # mark to identify starting from second token + model_kwargs["first_token"] = False + # update past_key_values + model_kwargs["past_key_values"] = self._extract_past_from_model_output( + outputs, standardize_cache_format=standardize_cache_format + ) + if getattr(outputs, "state", None) is not None: + model_kwargs["state"] = outputs.state + + # update token_type_ids with last value + if "token_type_ids" in model_kwargs: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1) + + token_idx = model_kwargs.get("token_idx", None) + + if not is_encoder_decoder: + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + if token_idx is not None: + attention_mask.index_fill_(1, token_idx, 1) + else: + attention_mask = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + model_kwargs["attention_mask"] = attention_mask + else: + # update decoder attention mask + if "decoder_attention_mask" in model_kwargs: + decoder_attention_mask = model_kwargs["decoder_attention_mask"] + if token_idx is not None: + decoder_attention_mask.index_fill_(1, token_idx, 1) + else: + decoder_attention_mask = torch.cat( + [ + decoder_attention_mask, + decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1)), + ], + dim=-1, + ) + model_kwargs["decoder_attention_mask"] = decoder_attention_mask + + if token_idx is not None: + token_idx.add_(1) + if "token_idx_cpu" in model_kwargs: + model_kwargs["token_idx_cpu"] += 1 + + return model_kwargs + + @torch.no_grad() + def update_model_kwargs_for_bucketing( + self, params, input_ids, model_kwargs, pad_token_id, bucket_size, reduce_recompile=False + ): + if params["need_expansion"]: + # Pad inputs to have static shapes during generation, this gives better performance than dynamic shapes on HPUs + pad_amount = params["allocated_space"] - input_ids.shape[-1] + input_ids = torch.nn.functional.pad(input_ids, (0, pad_amount), value=pad_token_id) + if model_kwargs["attention_mask"] is not None: + model_kwargs["attention_mask"] = torch.nn.functional.pad( + model_kwargs["attention_mask"], (0, pad_amount), value=0 + ) + else: + assert False, "Not tested for cases where attn_mask isnt passed" + if reduce_recompile and params["passnum"] == 0: + position_ids_cpu = model_kwargs["attention_mask"].long().cumsum(-1) - 1 + position_ids_cpu.masked_fill_(model_kwargs["attention_mask"] == 0, 1) + input_ids = input_ids.to(self.device) + model_kwargs["attention_mask"] = model_kwargs["attention_mask"].to(self.device) + + if "past_key_values" in model_kwargs: + + def create_pad_arg(pad_amount, i, j): + if model_kwargs["past_key_values"][0][0].dim() == 3: + assert self.config.model_type == "bloom" + if j == 0: + return (0, pad_amount) + elif j == 1: + return (0, 0, 0, pad_amount) + else: + assert False + elif model_kwargs["past_key_values"][0][0].dim() == 4: + return (0, 0, 0, pad_amount) # llama, falcon + else: + assert False, "Unknown case, please handle, or dont use bucketing" + + new_kv = [None for i in range(len(model_kwargs["past_key_values"]))] + for i in range(len(model_kwargs["past_key_values"])): + tmp_lst = [None for j in range(len(model_kwargs["past_key_values"][i]))] + for j in range(len(model_kwargs["past_key_values"][i])): + pad_tuple = create_pad_arg(pad_amount, i, j) + # Different models might have different shapes of kv-cache + # create_pad_arg handles them on a 
per-model basis + # This is a necessary (but not sufficient) condition: what ever dimension we are padding, should be a multiple of bucket_size + # This check is added in case we get a new model with a new kv-cache structure, and we attempt to pad some wrong dimension + assert model_kwargs["past_key_values"][i][j].shape[-(len(pad_tuple) // 2)] % bucket_size == 0 + tmp_lst[j] = torch.nn.functional.pad( + model_kwargs["past_key_values"][i][j], pad_tuple, value=pad_token_id + ) + new_kv[i] = tuple(tmp_lst) + model_kwargs["past_key_values"] = tuple(new_kv) + + if "token_idx" not in model_kwargs: + model_kwargs["token_idx"] = torch.tensor(params["token_idx"], device=self.device) + return input_ids, model_kwargs + + @torch.no_grad() + def generate( + self, + inputs: Optional[torch.Tensor] = None, + generation_config: Optional[GaudiGenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + synced_gpus: Optional[bool] = None, + assistant_model: Optional["PreTrainedModel"] = None, + streamer: Optional["BaseStreamer"] = None, + negative_prompt_ids: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, + lazy_mode: Optional[bool] = False, + hpu_graphs: Optional[bool] = False, + profiling_warmup_steps: Optional[int] = 0, + profiling_steps: Optional[int] = 0, + **kwargs, + ) -> Union[GenerateOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head. + + + + Most generation-controlling parameters are set in [`transformers.generation.generation_config`] which, if not passed, will be set to the + model's default generation configuration. You can override any `generation_config` by passing the corresponding + parameters to generate, e.g. `.generate(inputs, num_beams=4, do_sample=True)`. + + For an overview of generation strategies and code examples, check out the [following + guide](../generation_strategies). + + + + Most of these parameters are explained in more detail in [this blog + post](https://huggingface.co/blog/how-to-generate). + Parameters: + inputs (`torch.Tensor` of varying shape depending on the modality, *optional*): + The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the + method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs` + should of in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of + `input_ids`, `input_values`, `input_features`, or `pixel_values`. + generation_config (`transformers.generation.GenerationConfig`, *optional*): + The generation configuration to be used as base parametrization for the generation call. `**kwargs` + passed to generate matching the attributes of `generation_config` will override them. If + `generation_config` is not provided, the default will be used, which had the following loading + priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model + configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s + default values, whose documentation should be checked to parameterize generation. + logits_processor (`LogitsProcessorList`, *optional*): + Custom logits processors that complement the default logits processors built from arguments and + generation config. 
If a logit processor is passed that is already created with the arguments or a + generation config an error is thrown. This feature is intended for advanced users. + stopping_criteria (`StoppingCriteriaList`, *optional*): + Custom stopping criteria that complement the default stopping criteria built from arguments and a + generation config. If a stopping criteria is passed that is already created with the arguments or a + generation config an error is thrown. If your stopping criteria depends on the `scores` input, make + sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is + intended for advanced users. + prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*): + If provided, this function constraints the beam search to allowed tokens only at each step. If not + provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and + `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned + on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful + for constrained generation conditioned on the prefix, as described in [Autoregressive Entity + Retrieval](https://arxiv.org/abs/2010.00904). + synced_gpus (`bool`, *optional*): + Whether to continue running the while loop until max_length. Unless overridden this flag will be set to + `True` under DeepSpeed ZeRO Stage 3 multiple GPUs environment to avoid hanging if one GPU finished + generating before other GPUs. Otherwise it'll be set to `False`. + assistant_model (`PreTrainedModel`, *optional*): + An assistant model that can be used to accelerate generation. The assistant model must have the exact + same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistent model + is much faster than running generation with the model you're calling generate from. As such, the + assistant model should be much smaller. + streamer (`BaseStreamer`, *optional*): + Streamer object that will be used to stream the generated sequences. Generated tokens are passed + through `streamer.put(token_ids)` and the streamer is responsible for any further processing. + negative_prompt_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + The negative prompt needed for some processors such as CFG. The batch size must match the input batch + size. This is an experimental feature, subject to breaking API changes in future versions. + negative_prompt_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Attention_mask for `negative_prompt_ids`. + lazy_mode (`bool`, *optional*, defaults to `False`): + Whether the run is executed in lazy mode or not (i.e. eager mode). + hpu_graphs (`bool`, *optional*, defaults to `False`): + Whether to use HPU graphs for inference. + profiling_warmup_steps (`int`, *optional*, defaults to 0): + Number of steps to ignore for profling. + profiling_steps (`int`, *optional*, defaults to 0): + Number of steps to be captured when enabling profiling. + kwargs (`Dict[str, Any]`, *optional*): + Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be + forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder + specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*. 
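As an illustration of the Gaudi-specific arguments documented above, a call could look like the sketch below. This is not part of the patch: `model` and `tokenizer` are assumed to be an already-loaded HPU model and its tokenizer, and the flag values are only examples; `hpu_graphs=True` is only accepted together with `lazy_mode=True`.

    outputs = model.generate(
        **tokenizer("Hello from Gaudi", return_tensors="pt").to(model.device),
        max_new_tokens=32,
        lazy_mode=True,            # lazy execution; lets the generation loop insert mark_step() calls
        hpu_graphs=True,           # use HPU graphs for inference; requires lazy_mode=True
        profiling_warmup_steps=2,  # the profiler ignores the first 2 steps
        profiling_steps=4,         # the profiler captures the next 4 steps
    )
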
+ + Return: + [`transformers.utils.ModelOutput`] or `torch.LongTensor`: A [`transformers.generationutils.ModelOutput`] (if `return_dict_in_generate=True` + or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. + If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible + [`transformers.generationutils.ModelOutput`] types are: + - [`transformers.generation.GenerateDecoderOnlyOutput`], + - [`transformers.generation.GenerateBeamDecoderOnlyOutput`] + If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible + [`transformers.generationutils.ModelOutput`] types are: + - [`transformers.generation.GenerateEncoderDecoderOutput`], + - [`transformers.generation.GenerateBeamEncoderDecoderOutput`] + """ + if synced_gpus is None: + if is_deepspeed_zero3_enabled() and dist.get_world_size() > 1: + synced_gpus = True + else: + synced_gpus = False + + # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call + self._validate_model_class() + if hpu_graphs and not lazy_mode: + raise ValueError( + "`hpu_graphs` is True but `lazy_mode` is False. HPU graphs require `lazy_mode` to be set to True." + ) + + # priority: `generation_config` argument > `model.generation_config` (the default generation config) + if generation_config is None: + # legacy: users may modify the model configuration to control generation. To trigger this legacy behavior, + # three conditions must be met + # 1) the generation config must have been created from the model config (`_from_model_config` field); + # 2) the generation config must have seen no modification since its creation (the hash is the same); + # 3) the user must have set generation parameters in the model config. + if ( + self.generation_config._from_model_config + and self.generation_config._original_object_hash == hash(self.generation_config) + and self.config._has_non_default_generation_parameters() + ): + new_generation_config = GaudiGenerationConfig.from_model_config(self.config) + if new_generation_config != self.generation_config: + warnings.warn( + "You have modified the pretrained model configuration to control generation. This is a" + " deprecated strategy to control generation and will be removed soon, in a future version." + " Please use and modify the model generation configuration (see" + " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )" + ) + self.generation_config = new_generation_config + generation_config = self.generation_config + + generation_config = copy.deepcopy(generation_config) + if generation_config.static_shapes is None: + generation_config.static_shapes = self.config.model_type in MODELS_OPTIMIZED_WITH_STATIC_SHAPES + self.generation_config.static_shapes = generation_config.static_shapes + if generation_config.ignore_eos is None: + generation_config.ignore_eos = kwargs.get("ignore_eos", lazy_mode) + model_kwargs = generation_config.update(**kwargs) # All unused kwargs must be model kwargs + if self.config.model_type == "falcon" and "token_type_ids" in kwargs.keys(): + for key in ["token_type_ids"]: + model_kwargs.pop(key, None) + self._validate_model_kwargs(model_kwargs.copy()) + + # 2. 
Set generation parameters if not already defined + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + + if generation_config.pad_token_id is None and generation_config.eos_token_id is not None: + if model_kwargs.get("attention_mask", None) is None: + logger.warning( + "The attention mask and the pad token id were not set. As a consequence, you may observe " + "unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results." + ) + eos_token_id = generation_config.eos_token_id + if isinstance(eos_token_id, list): + eos_token_id = eos_token_id[0] + logger.warning( + f"Setting `pad_token_id` to `eos_token_id`:{generation_config.eos_token_id} for open-end generation." + ) + generation_config.pad_token_id = eos_token_id + + # 3. Define model inputs + # inputs_tensor has to be defined + # model_input_name is defined if model-specific keyword input is passed + # otherwise model_input_name is None + # all model-specific keyword inputs are removed from `model_kwargs` + inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs( + inputs, generation_config.bos_token_id, model_kwargs + ) + batch_size = inputs_tensor.shape[0] + + # 4. Define other model kwargs + model_kwargs["output_attentions"] = generation_config.output_attentions + model_kwargs["output_hidden_states"] = generation_config.output_hidden_states + # decoder-only models with inputs_embeds forwarding must use caching (otherwise we can't detect whether we are + # generating the first new token or not, and we only want to use the embeddings for the first new token) + if not self.config.is_encoder_decoder and model_input_name == "inputs_embeds": + model_kwargs["use_cache"] = True + else: + model_kwargs["use_cache"] = generation_config.use_cache + self.generation_config.max_length = generation_config.max_length + accepts_attention_mask = "attention_mask" in set(inspect.signature(self.forward).parameters.keys()) + requires_attention_mask = "encoder_outputs" not in model_kwargs + + if model_kwargs.get("attention_mask", None) is None and requires_attention_mask and accepts_attention_mask: + model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( + inputs_tensor, generation_config.pad_token_id, generation_config.eos_token_id + ) + + is_greedy_or_beam_and_bucket = ( + not generation_config.bucket_internal + and generation_config.bucket_size > 0 + and ( + self._get_generation_mode(generation_config, assistant_model) == GenerationMode.GREEDY_SEARCH + or self._get_generation_mode(generation_config, assistant_model) == GenerationMode.BEAM_SEARCH + ) + ) + model_kwargs["bucket_size"] = generation_config.bucket_size if generation_config.static_shapes else -1 + model_kwargs["bucket_internal"] = generation_config.bucket_internal + model_kwargs["reduce_recompile"] = ( + generation_config.reduce_recompile if generation_config.reduce_recompile is not None else False + ) + if model_kwargs["reduce_recompile"]: + assert generation_config.bucket_size + if generation_config.reuse_cache: + assert self.config.model_type in [ + "llama", + "mistral", + ], "reuse_cache only supported by llama and mistral at the moment" + if not generation_config.bucket_internal: + assert ( + generation_config.bucket_size <= 0 + ), "please set bucket_internal along with reuse_cache and bucket_size" + else: + assert generation_config.bucket_size >= 0, "please set valid 
bucket_size to use bucket_internal" + + if generation_config.static_shapes: + # Pad inputs to have static shapes during generation, this gives better performance than dynamic shapes on HPUs + # In encoder_decoder models, Inputs are already padded + + if not self.config.is_encoder_decoder: + # only pad if bucket_size < -1. If we are bucketing (bucket_size > 0), then that is taken care in greedy_search() + if not is_greedy_or_beam_and_bucket: + # token_idx is the current index in the generation process, it is incremented each time a new token is generated + token_idx = inputs_tensor.shape[-1] + model_kwargs["token_idx"] = torch.tensor(token_idx, device=inputs_tensor.device) + model_kwargs["token_idx_cpu"] = token_idx + if generation_config.max_new_tokens is None: + generation_config.max_new_tokens = generation_config.max_length - token_idx + inputs_tensor = torch.nn.functional.pad( + inputs_tensor, (0, generation_config.max_new_tokens), value=generation_config.pad_token_id + ) + for other_inputs in ["attention_mask", "token_type_ids"]: + if model_kwargs.get(other_inputs) is not None: + model_kwargs[other_inputs] = torch.nn.functional.pad( + model_kwargs[other_inputs], (0, generation_config.max_new_tokens), value=0 + ) + else: + assert generation_config.bucket_size <= 0, "Untested path for bucket>0" + token_idx = 1 + model_kwargs["token_idx"] = torch.tensor(token_idx, device=inputs_tensor.device) + if model_kwargs.get("decoder_attention_mask", None) is None and generation_config.use_cache: + max_length = ( + generation_config.max_new_tokens + 1 + if generation_config.max_new_tokens is not None + else generation_config.max_length + ) + model_kwargs["decoder_attention_mask"] = self._prepare_decoder_attention_mask( + max_length, + inputs_tensor.shape[0], + generation_config.pad_token_id, + inputs_tensor.device, + ) + + # decoder-only models should use left-padding for generation + if not self.config.is_encoder_decoder: + # If `input_ids` was given, check if the last id in any sequence is `pad_token_id` + # Note: If using, `inputs_embeds` this check does not work, because we want to be more hands-off. + if generation_config.pad_token_id is not None: + position = model_kwargs["token_idx"] - 1 if "token_idx" in model_kwargs else -1 + if ( + len(inputs_tensor.shape) == 2 + and torch.sum(inputs_tensor[:, position] == generation_config.pad_token_id) > 0 + ): + logger.warning( + "A decoder-only architecture is being used, but right-padding was detected! For correct " + "generation results, please set `padding_side='left'` when initializing the tokenizer." + ) + + if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs: + # if model is encoder decoder encoder_outputs are created + # and added to `model_kwargs` + model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation( + inputs_tensor, model_kwargs, model_input_name + ) + + # 5. Prepare `input_ids` which will be used for auto-regressive generation + if self.config.is_encoder_decoder: + input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation( + batch_size=batch_size, + model_input_name=model_input_name, + model_kwargs=model_kwargs, + decoder_start_token_id=generation_config.decoder_start_token_id, + bos_token_id=generation_config.bos_token_id, + device=inputs_tensor.device, + max_new_tokens=generation_config.max_new_tokens, + ) + else: + input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids") + + if streamer is not None: + streamer.put(input_ids.cpu()) + + # 6. 
Prepare `max_length` depending on other stopping criteria. + input_ids_length = input_ids.shape[-1] + has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None + if generation_config.max_new_tokens is not None: + if not has_default_max_length and generation_config.max_length is not None: + logger.warning( + f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" + f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " + "Please refer to the documentation for more information. " + "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)" + ) + if "token_idx" in model_kwargs: + generation_config.max_length = input_ids_length + else: + generation_config.max_length = generation_config.max_new_tokens + input_ids_length + # otherwise the total length [inputs-embeds-len + new-tokens-len] will go beyond indicated `max_length` + elif ( + model_input_name == "inputs_embeds" + and inputs_tensor.shape[:-1] != input_ids.shape + and not self.config.is_encoder_decoder + ): + generation_config.max_length -= inputs_tensor.shape[1] + + # if we don't pass `past_key_values` and a cache_implementation is specified + if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING and not model_kwargs.get( + "past_key_values", False + ): + cache_cls = NEED_SETUP_CACHE_CLASSES_MAPPING[generation_config.cache_implementation] + if not callable(getattr(self, "_setup_cache", None)): + raise ValueError( + "The `generation_config` defines a `cache_implementation` that is not compatible with this model." + " Make sure it has a `_setup_cache` function." + ) + self._setup_cache(cache_cls, max_batch_size=batch_size, max_cache_len=generation_config.max_length) + + self._validate_generated_length( + generation_config, + model_kwargs["token_idx"].item() if "token_idx" in model_kwargs else input_ids_length, + has_default_max_length, + ) + + # determine whether introduce trim_logits feature + model_kwargs["trim_logits"] = generation_config.trim_logits + + # determine whether attention softmax needs to execute in lower precision + model_kwargs["attn_softmax_bf16"] = generation_config.attn_softmax_bf16 + + # determine whether limit_hpu_graphs needs to be used + model_kwargs["limit_hpu_graphs"] = generation_config.limit_hpu_graphs + + # prepare for allocate kv cache + model_kwargs["reuse_cache"] = generation_config.reuse_cache + + # determine whether flash attention needs to be used + model_kwargs["use_flash_attention"] = generation_config.use_flash_attention + model_kwargs["flash_attention_recompute"] = True if generation_config.flash_attention_recompute else False + if not self.config.is_encoder_decoder: + calculated_max_length = input_ids.shape[-1] + if not generation_config.static_shapes and generation_config.max_new_tokens is not None: + calculated_max_length = input_ids.shape[-1] + generation_config.max_new_tokens + if generation_config.use_cache and generation_config.reuse_cache: + bs, _ = input_ids.shape + if not is_greedy_or_beam_and_bucket: + unwrap_deepspeed_model(self).allocate_kv_cache( + bs * generation_config.num_beams, + calculated_max_length, + token_idx, + generation_config.kv_cache_fp8, + ) + model_kwargs["kv_cache_len"] = calculated_max_length + + if self.config.model_type in ["llama"]: + if self.config.max_position_embeddings < calculated_max_length: + unwrap_deepspeed_model(self).update_sincos_cache(seq_len=calculated_max_length) + + # 7. 
determine generation mode + generation_mode = self._get_generation_mode(generation_config, assistant_model) + if generation_config.bucket_size > 0: + assert generation_config.static_shapes, "bucket_size > 0 can be set only when static_shapes is set" + # if generation_config.bucket_size <= 0, padding is handled by the generating fn (like greedy_search) + if generation_config.static_shapes and generation_config.bucket_size > 0: + assert ( + generation_mode == GenerationMode.GREEDY_SEARCH or generation_mode == GenerationMode.BEAM_SEARCH + ), "generation_config.bucket_size > 0 supported only for greedy mode" + + if streamer is not None and (generation_config.num_beams > 1): + raise ValueError( + "`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1." + ) + + if self.device.type != input_ids.device.type: + warnings.warn( + ( + "You are calling .generate() with the `input_ids` being on a device type different" + f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model" + f" is on {self.device.type}. You may experience unexpected behaviors or slower generation." + " Please make sure that you have put `input_ids` to the" + f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before" + " running `.generate()`." + ), + UserWarning, + ) + + # 8. prepare distribution pre_processing samplers + prepared_logits_processor = self._get_logits_processor( + generation_config=generation_config, + input_ids_seq_length=input_ids_length, + encoder_input_ids=inputs_tensor, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + logits_processor=logits_processor, + model_kwargs=model_kwargs, + negative_prompt_ids=negative_prompt_ids, + negative_prompt_attention_mask=negative_prompt_attention_mask, + ) + + # 9. prepare stopping criteria + self.generation_config.generation_mode = generation_mode + prepared_stopping_criteria = self._get_stopping_criteria( + generation_config=generation_config, stopping_criteria=stopping_criteria + ) + + # In lazy mode, import Habana torch to be able to add mark_step() + if lazy_mode: + import habana_frameworks.torch.core as htcore + + self.htcore_generation = htcore + + # 10. go into different generation modes + if generation_mode == GenerationMode.ASSISTED_GENERATION: + if generation_config.num_return_sequences > 1: + raise ValueError( + "num_return_sequences has to be 1 when doing assisted generate, " + f"but is {generation_config.num_return_sequences}." + ) + if batch_size > 1: + raise ValueError("assisted generate is only supported for batch_size = 1") + if not model_kwargs["use_cache"]: + raise ValueError("assisted generate requires `use_cache=True`") + + # 11. Get the candidate generator, given the parameterization + candidate_generator = self._get_candidate_generator( + generation_config=generation_config, + input_ids=input_ids, + inputs_tensor=inputs_tensor, + assistant_model=assistant_model, + logits_processor=logits_processor, + model_kwargs=model_kwargs, + ) + + # 12. 
run assisted generate + return self.assisted_decoding( + input_ids, + candidate_generator=candidate_generator, + do_sample=generation_config.do_sample, + logits_processor=prepared_logits_processor, + logits_warper=self._get_logits_warper(generation_config) if generation_config.do_sample else None, + stopping_criteria=prepared_stopping_criteria, + pad_token_id=generation_config.pad_token_id, + eos_token_id=generation_config.eos_token_id, + output_scores=generation_config.output_scores, + output_logits=generation_config.output_logits, + return_dict_in_generate=generation_config.return_dict_in_generate, + synced_gpus=synced_gpus, + streamer=streamer, + **model_kwargs, + ) + if generation_mode == GenerationMode.GREEDY_SEARCH: + # 11. run greedy search + return self.greedy_search( + input_ids, + logits_processor=prepared_logits_processor, + stopping_criteria=prepared_stopping_criteria, + pad_token_id=generation_config.pad_token_id, + eos_token_id=generation_config.eos_token_id, + output_scores=generation_config.output_scores, + output_logits=generation_config.output_logits, + return_dict_in_generate=generation_config.return_dict_in_generate, + synced_gpus=synced_gpus, + streamer=streamer, + lazy_mode=lazy_mode, + ignore_eos=generation_config.ignore_eos, + profiling_warmup_steps=profiling_warmup_steps, + profiling_steps=profiling_steps, + **model_kwargs, + ) + + elif generation_mode == GenerationMode.CONTRASTIVE_SEARCH: + if not model_kwargs["use_cache"]: + raise ValueError("Contrastive search requires `use_cache=True`") + + return self.contrastive_search( + input_ids, + top_k=generation_config.top_k, + penalty_alpha=generation_config.penalty_alpha, + logits_processor=prepared_logits_processor, + stopping_criteria=prepared_stopping_criteria, + pad_token_id=generation_config.pad_token_id, + eos_token_id=generation_config.eos_token_id, + output_scores=generation_config.output_scores, + output_logits=generation_config.output_logits, + return_dict_in_generate=generation_config.return_dict_in_generate, + synced_gpus=synced_gpus, + streamer=streamer, + sequential=generation_config.low_memory, + profiling_warmup_steps=profiling_warmup_steps, + profiling_steps=profiling_steps, + **model_kwargs, + ) + + elif generation_mode == GenerationMode.SAMPLE: + # 11. prepare logits warper + logits_warper = self._get_logits_warper(generation_config) + + # 12. expand input_ids with `num_return_sequences` additional sequences per batch + input_ids, model_kwargs = self._expand_inputs_for_generation( + input_ids=input_ids, + expand_size=generation_config.num_return_sequences, + is_encoder_decoder=self.config.is_encoder_decoder, + **model_kwargs, + ) + + # 13. run sample + return self.sample( + input_ids, + logits_processor=prepared_logits_processor, + logits_warper=logits_warper, + stopping_criteria=prepared_stopping_criteria, + pad_token_id=generation_config.pad_token_id, + eos_token_id=generation_config.eos_token_id, + output_scores=generation_config.output_scores, + output_logits=generation_config.output_logits, + return_dict_in_generate=generation_config.return_dict_in_generate, + synced_gpus=synced_gpus, + streamer=streamer, + lazy_mode=lazy_mode, + ignore_eos=generation_config.ignore_eos, + profiling_warmup_steps=profiling_warmup_steps, + profiling_steps=profiling_steps, + **model_kwargs, + ) + + elif generation_mode == GenerationMode.BEAM_SEARCH: + # 11. 
prepare beam search scorer + beam_scorer = BeamSearchScorer( + batch_size=batch_size, + num_beams=generation_config.num_beams, + device=inputs_tensor.device, + length_penalty=generation_config.length_penalty, + do_early_stopping=generation_config.early_stopping, + num_beam_hyps_to_keep=generation_config.num_return_sequences, + max_length=generation_config.max_length, + ) + # 12. interleave input_ids with `num_beams` additional sequences per batch + input_ids, model_kwargs = self._expand_inputs_for_generation( + input_ids=input_ids, + expand_size=generation_config.num_beams, + is_encoder_decoder=self.config.is_encoder_decoder, + **model_kwargs, + ) + # 13. run beam search + return self.beam_search( + input_ids, + beam_scorer, + logits_processor=prepared_logits_processor, + stopping_criteria=prepared_stopping_criteria, + pad_token_id=generation_config.pad_token_id, + eos_token_id=generation_config.eos_token_id, + output_scores=generation_config.output_scores, + output_logits=generation_config.output_logits, + return_dict_in_generate=generation_config.return_dict_in_generate, + synced_gpus=synced_gpus, + sequential=generation_config.low_memory, + lazy_mode=lazy_mode, + profiling_warmup_steps=profiling_warmup_steps, + profiling_steps=profiling_steps, + **model_kwargs, + ) + + elif generation_mode == GenerationMode.BEAM_SAMPLE: + # 11. prepare logits warper + logits_warper = self._get_logits_warper(generation_config) + + # 12. prepare beam search scorer + beam_scorer = BeamSearchScorer( + batch_size=batch_size, + num_beams=generation_config.num_beams, + device=inputs_tensor.device, + length_penalty=generation_config.length_penalty, + do_early_stopping=generation_config.early_stopping, + num_beam_hyps_to_keep=generation_config.num_return_sequences, + max_length=generation_config.max_length, + ) + + # 13. interleave input_ids with `num_beams` additional sequences per batch + input_ids, model_kwargs = self._expand_inputs_for_generation( + input_ids=input_ids, + expand_size=generation_config.num_beams, + is_encoder_decoder=self.config.is_encoder_decoder, + **model_kwargs, + ) + + # 14. run beam sample + return self.beam_sample( + input_ids, + beam_scorer, + logits_processor=prepared_logits_processor, + logits_warper=logits_warper, + stopping_criteria=prepared_stopping_criteria, + pad_token_id=generation_config.pad_token_id, + eos_token_id=generation_config.eos_token_id, + output_scores=generation_config.output_scores, + output_logits=generation_config.output_logits, + return_dict_in_generate=generation_config.return_dict_in_generate, + synced_gpus=synced_gpus, + lazy_mode=lazy_mode, + profiling_warmup_steps=profiling_warmup_steps, + profiling_steps=profiling_steps, + **model_kwargs, + ) + + elif generation_mode == GenerationMode.GROUP_BEAM_SEARCH: + # 11. prepare beam search scorer + beam_scorer = BeamSearchScorer( + batch_size=batch_size, + num_beams=generation_config.num_beams, + device=inputs_tensor.device, + length_penalty=generation_config.length_penalty, + do_early_stopping=generation_config.early_stopping, + num_beam_hyps_to_keep=generation_config.num_return_sequences, + num_beam_groups=generation_config.num_beam_groups, + max_length=generation_config.max_length, + ) + # 12. interleave input_ids with `num_beams` additional sequences per batch + input_ids, model_kwargs = self._expand_inputs_for_generation( + input_ids=input_ids, + expand_size=generation_config.num_beams, + is_encoder_decoder=self.config.is_encoder_decoder, + **model_kwargs, + ) + # 13. 
run beam search + return self.group_beam_search( + input_ids, + beam_scorer, + logits_processor=prepared_logits_processor, + stopping_criteria=prepared_stopping_criteria, + pad_token_id=generation_config.pad_token_id, + eos_token_id=generation_config.eos_token_id, + output_scores=generation_config.output_scores, + output_logits=generation_config.output_logits, + return_dict_in_generate=generation_config.return_dict_in_generate, + synced_gpus=synced_gpus, + lazy_mode=lazy_mode, + profiling_warmup_steps=profiling_warmup_steps, + profiling_steps=profiling_steps, + **model_kwargs, + ) + + elif generation_mode == GenerationMode.CONSTRAINED_BEAM_SEARCH: + final_constraints = [] + if generation_config.constraints is not None: + final_constraints = generation_config.constraints + + if generation_config.force_words_ids is not None: + + def typeerror(): + raise ValueError( + "`force_words_ids` has to either be a `List[List[List[int]]]` or `List[List[int]]` " + f"of positive integers, but is {generation_config.force_words_ids}." + ) + + if ( + not isinstance(generation_config.force_words_ids, list) + or len(generation_config.force_words_ids) == 0 + ): + typeerror() + + for word_ids in generation_config.force_words_ids: + if isinstance(word_ids[0], list): + if not isinstance(word_ids, list) or len(word_ids) == 0: + typeerror() + if any(not isinstance(token_ids, list) for token_ids in word_ids): + typeerror() + if any( + any((not isinstance(token_id, int) or token_id < 0) for token_id in token_ids) + for token_ids in word_ids + ): + typeerror() + + constraint = DisjunctiveConstraint(word_ids) + else: + if not isinstance(word_ids, list) or len(word_ids) == 0: + typeerror() + if any((not isinstance(token_id, int) or token_id < 0) for token_id in word_ids): + typeerror() + + constraint = PhrasalConstraint(word_ids) + final_constraints.append(constraint) + + # 11. prepare beam search scorer + constrained_beam_scorer = ConstrainedBeamSearchScorer( + constraints=final_constraints, + batch_size=batch_size, + num_beams=generation_config.num_beams, + device=inputs_tensor.device, + length_penalty=generation_config.length_penalty, + do_early_stopping=generation_config.early_stopping, + num_beam_hyps_to_keep=generation_config.num_return_sequences, + max_length=generation_config.max_length, + ) + # 12. interleave input_ids with `num_beams` additional sequences per batch + input_ids, model_kwargs = self._expand_inputs_for_generation( + input_ids=input_ids, + expand_size=generation_config.num_beams, + is_encoder_decoder=self.config.is_encoder_decoder, + **model_kwargs, + ) + # 13. 
run beam search + return self.constrained_beam_search( + input_ids, + constrained_beam_scorer=constrained_beam_scorer, + logits_processor=prepared_logits_processor, + stopping_criteria=prepared_stopping_criteria, + pad_token_id=generation_config.pad_token_id, + eos_token_id=generation_config.eos_token_id, + output_scores=generation_config.output_scores, + output_logits=generation_config.output_logits, + return_dict_in_generate=generation_config.return_dict_in_generate, + synced_gpus=synced_gpus, + lazy_mode=lazy_mode, + profiling_warmup_steps=profiling_warmup_steps, + profiling_steps=profiling_steps, + **model_kwargs, + ) + + @torch.no_grad() + def contrastive_search( + self, + input_ids: torch.LongTensor, + top_k: Optional[int] = 1, + penalty_alpha: Optional[float] = 0, + logits_processor: Optional[LogitsProcessorList] = None, + logits_warper: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: bool = False, + streamer: Optional["BaseStreamer"] = None, + sequential: Optional[bool] = None, + lazy_mode: Optional[bool] = False, + profiling_warmup_steps: Optional[int] = 0, + profiling_steps: Optional[int] = 0, + **model_kwargs, + ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **contrastive search** and can + be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + + + In most cases, you do not need to call [`~generation.GenerationMixin.contrastive_search`] directly. Use + generate() instead. For an overview of generation strategies and code examples, check the [following + guide](../generation_strategies). + + + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + top_k (`int`, *optional*, defaults to 1): + The size of the candidate set that is used to re-rank for contrastive search + penalty_alpha (`float`, *optional*, defaults to 0): + The degeneration penalty for contrastive search; activate when it is larger than 0 + logits_processor (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + logits_warper (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used + to warp the prediction score distribution of the language modeling head applied before multinomial + sampling at each generation step. + stopping_criteria (`StoppingCriteriaList`, *optional*): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`Union[int, List[int]]`, *optional*): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. 
+ output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + output_logits (`bool`, *optional*, defaults to `False`): + Whether or not to return the raw prediction logit scores. See `logits` under returned tensors + for more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`transformers.generationutils.ModelOutput`] instead of a plain tuple. + synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + streamer (`BaseStreamer`, *optional*): + Streamer object that will be used to stream the generated sequences. Generated tokens are passed + through `streamer.put(token_ids)` and the streamer is responsible for any further processing. + lazy_mode (`bool`, *optional*, defaults to `False`): + Whether the run is executed in lazy mode or not (i.e. eager mode). + profiling_warmup_steps (`int`, *optional*, defaults to 0): + Number of steps to ignore for profling. + profiling_steps (`int`, *optional*, defaults to 0): + Number of steps to be captured when enabling profiling. + model_kwargs: + Additional model specific keyword arguments will be forwarded to the `forward` function of the model. + If model is an encoder-decoder model the kwargs should include `encoder_outputs`. + + Return: + [`transformers.generation.GenerateDecoderOnlyOutput`], + [`transformers.generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` + containing the generated tokens (default behaviour) or a + [`transformers.generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`transformers.generation.GenerateEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + + Examples: + ```python + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForCausalLM, + ... StoppingCriteriaList, + ... MaxLengthCriteria, + ... ) + + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") + >>> model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") + >>> # set pad_token_id to eos_token_id because OPT does not have a PAD token + >>> model.config.pad_token_id = model.config.eos_token_id + >>> input_prompt = "DeepMind Company is" + >>> input_ids = tokenizer(input_prompt, return_tensors="pt") + >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=64)]) + >>> outputs = model.contrastive_search( + ... **input_ids, penalty_alpha=0.6, top_k=4, stopping_criteria=stopping_criteria + ... ) + >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) + ['DeepMind Company is a company that focuses on the development and commercialization of artificial intelligence (AI). 
DeepMind’s mission is to help people understand and solve problems that are difficult to solve in the world today.\n\nIn this post, we talk about the benefits of deep learning in business and how it'] + ```""" + + raise NotImplementedError("Contrastive search is not supported by optimum-habana yet.") + + def greedy_search( + self, + input_ids: torch.LongTensor, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: bool = False, + streamer: Optional["BaseStreamer"] = None, + lazy_mode: Optional[bool] = False, + ignore_eos: Optional[bool] = False, + profiling_warmup_steps: Optional[int] = 0, + profiling_steps: Optional[int] = 0, + **model_kwargs, + ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be + used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + + + In most cases, you do not need to call [`~generation.GenerationMixin.greedy_search`] directly. Use generate() + instead. For an overview of generation strategies and code examples, check the [following + guide](../generation_strategies). + + + + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + logits_processor (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`, *optional*): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + max_length (`int`, *optional*, defaults to 20): + **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated + tokens. The maximum length of the sequence to be generated. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`Union[int, List[int]]`, *optional*): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + output_logits (`bool`, *optional*, defaults to `False`): + Whether or not to return the raw prediction logit scores. See `logits` under returned tensors + for more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`transformers.generationutils.ModelOutput`] instead of a plain tuple. 
+ synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + streamer (`BaseStreamer`, *optional*): + Streamer object that will be used to stream the generated sequences. Generated tokens are passed + through `streamer.put(token_ids)` and the streamer is responsible for any further processing. + lazy_mode (`bool`, *optional*, defaults to `False`): + Whether the run is executed in lazy mode or not (i.e. eager mode). + ignore_eos (`bool`, *optional*, defaults to `False`): + Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode). + profiling_warmup_steps (`int`, *optional*, defaults to 0): + Number of steps to ignore for profling. + profiling_steps (`int`, *optional*, defaults to 0): + Number of steps to be captured when enabling profiling. + model_kwargs: + Additional model specific keyword arguments will be forwarded to the `forward` function of the model. + If model is an encoder-decoder model the kwargs should include `encoder_outputs`. + + Return: + [`transformers.generation.GenerateDecoderOnlyOutput`], [`transformers.generation.GenerateEncoderDecoderOutput`] + or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`transformers.generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`transformers.generation.GenerateEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + + Examples: + + ```python + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForCausalLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... StoppingCriteriaList, + ... MaxLengthCriteria, + ... ) + + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") + >>> model = AutoModelForCausalLM.from_pretrained("gpt2") + + >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token + >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id + + >>> input_prompt = "It might be possible to" + >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList( + ... [ + ... MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id), + ... ] + ... ) + >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)]) + + >>> outputs = model.greedy_search( + ... input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria + ... ) + + >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) + ["It might be possible to get a better understanding of the nature of the problem, but it's not"] + ```""" + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + warnings.warn( + ( + "`max_length` is deprecated in this function, use" + " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead." 
+ ), + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_attentions = ( + output_attentions if output_attentions is not None else self.generation_config.output_attentions + ) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate + if return_dict_in_generate is not None + else self.generation_config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + raw_logits = () if (return_dict_in_generate and output_logits) else None + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + # keep track of which sequences are already finished + if not ignore_eos: + unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + + hb_profer = HabanaProfile(warmup=profiling_warmup_steps, active=profiling_steps) + hb_profer.start() + this_peer_finished = False # used by synced_gpus only + bucket_size = model_kwargs.get("bucket_size", -1) + prev_idx = -1 # avoiding calculate cache_idx when its value is not changing + bucket_internal = model_kwargs.get("bucket_internal", None) + reduce_recompile = model_kwargs.get("reduce_recompile", False) + + prompt_len = input_ids.shape[-1] + if not bucket_internal: + if bucket_size >= 0: + inc = iter(incrementor(bucket_size, prompt_len)) + if bucket_size > 0: + assert "position_ids" not in model_kwargs, "Untested path" + cur_len = prompt_len + token_idx = model_kwargs.get("token_idx", None) + if token_idx is not None: + # Update cur_len in case of static shapes + cur_len = token_idx.item() + while True: + if lazy_mode: + self.htcore_generation.mark_step() + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + if bucket_size > 0 and not bucket_internal: + # it will not have been padded if bucket_size > 0 + params = next(inc) + input_ids, model_kwargs = self.update_model_kwargs_for_bucketing( + params, input_ids, model_kwargs, pad_token_id, bucket_size, reduce_recompile + ) + + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) + + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + **hpu_graphs_kwargs, + ) + + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + + token_idx = model_kwargs.get("token_idx", None) + if token_idx is not None and outputs.logits.shape[-2] > 1: + # case1 (w/o KV caching): outputs.logits.shape: [batch_size, max_length, vocab_size] + if self.config.is_encoder_decoder: + next_token_logits = outputs.logits[:, token_idx - 1, :] + next_tokens_scores = logits_processor(input_ids[:, :token_idx], next_token_logits) + else: + next_token_logits = torch.index_select(outputs.logits, -2, token_idx - 1).squeeze(-2) + next_tokens_scores = logits_processor(input_ids, next_token_logits) + else: + next_token_logits = outputs.logits[:, -1, :] + if token_idx is not None and self.config.is_encoder_decoder: + # case2 (with KV caching): outputs.logits.shape: [batch_size, 1, vocab_size] + next_tokens_scores = logits_processor(input_ids[:, :token_idx], next_token_logits) + else: + # case3 (default case): token_idx is None + next_tokens_scores = logits_processor(input_ids, next_token_logits) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_tokens_scores,) + if output_logits: + raw_logits += (next_token_logits,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # argmax + next_tokens = torch.argmax(next_tokens_scores, dim=-1) + # finished sentences should have their next token be a padding token + if not ignore_eos and eos_token_id is not None: + if pad_token_id is None: + raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + if token_idx is not None: + input_ids.index_copy_( + 1, token_idx, next_tokens.unsqueeze(-1) if next_tokens.dim() == 1 else next_tokens + ) + else: + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + if streamer is not None: + streamer.put(next_tokens.cpu()) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + if bucket_size > 0 and bucket_internal: + # Calculate slice idx for kv cache during the decode phase. + # Breaking down the kv cache in the attention block helps to reduce computation time. 
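+ # Illustrative walkthrough with hypothetical values (not taken from this patch): with bucket_size=128,
+ # kv_cache_len=512 and token_idx_cpu=130, idx = (130 - 1) // 128 = 1, so cache_idx = (1 + 1) * 128 = 256
+ # and the attention block only slices the first 256 KV-cache slots; once the position moves past the
+ # last full bucket, cache_idx falls back to the full kv_cache_len.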
+ if model_kwargs.get("token_idx_cpu") <= (model_kwargs["kv_cache_len"] // bucket_size) * bucket_size: + idx = (model_kwargs.get("token_idx_cpu") - 1) // bucket_size + if prev_idx != idx: + model_kwargs["cache_idx"] = (idx + 1) * bucket_size + prev_idx = idx + else: + model_kwargs["cache_idx"] = model_kwargs["kv_cache_len"] + cur_len = cur_len + 1 + + # if eos_token was found in one sentence, set sentence to finished + if not ignore_eos and eos_token_id_tensor is not None: + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + # stop when each sentence is finished + if not ignore_eos and unfinished_sequences.max() == 0: + this_peer_finished = True + + # stop if we exceed the maximum length + if stopping_criteria(input_ids, scores, token_idx=cur_len): + this_peer_finished = True + + hb_profer.step() + + if this_peer_finished and not synced_gpus: + break + + hb_profer.stop() + if streamer is not None: + streamer.end() + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + return GenerateEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + logits=raw_logits, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return GenerateDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + logits=raw_logits, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return input_ids + + def sample( + self, + input_ids: torch.LongTensor, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_warper: Optional[LogitsProcessorList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: bool = False, + streamer: Optional["BaseStreamer"] = None, + lazy_mode: Optional[bool] = False, + ignore_eos: Optional[bool] = False, + profiling_warmup_steps: Optional[int] = 0, + profiling_steps: Optional[int] = 0, + **model_kwargs, + ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and + can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + + + In most cases, you do not need to call [`~generation.GenerationMixin.sample`] directly. Use generate() instead. + For an overview of generation strategies and code examples, check the [following + guide](../generation_strategies). + + + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + logits_processor (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. 
+ stopping_criteria (`StoppingCriteriaList`, *optional*): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + logits_warper (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used + to warp the prediction score distribution of the language modeling head applied before multinomial + sampling at each generation step. + max_length (`int`, *optional*, defaults to 20): + **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated + tokens. The maximum length of the sequence to be generated. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`Union[int, List[int]]`, *optional*): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + output_logits (`bool`, *optional*, defaults to `False`): + Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for + more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`transformers.utils.ModelOutput`] instead of a plain tuple. + synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + streamer (`BaseStreamer`, *optional*): + Streamer object that will be used to stream the generated sequences. Generated tokens are passed + through `streamer.put(token_ids)` and the streamer is responsible for any further processing. + lazy_mode (`bool`, *optional*, defaults to `False`): + Whether the run is executed in lazy mode or not (i.e. eager mode). + ignore_eos (`bool`, *optional*, defaults to `False`): + Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode). + profiling_warmup_steps (`int`, *optional*, defaults to 0): + Number of steps to ignore for profiling. + profiling_steps (`int`, *optional*, defaults to 0): + Number of steps to be captured when enabling profiling. + model_kwargs: + Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is + an encoder-decoder model the kwargs should include `encoder_outputs`. + + Return: + [`transformers.generation.GenerateDecoderOnlyOutput`], [`transformers.generation.GenerateEncoderDecoderOutput`] or + `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`transformers.generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`transformers.generation.GenerateEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + + Examples: + + ```python + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForCausalLM, + ... LogitsProcessorList, + ...
MinLengthLogitsProcessor, + ... TopKLogitsWarper, + ... TemperatureLogitsWarper, + ... StoppingCriteriaList, + ... MaxLengthCriteria, + ... ) + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") + >>> model = AutoModelForCausalLM.from_pretrained("gpt2") + + >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token + >>> model.config.pad_token_id = model.config.eos_token_id + >>> model.generation_config.pad_token_id = model.config.eos_token_id + + >>> input_prompt = "Today is a beautiful day, and" + >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList( + ... [ + ... MinLengthLogitsProcessor(15, eos_token_id=model.generation_config.eos_token_id), + ... ] + ... ) + >>> # instantiate logits warpers + >>> logits_warper = LogitsProcessorList( + ... [ + ... TopKLogitsWarper(50), + ... TemperatureLogitsWarper(0.7), + ... ] + ... ) + + >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)]) + + >>> torch.manual_seed(0) # doctest: +IGNORE_RESULT + >>> outputs = model.sample( + ... input_ids, + ... logits_processor=logits_processor, + ... logits_warper=logits_warper, + ... stopping_criteria=stopping_criteria, + ... ) + + >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) + ['Today is a beautiful day, and we must do everything possible to make it a day of celebration.'] + ```""" + + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + warnings.warn( + ( + "`max_length` is deprecated in this function, use" + " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead." + ), + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList() + pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_logits = output_logits if output_logits is not None else self.generation_config.output_logits + output_attentions = ( + output_attentions if output_attentions is not None else self.generation_config.output_attentions + ) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate + if return_dict_in_generate is not None + else self.generation_config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + raw_logits = () if (return_dict_in_generate and output_logits) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None +
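+ # `scores` accumulates the processed (warped) scores actually used for sampling, while `raw_logits`
+ # accumulates the unmodified logits returned by the model; both are only populated when
+ # `return_dict_in_generate` is set together with the corresponding output flag.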
+ # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + # keep track of which sequences are already finished + # TODO: no ignore_eos check here since there is a compilation error, will add ignore_eos here if fixed + unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + hb_profer = HabanaProfile(warmup=profiling_warmup_steps, active=profiling_steps) + hb_profer.start() + this_peer_finished = False # used by synced_gpus only + cur_len = input_ids.shape[-1] + token_idx = model_kwargs.get("token_idx", None) + if token_idx is not None: + # Update cur_len in case of static shapes + cur_len = token_idx.item() + + # auto-regressive generation + while True: + if lazy_mode: + self.htcore_generation.mark_step() + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) + + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + **hpu_graphs_kwargs, + ) + + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + + token_idx = model_kwargs.get("token_idx", None) + if token_idx is not None and outputs.logits.shape[-2] > 1: + next_token_logits = torch.index_select(outputs.logits, -2, token_idx - 1).squeeze(-2) + else: + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + next_token_scores = logits_warper(input_ids, next_token_scores) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores,) + if output_logits: + raw_logits += (next_token_logits,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # sample + probs = torch.nn.functional.softmax(next_token_scores, dim=-1) + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + + # finished sentences should have their next token be a padding token + # TODO: no ignore_eos check here since there is a compilation error, will add ignore_eos here if fixed + if eos_token_id is not None: + if pad_token_id is None: + raise ValueError("If `eos_token_id` is defined, make 
sure that `pad_token_id` is defined.") + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + if token_idx is not None: + input_ids.index_copy_( + 1, token_idx, next_tokens.unsqueeze(-1) if next_tokens.dim() == 1 else next_tokens + ) + else: + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + if streamer is not None: + streamer.put(next_tokens.cpu()) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + cur_len = cur_len + 1 + # if eos_token was found in one sentence, set sentence to finished + if not ignore_eos and eos_token_id_tensor is not None: + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + + # stop when each sentence is finished + if not ignore_eos and unfinished_sequences.max() == 0: + this_peer_finished = True + + # stop if we exceed the maximum length + if stopping_criteria(input_ids, scores, token_idx=cur_len): + this_peer_finished = True + + hb_profer.step() + + if this_peer_finished and not synced_gpus: + break + + hb_profer.stop() + if streamer is not None: + streamer.end() + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + return GenerateEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + logits=raw_logits, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return GenerateDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + logits=raw_logits, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return input_ids + + def beam_search( + self, + input_ids: torch.LongTensor, + beam_scorer: BeamScorer, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: bool = False, + sequential: Optional[bool] = None, + lazy_mode: Optional[bool] = False, + profiling_warmup_steps: Optional[int] = 0, + profiling_steps: Optional[int] = 0, + **model_kwargs, + ) -> Union[GenerateBeamOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **beam search decoding** and + can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + + + In most cases, you do not need to call [`~generation.GenerationMixin.beam_search`] directly. Use generate() + instead. For an overview of generation strategies and code examples, check the [following + guide](../generation_strategies). + + + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. 
+ beam_scorer (`BeamScorer`): + A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and + sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. + logits_processor (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`, *optional*): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + max_length (`int`, *optional*, defaults to 20): + **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated + tokens. The maximum length of the sequence to be generated. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`Union[int, List[int]]`, *optional*): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + output_logits (`bool`, *optional*, defaults to `False`): + Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for + more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`transformers.utils.ModelOutput`] instead of a plain tuple. + synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + sequential (`bool`, defaults to `False`): + By default, beam search has `batch_size * num_beams` as effective batch size (see `beam_search()` for + more details). This flag will avoid parallelizing the beam search and will instead run beam search + sequentially. + lazy_mode (`bool`, *optional*, defaults to `False`): + Whether the run is executed in lazy mode or not (i.e. eager mode). + profiling_warmup_steps (`int`, *optional*, defaults to 0): + Number of steps to ignore for profiling. + profiling_steps (`int`, *optional*, defaults to 0): + Number of steps to be captured when enabling profiling. + model_kwargs: + Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is + an encoder-decoder model the kwargs should include `encoder_outputs`. + + Return: + [`transformers.generation.GenerateBeamDecoderOnlyOutput`], [`transformers.generation.GenerateBeamEncoderDecoderOutput`] or + `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`transformers.generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`transformers.generation.GenerateBeamEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + + Examples: + + ```python + >>> from transformers import ( + ...
AutoTokenizer, + ... AutoModelForSeq2SeqLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... BeamSearchScorer, + ... ) + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + + >>> encoder_input_str = "translate English to German: How old are you?" + >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids + + >>> # let's run beam search using 3 beams + >>> num_beams = 3 + >>> # define decoder start token ids + >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) + >>> input_ids = input_ids * model.config.decoder_start_token_id + + >>> # add encoder_outputs to model keyword arguments + >>> model_kwargs = { + ... "encoder_outputs": model.get_encoder()( + ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True + ... ) + ... } + + >>> # instantiate beam scorer + >>> beam_scorer = BeamSearchScorer( + ... batch_size=1, + ... num_beams=num_beams, + ... device=model.device, + ... ) + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList( + ... [ + ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), + ... ] + ... ) + + >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) + + >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) + ['Wie alt bist du?'] + ```""" + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + sequential = sequential if sequential is not None else self.generation_config.low_memory + if max_length is not None: + warnings.warn( + ( + "`max_length` is deprecated in this function, use" + " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead." + ), + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + if len(stopping_criteria) == 0: + warnings.warn("You have not defined any stopping_criteria; this will likely loop forever", UserWarning) + pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_logits = output_logits if output_logits is not None else self.generation_config.output_logits + output_attentions = ( + output_attentions if output_attentions is not None else self.generation_config.output_attentions + ) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate + if return_dict_in_generate is not None + else self.generation_config.return_dict_in_generate + ) + + batch_size = len(beam_scorer._beam_hyps) + num_beams = beam_scorer.num_beams + + batch_beam_size, cur_len = input_ids.shape + token_idx = model_kwargs.get("token_idx", None) + if token_idx is not None: + # Update cur_len in case of static shapes + cur_len = token_idx.item() + + if num_beams * batch_size != batch_beam_size: + raise ValueError( + f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is
{batch_beam_size}." + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + raw_logits = () if (return_dict_in_generate and output_logits) else None + beam_indices = ( + tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None + ) + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens + # of the first beam are considered to avoid sampling the exact same tokens across all beams. + beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view((batch_size * num_beams,)) + + if self.generation_config.static_shapes: + beam_trace_scores = torch.zeros( + (input_ids.shape[1], 2 * batch_size * num_beams), device=input_ids.device, dtype=torch.float32 + ) + beam_trace_indices = torch.zeros( + (input_ids.shape[1], 2 * batch_size * num_beams), device=input_ids.device, dtype=torch.int64 + ) + beam_trace_tokens = torch.zeros( + (input_ids.shape[1], 2 * batch_size * num_beams), device=input_ids.device, dtype=torch.int64 + ) + beam_trace_idx = torch.tensor(0, device=input_ids.device) + num_eos_tokens = torch.zeros((1), device=input_ids.device, dtype=torch.int64) + num_beams_tensor = torch.tensor(num_beams, device=input_ids.device, dtype=torch.int64) + + def finalize_beams(initial_ids, beam_trace, model_config, length_penalty): + beam_trace_idx, beam_trace_scores, beam_trace_indices, beam_trace_tokens = beam_trace + bs = initial_ids.shape[0] + num_beams = beam_trace_scores.shape[1] // (2 * bs) + + beam_trace_idx = beam_trace_idx.item() + beam_trace_scores = beam_trace_scores[:beam_trace_idx, :] + beam_trace_indices = beam_trace_indices[:beam_trace_idx, :] + beam_trace_tokens = beam_trace_tokens[:beam_trace_idx, :] + + # (score, parent_beam, token_id, is_finished) + root = (float("-inf"), None, None, False) + + def resolve_beam(beam): + if beam == root: + return [] + score, prev, tok, is_finished = beam + rest = resolve_beam(prev) + rest.append(tok) + return rest + + prev_beams = [[root] * num_beams] * bs + best = [root] * bs + + def beam_score(beam): + return (beam[3], beam[0]) + + for step, (scores, indices, tokens) in enumerate( + zip(beam_trace_scores, beam_trace_indices, beam_trace_tokens) + ): + cur_beams = [[] for _ in range(bs)] + for idx, (s, i, t) in enumerate(zip(scores, indices, tokens)): + batch = idx // (num_beams * 2) + idx = idx % (num_beams * 2) + b_len = 1 + step + b_score = s.item() / (b_len**length_penalty) + b_tok = t.item() + is_finished = b_tok == model_config.eos_token_id + if len(cur_beams[batch]) >= num_beams: + continue + beam = (b_score, prev_beams[batch][i], b_tok, is_finished) + if not is_finished: + cur_beams[batch].append(beam) + if is_finished or (step + 1 == beam_trace_idx): + if 
beam_score(best[batch]) < beam_score(beam): + best[batch] = beam + prev_beams = cur_beams + + def expand_if_needed(tensor, new_size, value, dim=-1): + orig_len = tensor.shape[dim] + padding_len = new_size - orig_len + import torch.nn.functional as F + + if padding_len > 0: + if dim == -1: + return F.pad(tensor, (0, padding_len), value=value) + elif dim == -2: + return F.pad(tensor, (0, 0, 0, padding_len), value=value) + else: + assert False, f"Unsupported dim value: {dim}" + return tensor + + result = [ + torch.cat( + [initial_ids[i], torch.tensor(resolve_beam(b), dtype=initial_ids.dtype, device=initial_ids.device)] + ) + for i, b in enumerate(best) + ] + max_length = max([t.shape[-1] for t in result]) + result = [expand_if_needed(res, max_length, model_config.pad_token_id) for res in result] + input_ids = torch.stack(result) + return input_ids + + hb_profer = HabanaProfile(warmup=profiling_warmup_steps, active=profiling_steps) + hb_profer.start() + this_peer_finished = False # used by synced_gpus only + + bucket_size = model_kwargs.get("bucket_size", -1) + reduce_recompile = model_kwargs.get("reduce_recompile", False) + prompt_len = input_ids.shape[-1] + if bucket_size >= 0: + inc = iter(incrementor(bucket_size, prompt_len)) + if bucket_size > 0: + assert "position_ids" not in model_kwargs, "Untested path" + if self.generation_config.static_shapes: + initial_ids = input_ids[::num_beams, 0:cur_len] + while True: + if lazy_mode: + self.htcore_generation.mark_step() + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + if bucket_size > 0: + # it will not have been padded if bucket_size > 0 + params = next(inc) + input_ids, model_kwargs = self.update_model_kwargs_for_bucketing( + params, input_ids, model_kwargs, pad_token_id, bucket_size, reduce_recompile + ) + + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # if sequential is True, split the input to batches of batch_size and run sequentially + if sequential: + if any( + model_name in self.__class__.__name__.lower() + for model_name in [ + "fsmt", + "reformer", + "bloom", + "ctrl", + "gpt_bigcode", + "transo_xl", + "xlnet", + "cpm", + ] + ): + raise RuntimeError( + f"Currently generation for {self.__class__.__name__} is not supported " + f"for `low_memory beam_search`. Please open an issue on GitHub if you need this feature." 
+ ) + + inputs_per_sub_batches = _split_model_inputs( + model_inputs, split_size=batch_size, full_batch_size=batch_beam_size + ) + outputs_per_sub_batch = [ + self( + **inputs_per_sub_batch, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + for inputs_per_sub_batch in inputs_per_sub_batches + ] + + outputs = stack_model_outputs(outputs_per_sub_batch) + else: + hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + **hpu_graphs_kwargs, + ) + + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + + token_idx = model_kwargs.get("token_idx", None) + if token_idx is not None and outputs.logits.shape[-2] > 1: + next_token_logits = torch.index_select(outputs.logits, -2, token_idx - 1).squeeze(-2) + else: + next_token_logits = outputs.logits[:, -1, :] + + next_token_scores = torch.nn.functional.log_softmax( + next_token_logits, dim=-1 + ) # (batch_size * num_beams, vocab_size) + + if token_idx is not None: + next_token_scores_processed = logits_processor(input_ids[:, :token_idx], next_token_scores) + else: + next_token_scores_processed = logits_processor(input_ids, next_token_scores) + next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as( + next_token_scores_processed + ) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores_processed,) + if output_logits: + raw_logits += (next_token_logits,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # reshape for beam search + vocab_size = next_token_scores.shape[-1] + next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) + + # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam. 
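+ # On HPU with static shapes, the candidates selected below are not resolved through
+ # `beam_scorer.process`; their scores, parent-beam indices and tokens are instead appended to the
+ # pre-allocated `beam_trace_*` buffers, and the winning hypotheses are reconstructed on CPU after
+ # the loop by `finalize_beams` defined above.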
+ n_eos_tokens = len(eos_token_id) if eos_token_id else 0 + next_token_scores, next_tokens = torch.topk( + next_token_scores, max(2, 1 + n_eos_tokens) * num_beams, dim=1, largest=True, sorted=True + ) + + next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") + if self.generation_config.static_shapes: + beam_scores = next_token_scores.flatten() + static_beam_indices = next_indices.flatten() + + beam_tokens = next_tokens.remainder(vocab_size).flatten() + + beam_trace_scores.index_copy_(0, beam_trace_idx, beam_scores.unsqueeze(0)) + beam_trace_indices.index_copy_(0, beam_trace_idx, static_beam_indices.unsqueeze(0)) + beam_trace_tokens.index_copy_(0, beam_trace_idx, beam_tokens.unsqueeze(0)) + beam_trace_idx.add_(1) + + if self.generation_config.early_stopping: + num_eos_tokens.add_(beam_tokens[0:num_beams].eq(self.config.eos_token_id).sum()) + + beam_scores.add_(torch.where(beam_tokens.eq(self.config.eos_token_id), float("-inf"), 0.0)) + beam_scores = beam_scores.view(batch_size, -1).unsqueeze(0) + _, selected = torch.topk(beam_scores, k=num_beams, dim=-1, largest=True, sorted=True) + offset = torch.arange(0, torch.numel(beam_scores), beam_scores.shape[-1]).unsqueeze(-1) + selected = (selected + offset).flatten() + beam_scores = beam_scores.flatten().index_select(0, selected) + beam_tokens = beam_tokens.index_select(0, selected) + static_beam_indices = static_beam_indices.index_select(0, selected) + + prev_beams = outputs.logits.shape[0] // batch_size + + beam_offsets = torch.arange(0, 1, prev_beams, dtype=torch.int32) + beam_offsets = beam_offsets.to(device=outputs.logits.device) + static_beam_indices = (static_beam_indices.view(batch_size, -1) + beam_offsets.unsqueeze(-1)).flatten() + + next_tokens = beam_tokens.unsqueeze(-1) + beam_next_tokens = next_tokens + beam_idx = static_beam_indices + else: + next_tokens = next_tokens % vocab_size + # stateless + beam_outputs = beam_scorer.process( + input_ids, + next_token_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + beam_indices=beam_indices, + decoder_prompt_len=prompt_len, + ) + beam_scores = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] + + if token_idx is not None: + input_ids = torch.index_select(input_ids, 0, beam_idx) + input_ids.index_copy_( + 1, token_idx, beam_next_tokens.unsqueeze(-1) if beam_next_tokens.dim() == 1 else beam_next_tokens + ) + else: + input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) + + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + if model_kwargs["past_key_values"] is not None: + if model_kwargs["reuse_cache"]: + model_kwargs["past_key_values"] = unwrap_deepspeed_model(self).reorder_kv_cache(beam_idx) + else: + model_kwargs["past_key_values"] = self._temporary_reorder_cache( + model_kwargs["past_key_values"], beam_idx + ) + + if return_dict_in_generate and output_scores: + beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) + + # increase cur_len + cur_len = cur_len + 1 + + hb_profer.step() + if self.generation_config.static_shapes: + is_min_length_reached = ( + self.generation_config.min_length and cur_len >= self.generation_config.min_length + ) + if ( + self.generation_config.early_stopping + and is_min_length_reached + and num_eos_tokens >= num_beams_tensor + ): + break + elif 
stopping_criteria(input_ids, scores, token_idx=cur_len): + break + elif stopping_criteria(input_ids, scores) or (beam_scorer.is_done and not lazy_mode): + if not synced_gpus: + break + else: + this_peer_finished = True + hb_profer.stop() + + if self.generation_config.static_shapes: + beam_trace = (beam_trace_idx, beam_trace_scores, beam_trace_indices, beam_trace_tokens) + from collections import UserDict + + def map_tensors(obj, fn): + constructor = type(obj) + if isinstance(obj, tuple): + return constructor(map_tensors(v, fn) for v in obj) + if isinstance(obj, list): + return constructor([map_tensors(v, fn) for v in obj]) + if isinstance(obj, dict) or isinstance(obj, UserDict): + return constructor({k: map_tensors(v, fn) for k, v in obj.items()}) + if isinstance(obj, torch.Tensor): + return fn(obj) + return obj + + def move(obj, device): + return map_tensors(obj, lambda t: t.to(device)) + + sequence_outputs = {} + sequence_outputs["sequences"] = finalize_beams( + initial_ids.cpu(), move(beam_trace, "cpu"), self.config, self.generation_config.length_penalty + ) + else: + sequence_outputs = beam_scorer.finalize( + input_ids, + beam_scores, + next_tokens, + beam_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + max_length=stopping_criteria.max_length, + beam_indices=beam_indices, + decoder_prompt_len=prompt_len, + ) + + if return_dict_in_generate: + if not output_scores: + sequence_outputs["sequence_scores"] = None + + if self.config.is_encoder_decoder: + return GenerateBeamEncoderDecoderOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + logits=raw_logits, + beam_indices=sequence_outputs["beam_indices"], + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return GenerateBeamDecoderOnlyOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + logits=raw_logits, + beam_indices=sequence_outputs["beam_indices"], + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return sequence_outputs["sequences"] + + def beam_sample( + self, + input_ids: torch.LongTensor, + beam_scorer: BeamScorer, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_warper: Optional[LogitsProcessorList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: bool = False, + lazy_mode: Optional[bool] = False, + profiling_warmup_steps: Optional[int] = 0, + profiling_steps: Optional[int] = 0, + **model_kwargs, + ) -> Union[GenerateBeamOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **beam search multinomial + sampling** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. 
+
+
+
+ In most cases, you do not need to call [`~generation.GenerationMixin.beam_sample`] directly. Use generate()
+ instead. For an overview of generation strategies and code examples, check the [following
+ guide](../generation_strategies).
+
+
+
+ Parameters:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ The sequence used as a prompt for the generation.
+ beam_scorer (`BeamScorer`):
+ A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
+ sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
+ logits_processor (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+ used to modify the prediction scores of the language modeling head applied at each generation step.
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+ used to tell if the generation loop should stop.
+ logits_warper (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
+ to warp the prediction score distribution of the language modeling head applied before multinomial
+ sampling at each generation step.
+ max_length (`int`, *optional*, defaults to 20):
+ **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
+ tokens. The maximum length of the sequence to be generated.
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ eos_token_id (`Union[int, List[int]]`, *optional*):
+ The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more details.
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more details.
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ output_logits (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for
+ more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ synced_gpus (`bool`, *optional*, defaults to `False`):
+ Whether to continue running the while loop until max_length (needed for ZeRO stage 3).
+ lazy_mode (`bool`, *optional*, defaults to `False`):
+ Whether the run is executed in lazy mode or not (i.e. eager mode).
+ profiling_warmup_steps (`int`, *optional*, defaults to 0):
+ Number of steps to ignore for profiling.
+ profiling_steps (`int`, *optional*, defaults to 0):
+ Number of steps to be captured when enabling profiling.
+ model_kwargs:
+ Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
+ an encoder-decoder model the kwargs should include `encoder_outputs`.
+ + Return: + [`transformers.generation.GenerateBeamDecoderOnlyOutput`], [`transformers.generation.GenerateBeamEncoderDecoderOutput`] or + `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`transformers.generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`transformers.generation.GenerateBeamEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + + Examples: + + ```python + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForSeq2SeqLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... TopKLogitsWarper, + ... TemperatureLogitsWarper, + ... BeamSearchScorer, + ... ) + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + + >>> encoder_input_str = "translate English to German: How old are you?" + >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids + + >>> # lets run beam search using 3 beams + >>> num_beams = 3 + >>> # define decoder start token ids + >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) + >>> input_ids = input_ids * model.config.decoder_start_token_id + + >>> # add encoder_outputs to model keyword arguments + >>> model_kwargs = { + ... "encoder_outputs": model.get_encoder()( + ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True + ... ) + ... } + + >>> # instantiate beam scorer + >>> beam_scorer = BeamSearchScorer( + ... batch_size=1, + ... max_length=model.config.max_length, + ... num_beams=num_beams, + ... device=model.device, + ... ) + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList( + ... [MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id)] + ... ) + >>> # instantiate logits processors + >>> logits_warper = LogitsProcessorList( + ... [ + ... TopKLogitsWarper(50), + ... TemperatureLogitsWarper(0.7), + ... ] + ... ) + + >>> outputs = model.beam_sample( + ... input_ids, beam_scorer, logits_processor=logits_processor, logits_warper=logits_warper, **model_kwargs + ... ) + + >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) + ['Wie alt bist du?'] + ```""" + + raise NotImplementedError("Beam search sampling is not supported by optimum-habana yet.") + + def group_beam_search( + self, + input_ids: torch.LongTensor, + beam_scorer: BeamScorer, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: bool = False, + lazy_mode: Optional[bool] = False, + profiling_warmup_steps: Optional[int] = 0, + profiling_steps: Optional[int] = 0, + **model_kwargs, + ): + r""" + Generates sequences of token ids for models with a language modeling head using **diverse beam search + decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + + + In most cases, you do not need to call [`~generation.GenerationMixin.group_beam_search`] directly. Use + generate() instead. 
For an overview of generation strategies and code examples, check the [following
+ guide](../generation_strategies).
+
+
+
+ Parameters:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ The sequence used as a prompt for the generation.
+ beam_scorer (`BeamScorer`):
+ A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
+ sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
+ logits_processor (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+ used to modify the prediction scores of the language modeling head applied at each generation step.
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+ used to tell if the generation loop should stop.
+ max_length (`int`, *optional*, defaults to 20):
+ **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
+ tokens. The maximum length of the sequence to be generated.
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ eos_token_id (`Union[int, List[int]]`, *optional*):
+ The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more details.
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more details.
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ output_logits (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for
+ more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ synced_gpus (`bool`, *optional*, defaults to `False`):
+ Whether to continue running the while loop until max_length (needed for ZeRO stage 3).
+ lazy_mode (`bool`, *optional*, defaults to `False`):
+ Whether the run is executed in lazy mode or not (i.e. eager mode).
+ profiling_warmup_steps (`int`, *optional*, defaults to 0):
+ Number of steps to ignore for profiling.
+ profiling_steps (`int`, *optional*, defaults to 0):
+ Number of steps to be captured when enabling profiling.
+ model_kwargs:
+ Additional model specific kwargs that will be forwarded to the `forward` function of the model. If
+ model is an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+ Return:
+ [`transformers.generation.GenerateBeamDecoderOnlyOutput`], [`transformers.generation.GenerateBeamEncoderDecoderOutput`] or
+ `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+ [`transformers.generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+ `return_dict_in_generate=True` or a
+ [`transformers.generation.GenerateBeamEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`.
+ + Examples: + + ```python + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForSeq2SeqLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... HammingDiversityLogitsProcessor, + ... BeamSearchScorer, + ... ) + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + + >>> encoder_input_str = "translate English to German: How old are you?" + >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids + + >>> # lets run diverse beam search using 6 beams + >>> num_beams = 6 + >>> # define decoder start token ids + >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) + >>> input_ids = input_ids * model.config.decoder_start_token_id + + >>> # add encoder_outputs to model keyword arguments + >>> model_kwargs = { + ... "encoder_outputs": model.get_encoder()( + ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True + ... ) + ... } + + >>> # instantiate beam scorer + >>> beam_scorer = BeamSearchScorer( + ... batch_size=1, + ... max_length=model.config.max_length, + ... num_beams=num_beams, + ... device=model.device, + ... num_beam_groups=3, + ... ) + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList( + ... [ + ... HammingDiversityLogitsProcessor(5.5, num_beams=6, num_beam_groups=3), + ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), + ... ] + ... ) + + >>> outputs = model.group_beam_search( + ... input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs + ... ) + + >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) + ['Wie alt bist du?'] + ```""" + + raise NotImplementedError("Group beam search is not supported by optimum-habana yet.") + + def constrained_beam_search( + self, + input_ids: torch.LongTensor, + constrained_beam_scorer: ConstrainedBeamSearchScorer, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = None, + lazy_mode: Optional[bool] = False, + profiling_warmup_steps: Optional[int] = 0, + profiling_steps: Optional[int] = 0, + **model_kwargs, + ) -> Union[GenerateBeamOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **constrained beam search + decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + + + In most cases, you do not need to call [`~generation.GenerationMixin.constrained_beam_search`] directly. Use + generate() instead. For an overview of generation strategies and code examples, check the [following + guide](../generation_strategies). + + + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + constrained_beam_scorer (`ConstrainedBeamSearchScorer`): + A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and + sorted during generation, while satisfying a list of positive constraints. 
For more information, the
+ documentation of [`ConstrainedBeamSearchScorer`] should be read.
+ logits_processor (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+ used to modify the prediction scores of the language modeling head applied at each generation step.
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+ used to tell if the generation loop should stop.
+ max_length (`int`, *optional*, defaults to 20):
+ **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
+ tokens. The maximum length of the sequence to be generated.
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ eos_token_id (`Union[int, List[int]]`, *optional*):
+ The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more details.
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more details.
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ output_logits (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for
+ more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ synced_gpus (`bool`, *optional*, defaults to `False`):
+ Whether to continue running the while loop until max_length (needed for ZeRO stage 3).
+ lazy_mode (`bool`, *optional*, defaults to `False`):
+ Whether the run is executed in lazy mode or not (i.e. eager mode).
+ profiling_warmup_steps (`int`, *optional*, defaults to 0):
+ Number of steps to ignore for profiling.
+ profiling_steps (`int`, *optional*, defaults to 0):
+ Number of steps to be captured when enabling profiling.
+ model_kwargs:
+ Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
+ an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+ Return:
+ [`transformers.generation.GenerateBeamDecoderOnlyOutput`], [`transformers.generation.GenerateBeamEncoderDecoderOutput`] or
+ `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+ [`transformers.generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+ `return_dict_in_generate=True` or a [`transformers.generation.GenerateBeamEncoderDecoderOutput`] if
+ `model.config.is_encoder_decoder=True`.
+
+ Examples:
+
+ ```python
+ >>> from transformers import (
+ ... AutoTokenizer,
+ ... AutoModelForSeq2SeqLM,
+ ... LogitsProcessorList,
+ ... MinLengthLogitsProcessor,
+ ...
ConstrainedBeamSearchScorer, + ... PhrasalConstraint, + ... ) + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + + >>> encoder_input_str = "translate English to German: How old are you?" + >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids + + >>> # lets run beam search using 3 beams + >>> num_beams = 3 + >>> # define decoder start token ids + >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) + >>> input_ids = input_ids * model.config.decoder_start_token_id + + >>> # add encoder_outputs to model keyword arguments + >>> model_kwargs = { + ... "encoder_outputs": model.get_encoder()( + ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True + ... ) + ... } + + >>> constraint_str = "Sie" + >>> constraint_token_ids = tokenizer.encode(constraint_str)[:-1] # slice to remove eos token + >>> constraints = [PhrasalConstraint(token_ids=constraint_token_ids)] + + >>> # instantiate beam scorer + >>> beam_scorer = ConstrainedBeamSearchScorer( + ... batch_size=1, num_beams=num_beams, device=model.device, constraints=constraints + ... ) + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList( + ... [ + ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), + ... ] + ... ) + + >>> outputs = model.constrained_beam_search( + ... input_ids, beam_scorer, constraints=constraints, logits_processor=logits_processor, **model_kwargs + ... ) + + >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) + ['Wie alt sind Sie?'] + ```""" + + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + warnings.warn( + "`max_length` is deprecated in this function, use" + " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + if len(stopping_criteria) == 0: + warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning) + pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_logits = output_logits if output_logits is not None else self.generation_config.output_logits + output_attentions = ( + output_attentions if output_attentions is not None else self.generation_config.output_attentions + ) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate + if return_dict_in_generate is not None + else self.generation_config.return_dict_in_generate + ) + + batch_size = len(constrained_beam_scorer._beam_hyps) + num_beams = constrained_beam_scorer.num_beams + + batch_beam_size, cur_len = input_ids.shape + token_idx = model_kwargs.get("token_idx", None) + if token_idx is not None: + # Update cur_len in case of static shapes + cur_len = token_idx.item() + + if num_beams * batch_size != 
batch_beam_size: + raise ValueError( + f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + raw_logits = () if (return_dict_in_generate and output_logits) else None + beam_indices = ( + tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None + ) + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens + # of the first beam are considered to avoid sampling the exact same tokens across all beams. + beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view((batch_size * num_beams,)) + + this_peer_finished = False # used by synced_gpus only + decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder + + hb_profer = HabanaProfile(warmup=profiling_warmup_steps, active=profiling_steps) + hb_profer.start() + while True: + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) + + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + **hpu_graphs_kwargs, + ) + + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + + if token_idx is not None and outputs.logits.shape[-2] > 1: + next_token_logits = torch.index_select(outputs.logits, -2, token_idx - 1).squeeze(-2) + else: + next_token_logits = outputs.logits[:, -1, :] + + next_token_scores = torch.nn.functional.log_softmax( + next_token_logits, dim=-1 + ) # (batch_size * num_beams, vocab_size) + + next_token_scores_processed = logits_processor(input_ids, next_token_scores) + + next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as( + next_token_scores_processed + ) + + scores_for_all_vocab = next_token_scores.clone() + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores,) + if output_logits: + raw_logits += (next_token_logits,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # reshape for beam search + vocab_size = next_token_scores.shape[-1] + next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) + + # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam. 
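+ # `scores_for_all_vocab` (cloned above) is handed to `constrained_beam_scorer.process` below so the
+ # scorer can also look up scores of constraint tokens that do not appear among the top-k candidates.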
+ n_eos_tokens = len(eos_token_id) if eos_token_id else 0 + next_token_scores, next_tokens = torch.topk( + next_token_scores, max(2, 1 + n_eos_tokens) * num_beams, dim=1, largest=True, sorted=True + ) + + next_indices = (next_tokens / vocab_size).long() + next_tokens = next_tokens % vocab_size + + # stateless + beam_outputs = constrained_beam_scorer.process( + input_ids[:, :cur_len], + next_token_scores, + next_tokens, + next_indices, + scores_for_all_vocab, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + beam_indices=beam_indices, + decoder_prompt_len=decoder_prompt_len, + ) + beam_scores = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] + + if token_idx is not None: + input_ids = input_ids[beam_idx, :] + input_ids.index_copy_( + 1, token_idx, beam_next_tokens.unsqueeze(-1) if beam_next_tokens.dim() == 1 else beam_next_tokens + ) + else: + input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + if model_kwargs["past_key_values"] is not None: + model_kwargs["past_key_values"] = self._temporary_reorder_cache( + model_kwargs["past_key_values"], beam_idx + ) + + if return_dict_in_generate and output_scores: + beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) + + # increase cur_len + cur_len = cur_len + 1 + + hb_profer.step() + + if constrained_beam_scorer.is_done or stopping_criteria(input_ids, scores, token_idx=cur_len): + if not synced_gpus: + break + else: + this_peer_finished = True + + hb_profer.stop() + sequence_outputs = constrained_beam_scorer.finalize( + input_ids, + beam_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + max_length=stopping_criteria.max_length, + beam_indices=beam_indices, + decoder_prompt_len=decoder_prompt_len, + ) + + if return_dict_in_generate: + if not output_scores: + sequence_outputs["sequence_scores"] = None + if self.config.is_encoder_decoder: + return GenerateBeamEncoderDecoderOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + logits=raw_logits, + beam_indices=sequence_outputs["beam_indices"], + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return GenerateBeamDecoderOnlyOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + logits=raw_logits, + beam_indices=sequence_outputs["beam_indices"], + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return sequence_outputs["sequences"] + + def assisted_decoding( + self, + input_ids: torch.LongTensor, + assistant_model: Optional["PreTrainedModel"] = None, + candidate_generator: Optional["CandidateGenerator"] = None, + do_sample: bool = False, + logits_processor: Optional[LogitsProcessorList] = None, + logits_warper: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, 
List[int]]] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_scores: Optional[bool] = None,
+ output_logits: Optional[bool] = None,
+ return_dict_in_generate: Optional[bool] = None,
+ synced_gpus: bool = False,
+ lazy_mode: Optional[bool] = False,
+ profiling_warmup_steps: Optional[int] = 0,
+ profiling_steps: Optional[int] = 0,
+ streamer: Optional["BaseStreamer"] = None,
+ **model_kwargs,
+ ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
+ r"""
+ Generates sequences of token ids for models with a language modeling head using **greedy decoding** or
+ **sample** (depending on `do_sample`), assisted by candidate sequences. Assisted generation is an example of a
+ candidate decoding strategy. Can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text
+ models.
+
+
+
+ In most cases, you do not need to call [`transformers.generation.GenerationMixin.assisted_decoding`] directly. Use
+ generate() instead. For an overview of generation strategies and code examples, check the [following
+ guide](../generation_strategies).
+
+
+
+ Parameters:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ The sequence used as a prompt for the generation.
+ candidate_generator (`CandidateGenerator`, *optional*):
+ A derived instance of [`CandidateGenerator`] that defines how candidate sequences are generated. For
+ more information, the documentation of [`CandidateGenerator`] should be read. Only one of `assistant_model` or `candidate_generator` should be passed as input to this function.
+ assistant_model (`PreTrainedModel`, *optional*):
+ An assistant model that can be used to accelerate generation. The assistant model must have the exact
+ same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model
+ is much faster than running generation with the model you're calling generate from. As such, the
+ assistant model should be much smaller.
+ do_sample (`bool`, *optional*, defaults to `False`):
+ Whether or not to use sampling; use greedy decoding otherwise.
+ logits_processor (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+ used to modify the prediction scores of the language modeling head applied at each generation step.
+ logits_warper (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
+ to warp the prediction score distribution of the language modeling head applied before multinomial
+ sampling at each generation step.
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+ used to tell if the generation loop should stop.
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ eos_token_id (`Union[int, List[int]]`, *optional*):
+ The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more details.
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more details.
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ output_logits (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for
+ more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ synced_gpus (`bool`, *optional*, defaults to `False`):
+ Whether to continue running the while loop until max_length (needed for ZeRO stage 3).
+ lazy_mode (`bool`, *optional*, defaults to `False`):
+ Whether the run is executed in lazy mode or not (i.e. eager mode).
+ profiling_warmup_steps (`int`, *optional*, defaults to 0):
+ Number of steps to ignore for profiling.
+ profiling_steps (`int`, *optional*, defaults to 0):
+ Number of steps to be captured when enabling profiling.
+ streamer (`BaseStreamer`, *optional*):
+ Streamer object that will be used to stream the generated sequences. Generated tokens are passed
+ through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
+ model_kwargs:
+ Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
+ If model is an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+ Return:
+ [`transformers.generation.GenerateDecoderOnlyOutput`], [`transformers.generation.GenerateEncoderDecoderOutput`] or
+ `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+ [`transformers.generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+ `return_dict_in_generate=True` or a [`transformers.generation.GenerateEncoderDecoderOutput`] if
+ `model.config.is_encoder_decoder=True`.
+
+ Examples:
+
+ ```python
+ >>> from transformers import (
+ ... AutoTokenizer,
+ ... AutoModelForCausalLM,
+ ... LogitsProcessorList,
+ ... MinLengthLogitsProcessor,
+ ... StoppingCriteriaList,
+ ... MaxLengthCriteria,
+ ... )
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+ >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+ >>> assistant_model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+ >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
+ >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id
+ >>> input_prompt = "It might be possible to"
+ >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
+ >>> # instantiate logits processors
+ >>> logits_processor = LogitsProcessorList(
+ ... [
+ ... MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id),
+ ... ]
+ ... )
+ >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])
+ >>> outputs = model.assisted_decoding(
+ ... input_ids,
+ ... assistant_model=assistant_model,
+ ... logits_processor=logits_processor,
+ ... stopping_criteria=stopping_criteria,
+ ...
) + >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) + ["It might be possible to get a better understanding of the nature of the problem, but it's not"] + ```""" + raise NotImplementedError("Assisted decoding is not supported by optimum-habana yet.") diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/modeling_utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/modeling_utils.py new file mode 100644 index 00000000000..9d4e473aab3 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/modeling_utils.py @@ -0,0 +1,365 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import transformers + +from .generation import ( + GaudiGenerationConfig, + GaudiGenerationMixin, + gaudi_MaxLengthCriteria_call, + gaudi_MaxNewTokensCriteria_call, +) +from .models import ( + GaudiBloomForCausalLM, + GaudiBloomMLP, + GaudiCodeGenAttention, + GaudiCodeGenForCausalLM, + GaudiFalconForCausalLM, + GaudiFalconModel, + GaudiGPT2Attention, + GaudiGPT2LMHeadModel, + GaudiGPTBigCodeForCausalLM, + GaudiGPTJAttention, + GaudiGPTJForCausalLM, + GaudiGPTNeoXForCausalLM, + GaudiLlamaAttention, + GaudiLlamaDecoderLayer, + GaudiLlamaDynamicNTKScalingRotaryEmbedding, + GaudiLlamaForCausalLM, + GaudiLlamaLinearScalingRotaryEmbedding, + GaudiLlamaMLP, + GaudiLlamaModel, + GaudiLlamaRotaryEmbedding, + GaudiMistralAttention, + GaudiMistralDecoderLayer, + GaudiMistralForCausalLM, + GaudiMistralModel, + GaudiMixtralForCausalLM, + GaudiMptForCausalLM, + GaudiMptModel, + GaudiOPTForCausalLM, + GaudiOPTLearnedPositionalEmbedding, + GaudiPhiForCausalLM, + _gaudi_wav2vec2_compute_mask_indices, + _gaudi_wav2vec2_mask_hidden_states, + gaudi_albert_forward, + gaudi_BartAttention_forward, + gaudi_BartDecoder_forward, + gaudi_BartDecoderLayer_forward, + gaudi_BartEncoder_forward, + gaudi_BartEncoderLayer_forward, + gaudi_BartForConditionalGeneration_forward, + gaudi_BartForConditionalGeneration_prepare_inputs_for_generation, + gaudi_BartLearnedPositionalEmbedding, + gaudi_BartModel_forward, + gaudi_BlipForConditionalGeneration_generate, + gaudi_BlipForQuestionAnswering_generate, + gaudi_BlipTextAttention_forward, + gaudi_BlipTextEncoder_forward, + gaudi_BlipTextLayer_forward, + gaudi_BlipTextLMHead_forward, + gaudi_BlipTextLMHead_prepare_inputs_for_generation, + gaudi_BlipTextModel_forward, + gaudi_BlipTextSelfAttention_forward, + gaudi_bloom_attention_forward, + gaudi_bloom_block_forward, + gaudi_bloom_convert_to_bloom_cache, + gaudi_bloom_convert_to_standard_cache, + gaudi_bloom_model_forward, + gaudi_check_and_enable_sdpa, + gaudi_codegen_block_forward, + gaudi_codegen_model_forward, + gaudi_conv1d_forward, + gaudi_esm_for_protein_folding_forward, + gaudi_esmfolding_trunk_forward, + gaudi_falcon_attention_forward, + gaudi_falcon_attention_split_heads, + gaudi_falcon_decoder_layer_forward, + gaudi_generate_speech, + gaudi_get_extended_attention_mask, + 
gaudi_gpt2_block_forward, + gaudi_gpt2_forward, + gaudi_gpt_bigcode_attention_forward, + gaudi_gpt_bigcode_block_forward, + gaudi_gpt_bigcode_model_forward, + gaudi_gpt_neox_attention_forward, + gaudi_gpt_neox_layer_forward, + gaudi_gpt_neox_model_forward, + gaudi_gpt_neox_rotary_embedding_set_cos_sin_cache, + gaudi_gptj_block_forward, + gaudi_gptj_model_forward, + gaudi_invert_attention_mask, + gaudi_llama_rmsnorm_forward, + gaudi_mistral_rmsnorm_forward, + gaudi_mixtral_attention_forward, + gaudi_mixtral_block_sparse_moe_forward, + gaudi_mixtral_decoder_layer_forward, + gaudi_mixtral_model_forward, + gaudi_mixtral_rmsnorm_forward, + gaudi_mpt_attention_forward, + gaudi_mpt_block_forward, + gaudi_opt_attention_forward, + gaudi_opt_decoder_forward, + gaudi_opt_decoder_layer_forward, + gaudi_opt_model_forward, + gaudi_phi_attention_forward, + gaudi_phi_decoder_layer_forward, + gaudi_phi_model_forward, + gaudi_rot_matmul, + gaudi_rot_vec_mul, + gaudi_SpeechT5Attention_forward, + gaudi_SpeechT5Decoder_forward, + gaudi_SpeechT5DecoderLayer_forward, + gaudi_SpeechT5SpeechDecoderPrenet_forward, + gaudi_swin_get_attn_mask, + gaudi_t5_layernorm_forward, + gaudi_T5Attention_forward, + gaudi_T5Block_forward, + gaudi_T5ForConditionalGeneration_forward, + gaudi_T5ForConditionalGeneration_prepare_inputs_for_generation, + gaudi_T5LayerSelfAttention_forward, + gaudi_T5Stack_forward, + gaudi_vit_self_attention_forward, + gaudi_wav2vec2_encoder_forward, + gaudi_wav2vec2_forward, + gaudi_wav2vec2_tdnnlayer_forward, +) + + +def adapt_transformers_to_gaudi(): + """ + Replaces some Transformers' methods for equivalent methods optimized + for Gaudi. + """ + + # optimize Conv1D + transformers.pytorch_utils.Conv1D.forward = gaudi_conv1d_forward + + # Optimization tweak for ViT + transformers.models.vit.modeling_vit.ViTSelfAttention.forward = gaudi_vit_self_attention_forward + + # Optimization tweak for Swin + transformers.models.swin.modeling_swin.SwinLayer.get_attn_mask = gaudi_swin_get_attn_mask + + # Optimization tweak for Wav2Vec2 + transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices = _gaudi_wav2vec2_compute_mask_indices + # transformers.models.wav2vec2.modeling_wav2vec2._sample_negative_indices = _gaudi_wav2vec2_sample_negative_indices + transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states = ( + _gaudi_wav2vec2_mask_hidden_states + ) + transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model.forward = gaudi_wav2vec2_forward + transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Encoder.forward = gaudi_wav2vec2_encoder_forward + transformers.models.wav2vec2.modeling_wav2vec2.TDNNLayer.forward = gaudi_wav2vec2_tdnnlayer_forward + + # Generation is modified to run faster in lazy mode + transformers.generation.GenerationMixin.generate = GaudiGenerationMixin.generate + transformers.generation.GenerationMixin._update_model_kwargs_for_generation = ( + GaudiGenerationMixin._update_model_kwargs_for_generation + ) + transformers.generation.GenerationMixin.update_model_kwargs_for_bucketing = ( + GaudiGenerationMixin.update_model_kwargs_for_bucketing + ) + transformers.generation.GenerationMixin._get_hpu_graphs_kwargs = GaudiGenerationMixin._get_hpu_graphs_kwargs + transformers.generation.GenerationMixin._expand_inputs_for_generation = staticmethod( + GaudiGenerationMixin._expand_inputs_for_generation + ) + transformers.generation.GenerationMixin._prepare_attention_mask_for_generation = ( + GaudiGenerationMixin._prepare_attention_mask_for_generation + ) + 
transformers.generation.GenerationMixin._prepare_decoder_input_ids_for_generation = ( + GaudiGenerationMixin._prepare_decoder_input_ids_for_generation + ) + transformers.generation.GenerationMixin._prepare_decoder_attention_mask = ( + GaudiGenerationMixin._prepare_decoder_attention_mask + ) + transformers.generation.GenerationMixin._validate_model_kwargs = GaudiGenerationMixin._validate_model_kwargs + transformers.generation.GenerationMixin.greedy_search = GaudiGenerationMixin.greedy_search + transformers.generation.GenerationMixin.sample = GaudiGenerationMixin.sample + transformers.generation.GenerationMixin.beam_search = GaudiGenerationMixin.beam_search + transformers.generation.GenerationMixin.beam_sample = GaudiGenerationMixin.beam_sample + transformers.generation.GenerationMixin.group_beam_search = GaudiGenerationMixin.group_beam_search + transformers.generation.GenerationMixin.constrained_beam_search = GaudiGenerationMixin.constrained_beam_search + transformers.generation.GenerationConfig = GaudiGenerationConfig + transformers.modeling_utils.GenerationConfig = GaudiGenerationConfig + transformers.generation.MaxLengthCriteria.__call__ = gaudi_MaxLengthCriteria_call + transformers.generation.MaxNewTokensCriteria.__call__ = gaudi_MaxNewTokensCriteria_call + + # Optimization for BLOOM generation on Gaudi + transformers.models.bloom.modeling_bloom.BloomAttention.forward = gaudi_bloom_attention_forward + transformers.models.bloom.modeling_bloom.BloomBlock.forward = gaudi_bloom_block_forward + transformers.models.bloom.modeling_bloom.BloomModel.forward = gaudi_bloom_model_forward + transformers.models.bloom.modeling_bloom.BloomMLP = GaudiBloomMLP + transformers.models.bloom.modeling_bloom.BloomForCausalLM = GaudiBloomForCausalLM + transformers.models.bloom.modeling_bloom.BloomPreTrainedModel._convert_to_standard_cache = ( + gaudi_bloom_convert_to_standard_cache + ) + transformers.models.bloom.modeling_bloom.BloomPreTrainedModel._convert_to_bloom_cache = ( + gaudi_bloom_convert_to_bloom_cache + ) + + # Optimization for BART generation on Gaudi + transformers.models.bart.modeling_bart.BartLearnedPositionalEmbedding = gaudi_BartLearnedPositionalEmbedding + transformers.models.bart.modeling_bart.BartAttention.forward = gaudi_BartAttention_forward + transformers.models.bart.modeling_bart.BartEncoderLayer.forward = gaudi_BartEncoderLayer_forward + transformers.models.bart.modeling_bart.BartDecoderLayer.forward = gaudi_BartDecoderLayer_forward + transformers.models.bart.modeling_bart.BartEncoder.forward = gaudi_BartEncoder_forward + transformers.models.bart.modeling_bart.BartDecoder.forward = gaudi_BartDecoder_forward + transformers.models.bart.modeling_bart.BartModel.forward = gaudi_BartModel_forward + transformers.models.bart.modeling_bart.BartForConditionalGeneration.forward = ( + gaudi_BartForConditionalGeneration_forward + ) + transformers.models.bart.modeling_bart.BartForConditionalGeneration.prepare_inputs_for_generation = ( + gaudi_BartForConditionalGeneration_prepare_inputs_for_generation + ) + + # Optimization for codegen generation on Gaudi + transformers.models.codegen.modeling_codegen.CodeGenAttention = GaudiCodeGenAttention + transformers.models.codegen.modeling_codegen.CodeGenForCausalLM = GaudiCodeGenForCausalLM + transformers.models.codegen.modeling_codegen.CodeGenModel.forward = gaudi_codegen_model_forward + transformers.models.codegen.modeling_codegen.CodeGenBlock.forward = gaudi_codegen_block_forward + + # Replace invert_attention_mask and get_extended_attention_mask + # so 
that Torch Autocast is disabled for specific parts of the code + transformers.modeling_utils.ModuleUtilsMixin.invert_attention_mask = gaudi_invert_attention_mask + transformers.modeling_utils.ModuleUtilsMixin.get_extended_attention_mask = gaudi_get_extended_attention_mask + + # Override sdpa check on Gaudi + transformers.modeling_utils.PreTrainedModel._check_and_enable_sdpa = gaudi_check_and_enable_sdpa + + # AlbertModel.forward does not rely on get_extended_attention_mask so it also needs to be replaced + transformers.models.albert.modeling_albert.AlbertModel.forward = gaudi_albert_forward + + # Optimization for GPT2 on Gaudi + transformers.models.gpt2.modeling_gpt2.GPT2Attention = GaudiGPT2Attention + transformers.models.gpt2.modeling_gpt2.GPT2Model.forward = gaudi_gpt2_forward + transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel = GaudiGPT2LMHeadModel + transformers.models.gpt2.modeling_gpt2.GPT2Block.forward = gaudi_gpt2_block_forward + + # Optimization for EsmFold on Gaudi + transformers.models.esm.modeling_esmfold.EsmFoldingTrunk.forward = gaudi_esmfolding_trunk_forward + transformers.models.esm.modeling_esmfold.EsmForProteinFolding.forward = gaudi_esm_for_protein_folding_forward + transformers.models.esm.openfold_utils.rigid_utils.rot_matmul = gaudi_rot_matmul + transformers.models.esm.openfold_utils.rigid_utils.rot_vec_mul = gaudi_rot_vec_mul + + # Optimization for OPT generation on Gaudi + transformers.models.opt.modeling_opt.OPTAttention.forward = gaudi_opt_attention_forward + transformers.models.opt.modeling_opt.OPTDecoder.forward = gaudi_opt_decoder_forward + transformers.models.opt.modeling_opt.OPTForCausalLM = GaudiOPTForCausalLM + transformers.models.opt.modeling_opt.OPTModel.forward = gaudi_opt_model_forward + transformers.models.opt.modeling_opt.OPTDecoderLayer.forward = gaudi_opt_decoder_layer_forward + transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding = GaudiOPTLearnedPositionalEmbedding + + # Optimization for GPTJ on Gaudi + transformers.models.gptj.modeling_gptj.GPTJAttention = GaudiGPTJAttention + transformers.models.gptj.modeling_gptj.GPTJForCausalLM = GaudiGPTJForCausalLM + transformers.models.gptj.modeling_gptj.GPTJBlock.forward = gaudi_gptj_block_forward + transformers.models.gptj.modeling_gptj.GPTJModel.forward = gaudi_gptj_model_forward + + # Optimization for GPTBigCode on Gaudi + transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeAttention.forward = ( + gaudi_gpt_bigcode_attention_forward + ) + transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM = GaudiGPTBigCodeForCausalLM + transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeBlock.forward = gaudi_gpt_bigcode_block_forward + transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeModel.forward = gaudi_gpt_bigcode_model_forward + + # Optimization for gpt-neox generation on Gaudi + transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForCausalLM = GaudiGPTNeoXForCausalLM + transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXModel.forward = gaudi_gpt_neox_model_forward + transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXLayer.forward = gaudi_gpt_neox_layer_forward + transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXAttention.forward = gaudi_gpt_neox_attention_forward + transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXRotaryEmbedding._set_cos_sin_cache = ( + gaudi_gpt_neox_rotary_embedding_set_cos_sin_cache + ) + + # Optimization for llama generation on Gaudi + transformers.models.llama.modeling_llama.LlamaForCausalLM = 
GaudiLlamaForCausalLM + transformers.models.llama.modeling_llama.LlamaModel = GaudiLlamaModel + transformers.models.llama.modeling_llama.LlamaAttention = GaudiLlamaAttention + transformers.models.llama.modeling_llama.LlamaMLP = GaudiLlamaMLP + transformers.models.llama.modeling_llama.LlamaDecoderLayer = GaudiLlamaDecoderLayer + transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = GaudiLlamaRotaryEmbedding + transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding = GaudiLlamaLinearScalingRotaryEmbedding + transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding = ( + GaudiLlamaDynamicNTKScalingRotaryEmbedding + ) + transformers.models.llama.modeling_llama.LlamaRMSNorm.forward = gaudi_llama_rmsnorm_forward + + # Optimization for falcon generation on Gaudi + transformers.models.falcon.modeling_falcon.FalconForCausalLM = GaudiFalconForCausalLM + transformers.models.falcon.modeling_falcon.FalconModel = GaudiFalconModel + transformers.models.falcon.modeling_falcon.FalconDecoderLayer.forward = gaudi_falcon_decoder_layer_forward + transformers.models.falcon.modeling_falcon.FalconAttention.forward = gaudi_falcon_attention_forward + transformers.models.falcon.modeling_falcon.FalconAttention._split_heads = gaudi_falcon_attention_split_heads + + # Optimization for t5 on Gaudi + transformers.models.t5.modeling_t5.T5LayerNorm.forward = gaudi_t5_layernorm_forward + transformers.models.t5.modeling_t5.T5Stack.forward = gaudi_T5Stack_forward + transformers.models.t5.modeling_t5.T5LayerSelfAttention.forward = gaudi_T5LayerSelfAttention_forward + transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward = gaudi_T5ForConditionalGeneration_forward + transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation = ( + gaudi_T5ForConditionalGeneration_prepare_inputs_for_generation + ) + transformers.models.t5.modeling_t5.T5Attention.forward = gaudi_T5Attention_forward + transformers.models.t5.modeling_t5.T5Block.forward = gaudi_T5Block_forward + + # Optimization for mpt on Gaudi + transformers.models.mpt.modeling_mpt.MptForCausalLM = GaudiMptForCausalLM + transformers.models.mpt.modeling_mpt.MptModel = GaudiMptModel + transformers.models.mpt.modeling_mpt.MptAttention.forward = gaudi_mpt_attention_forward + transformers.models.mpt.modeling_mpt.MptBlock.forward = gaudi_mpt_block_forward + + # Optimization for mistral on Gaudi + transformers.models.mistral.modeling_mistral.MistralForCausalLM = GaudiMistralForCausalLM + transformers.models.mistral.modeling_mistral.MistralAttention = GaudiMistralAttention + transformers.models.mistral.modeling_mistral.MistralDecoderLayer = GaudiMistralDecoderLayer + transformers.models.mistral.modeling_mistral.MistralModel = GaudiMistralModel + transformers.models.mistral.modeling_mistral.MistralRMSNorm.forward = gaudi_mistral_rmsnorm_forward + + # Optimization for phi on Gaudi + transformers.models.phi.modeling_phi.PhiForCausalLM = GaudiPhiForCausalLM + transformers.models.phi.modeling_phi.PhiAttention.forward = gaudi_phi_attention_forward + transformers.models.phi.modeling_phi.PhiDecoderLayer.forward = gaudi_phi_decoder_layer_forward + transformers.models.phi.modeling_phi.PhiModel.forward = gaudi_phi_model_forward + + # Optimization for blip Text model on Gaudi + transformers.models.blip.BlipTextModel.forward = gaudi_BlipTextModel_forward + transformers.models.blip.modeling_blip_text.BlipTextLMHeadModel.forward = gaudi_BlipTextLMHead_forward + 
transformers.models.blip.modeling_blip_text.BlipTextLMHeadModel.prepare_inputs_for_generation = ( + gaudi_BlipTextLMHead_prepare_inputs_for_generation + ) + transformers.models.blip.modeling_blip_text.BlipTextEncoder.forward = gaudi_BlipTextEncoder_forward + transformers.models.blip.modeling_blip_text.BlipTextLayer.forward = gaudi_BlipTextLayer_forward + transformers.models.blip.modeling_blip_text.BlipTextAttention.forward = gaudi_BlipTextAttention_forward + transformers.models.blip.modeling_blip_text.BlipTextSelfAttention.forward = gaudi_BlipTextSelfAttention_forward + transformers.models.blip.BlipForQuestionAnswering.generate = gaudi_BlipForQuestionAnswering_generate + transformers.models.blip.BlipForConditionalGeneration.generate = gaudi_BlipForConditionalGeneration_generate + + # Optimization for mixtral on Gaudi + transformers.models.mixtral.modeling_mixtral.MixtralForCausalLM = GaudiMixtralForCausalLM + transformers.models.mixtral.modeling_mixtral.MixtralModel.forward = gaudi_mixtral_model_forward + transformers.models.mixtral.modeling_mixtral.MixtralAttention.forward = gaudi_mixtral_attention_forward + transformers.models.mixtral.modeling_mixtral.MixtralSparseMoeBlock.forward = gaudi_mixtral_block_sparse_moe_forward + transformers.models.mixtral.modeling_mixtral.MixtralDecoderLayer.forward = gaudi_mixtral_decoder_layer_forward + transformers.models.mixtral.modeling_mixtral.MixtralRMSNorm.forward = gaudi_mixtral_rmsnorm_forward + + # Optimization for speecht5 on Gaudi + transformers.models.speecht5.modeling_speecht5.SpeechT5Decoder.forward = gaudi_SpeechT5Decoder_forward + transformers.models.speecht5.modeling_speecht5.SpeechT5DecoderLayer.forward = gaudi_SpeechT5DecoderLayer_forward + transformers.models.speecht5.modeling_speecht5.SpeechT5Attention.forward = gaudi_SpeechT5Attention_forward + transformers.models.speecht5.modeling_speecht5._generate_speech = gaudi_generate_speech + transformers.models.speecht5.modeling_speecht5.SpeechT5SpeechDecoderPrenet.forward = ( + gaudi_SpeechT5SpeechDecoderPrenet_forward + ) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/__init__.py new file mode 100644 index 00000000000..d0eb8b2dcd6 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/__init__.py @@ -0,0 +1,149 @@ +from .albert import gaudi_albert_forward +from .bart import ( + gaudi_BartAttention_forward, + gaudi_BartDecoder_forward, + gaudi_BartDecoderLayer_forward, + gaudi_BartEncoder_forward, + gaudi_BartEncoderLayer_forward, + gaudi_BartForConditionalGeneration_forward, + gaudi_BartForConditionalGeneration_prepare_inputs_for_generation, + gaudi_BartLearnedPositionalEmbedding, + gaudi_BartModel_forward, +) +from .blip import ( + gaudi_BlipForConditionalGeneration_generate, + gaudi_BlipForQuestionAnswering_generate, + gaudi_BlipTextAttention_forward, + gaudi_BlipTextEncoder_forward, + gaudi_BlipTextLayer_forward, + gaudi_BlipTextLMHead_forward, + gaudi_BlipTextLMHead_prepare_inputs_for_generation, + gaudi_BlipTextModel_forward, + gaudi_BlipTextSelfAttention_forward, +) +from .bloom import ( + GaudiBloomForCausalLM, + GaudiBloomMLP, + gaudi_bloom_attention_forward, + gaudi_bloom_block_forward, + gaudi_bloom_convert_to_bloom_cache, + gaudi_bloom_convert_to_standard_cache, + gaudi_bloom_model_forward, +) +from .codegen import ( + GaudiCodeGenAttention, + GaudiCodeGenForCausalLM, + gaudi_codegen_block_forward, + 
gaudi_codegen_model_forward, +) +from .esm import ( + gaudi_esm_for_protein_folding_forward, + gaudi_esmfolding_trunk_forward, + gaudi_rot_matmul, + gaudi_rot_vec_mul, +) +from .falcon import ( + GaudiFalconForCausalLM, + GaudiFalconModel, + gaudi_falcon_attention_forward, + gaudi_falcon_attention_split_heads, + gaudi_falcon_decoder_layer_forward, +) +from .gpt2 import GaudiGPT2Attention, GaudiGPT2LMHeadModel, gaudi_gpt2_block_forward, gaudi_gpt2_forward +from .gpt_bigcode import ( + GaudiGPTBigCodeForCausalLM, + gaudi_gpt_bigcode_attention_forward, + gaudi_gpt_bigcode_block_forward, + gaudi_gpt_bigcode_model_forward, +) +from .gpt_neox import ( + GaudiGPTNeoXForCausalLM, + gaudi_gpt_neox_attention_forward, + gaudi_gpt_neox_layer_forward, + gaudi_gpt_neox_model_forward, + gaudi_gpt_neox_rotary_embedding_set_cos_sin_cache, +) +from .gptj import ( + GaudiGPTJAttention, + GaudiGPTJForCausalLM, + gaudi_gptj_block_forward, + gaudi_gptj_model_forward, +) +from .llama import ( + GaudiLlamaAttention, + GaudiLlamaDecoderLayer, + GaudiLlamaDynamicNTKScalingRotaryEmbedding, + GaudiLlamaForCausalLM, + GaudiLlamaLinearScalingRotaryEmbedding, + GaudiLlamaMLP, + GaudiLlamaModel, + GaudiLlamaRotaryEmbedding, + gaudi_llama_rmsnorm_forward, +) +from .mistral import ( + GaudiMistralAttention, + GaudiMistralDecoderLayer, + GaudiMistralForCausalLM, + GaudiMistralModel, + gaudi_mistral_rmsnorm_forward, +) +from .mixtral import ( + GaudiMixtralForCausalLM, + gaudi_mixtral_attention_forward, + gaudi_mixtral_block_sparse_moe_forward, + gaudi_mixtral_decoder_layer_forward, + gaudi_mixtral_model_forward, + gaudi_mixtral_rmsnorm_forward, +) +from .modeling_all_models import ( + gaudi_check_and_enable_sdpa, + gaudi_conv1d_forward, + gaudi_get_extended_attention_mask, + gaudi_invert_attention_mask, +) +from .mpt import ( + GaudiMptForCausalLM, + GaudiMptModel, + gaudi_mpt_attention_forward, + gaudi_mpt_block_forward, +) +from .opt import ( + GaudiOPTForCausalLM, + GaudiOPTLearnedPositionalEmbedding, + gaudi_opt_attention_forward, + gaudi_opt_decoder_forward, + gaudi_opt_decoder_layer_forward, + gaudi_opt_model_forward, +) +from .phi import ( + GaudiPhiForCausalLM, + gaudi_phi_attention_forward, + gaudi_phi_decoder_layer_forward, + gaudi_phi_model_forward, +) +from .speecht5 import ( + gaudi_generate_speech, + gaudi_SpeechT5Attention_forward, + gaudi_SpeechT5Decoder_forward, + gaudi_SpeechT5DecoderLayer_forward, + gaudi_SpeechT5SpeechDecoderPrenet_forward, +) +from .swin import gaudi_swin_get_attn_mask +from .t5 import ( + gaudi_t5_layernorm_forward, + gaudi_T5Attention_forward, + gaudi_T5Block_forward, + gaudi_T5ForConditionalGeneration_forward, + gaudi_T5ForConditionalGeneration_prepare_inputs_for_generation, + gaudi_T5LayerSelfAttention_forward, + gaudi_T5Stack_forward, +) +from .vit import gaudi_vit_self_attention_forward +from .wav2vec2 import ( + _gaudi_wav2vec2_compute_mask_indices, + _gaudi_wav2vec2_mask_hidden_states, + _gaudi_wav2vec2_sample_negative_indices, + gaudi_wav2vec2_encoder_forward, + gaudi_wav2vec2_forward, + gaudi_wav2vec2_tdnnlayer_forward, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/albert/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/albert/__init__.py new file mode 100644 index 00000000000..fccea02a857 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/albert/__init__.py @@ -0,0 +1 @@ +from .modeling_albert import gaudi_albert_forward diff --git 
a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/albert/modeling_albert.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/albert/modeling_albert.py new file mode 100644 index 00000000000..6ac9b80073b --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/albert/modeling_albert.py @@ -0,0 +1,101 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple, Union + +import torch +from transformers.modeling_outputs import BaseModelOutputWithPooling + + +def gaudi_albert_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[BaseModelOutputWithPooling, Tuple]: + """ + Same as https://github.com/huggingface/transformers/blob/a9eee2ffecc874df7dd635b2c6abb246fdb318cc/src/transformers/models/albert/modeling_albert.py#L689 + except that mixed precision is disabled for computing: + extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + # torch.finfo must take the dtype of encoder_extended_attention_mask 
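+ # Autocast is disabled in the block below so that the multiplication by torch.finfo(...).min is computed in the mask's own dtype rather than under mixed precision; this is the change called out in the function docstring above.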
+ extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # bf16 compatibility + extended_attention_mask = 1.0 - extended_attention_mask + with torch.autocast(enabled=False, device_type="hpu"): + extended_attention_mask = extended_attention_mask * torch.finfo(extended_attention_mask.dtype).min + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + encoder_outputs = self.encoder( + embedding_output, + extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + + pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/__init__.py new file mode 100644 index 00000000000..c8148194d83 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/__init__.py @@ -0,0 +1,11 @@ +from .modeling_bart import ( + gaudi_BartAttention_forward, + gaudi_BartDecoder_forward, + gaudi_BartDecoderLayer_forward, + gaudi_BartEncoder_forward, + gaudi_BartEncoderLayer_forward, + gaudi_BartForConditionalGeneration_forward, + gaudi_BartForConditionalGeneration_prepare_inputs_for_generation, + gaudi_BartLearnedPositionalEmbedding, + gaudi_BartModel_forward, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/modeling_bart.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/modeling_bart.py new file mode 100644 index 00000000000..4c66c2cc954 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/modeling_bart.py @@ -0,0 +1,801 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch BART model.""" + +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.modeling_attn_mask_utils import ( + _prepare_4d_attention_mask, + _prepare_4d_attention_mask_for_sdpa, + _prepare_4d_causal_attention_mask_for_sdpa, +) +from transformers.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, +) +from transformers.models.bart.modeling_bart import shift_tokens_right +from transformers.utils import logging + +from ..modeling_attn_mask_utils import ( + _gaudi_prepare_4d_causal_attention_mask, +) + + +logger = logging.get_logger(__name__) + + +# Copied from modeling_bart.py: https://raw.githubusercontent.com/huggingface/transformers/648d0deb1dd28a5d9956e63d8cf8c18f96a6a2aa/src/transformers/models/bart/modeling_bart.py +# The difference is: modified dynamic shapes to static shapes with `mark_step` for performance improvement. + + +class gaudi_BartLearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int): + # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. Other models don't have this hack + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim) + + def forward(self, input_ids: torch.Tensor, past_key_values_length: torch.Tensor = torch.tensor(0)): + """`input_ids' shape is expected to be [bsz x seqlen].""" + + bsz, seq_len = input_ids.shape[:2] + positions = torch.arange(0, seq_len, dtype=torch.long, device=self.weight.device).expand(bsz, -1) + positions += past_key_values_length + + return super().forward(positions + self.offset) + + +def gaudi_BartAttention_forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + token_idx: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if is_cross_attention and past_key_value is not None and past_key_value[0].shape[2] == key_value_states.shape[1]: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + present_key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + present_value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + if token_idx is not None: + # HPU bug WA + 
key_states, value_states = past_key_value + key_states.index_add_( + 2, token_idx - 1, present_key_states - torch.index_select(key_states, 2, token_idx - 1) + ) + value_states.index_add_( + 2, token_idx - 1, present_value_states - torch.index_select(value_states, 2, token_idx - 1) + ) + else: + key_states = torch.cat([past_key_value[0], present_key_states], dim=2) + value_states = torch.cat([past_key_value[1], present_value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.reshape(*proj_shape) + value_states = value_states.reshape(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +def gaudi_BartEncoderLayer_forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: torch.FloatTensor, + layer_head_mask: torch.FloatTensor, + output_attentions: Optional[bool] = False, + token_idx: Optional[torch.Tensor] = None, +) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: + residual = hidden_states + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + token_idx=token_idx, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +def gaudi_BartDecoderLayer_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + token_idx: Optional[torch.Tensor] = None, +) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + 
layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + token_idx=token_idx, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + if use_cache: + cross_attn_past_key_value = ( + past_key_value[-2:] if (past_key_value is not None and past_key_value[-2:] != (None, None)) else None + ) + else: + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + token_idx=token_idx, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +def gaudi_BartEncoder_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, +) -> Union[Tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input = input_ids + input_ids = input_ids.view(-1, input_ids.shape[-1]) + elif inputs_embeds is not None: + input = inputs_embeds[:, :, -1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(input) + import habana_frameworks.torch.core as htcore + + htcore.mark_step() + 
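+ # mark_step() triggers execution of the lazy-mode graph accumulated so far; these breaks around the positional-embedding lookup correspond to the `mark_step`-based static-shape change noted in the comment at the top of this file.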
embed_pos = embed_pos.to(inputs_embeds.device) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + if self._use_sdpa and head_mask is None and not output_attentions: + # output_attentions=True & head_mask can not be supported when using SDPA, fall back to + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.size()[0] != (len(self.layers)): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + (head_mask[idx] if head_mask is not None else None), + output_attentions, + None, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + output_attentions=output_attentions, + token_idx=token_idx, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions) + + +def gaudi_BartDecoder_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, +) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if 
use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input = input_ids + input_shape = input.shape + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + input = inputs_embeds[:, :, -1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + tensor_past_key_values_length = token_idx - 1 if use_cache else torch.tensor(past_key_values_length) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input) * self.embed_scale + + if self._use_sdpa and not output_attentions and cross_attn_head_mask is None: + # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + input_shape, + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _gaudi_prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + if self._use_sdpa and cross_attn_head_mask is None and not output_attentions: + # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, + inputs_embeds.dtype, + tgt_len=input_shape[-1], + ) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # embed positions + import habana_frameworks.torch.core as htcore + + htcore.mark_step() + positions = self.embed_positions(input, tensor_past_key_values_length) + + htcore.mark_step() + positions = positions.to(inputs_embeds.device) + + hidden_states = inputs_embeds + positions + hidden_states = self.layernorm_embedding(hidden_states) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + next_decoder_cache = () if use_cache else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + if attn_mask.size()[0] != (len(self.layers)): + raise ValueError( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + head_mask[idx] if head_mask is not None else None, + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + None, + output_attentions, + use_cache, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + token_idx=token_idx, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +def gaudi_BartModel_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[List[torch.FloatTensor]] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = 
None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, +) -> Union[Tuple, Seq2SeqModelOutput]: + # different to other models, Bart automatically creates decoder_input_ids from + # input_ids if no decoder_input_ids are provided + if decoder_input_ids is None and decoder_inputs_embeds is None: + if input_ids is None: + raise ValueError( + "If no `decoder_input_ids` or `decoder_inputs_embeds` are " + "passed, `input_ids` cannot be `None`. Please pass either " + "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`." + ) + + decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id, self.config.decoder_start_token_id) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +def gaudi_BartForConditionalGeneration_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[List[torch.FloatTensor]] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = 
None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, +) -> Union[Tuple, Seq2SeqLMOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) + + lm_logits = self.lm_head(outputs[0]) + lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device) + + masked_lm_loss = None + if labels is not None: + labels = labels.to(lm_logits.device) + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +def gaudi_BartForConditionalGeneration_prepare_inputs_for_generation( + self, + decoder_input_ids, + past_key_values=None, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + token_idx=None, + **kwargs, +): + # cut decoder_input_ids if past_key_values is used + if past_key_values is not None: + if token_idx is not None: + decoder_input_ids = torch.index_select(decoder_input_ids, 1, token_idx - 1) + else: + decoder_input_ids = decoder_input_ids[:, -1:].unsqueeze(-1) + + return { + "input_ids": None, # encoder_outputs is defined. 
input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past_key_values, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + "token_idx": token_idx, + } diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/__init__.py new file mode 100644 index 00000000000..6f105b11a2f --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/__init__.py @@ -0,0 +1,10 @@ +from .modeling_blip import gaudi_BlipForConditionalGeneration_generate, gaudi_BlipForQuestionAnswering_generate +from .modeling_blip_text import ( + gaudi_BlipTextAttention_forward, + gaudi_BlipTextEncoder_forward, + gaudi_BlipTextLayer_forward, + gaudi_BlipTextLMHead_forward, + gaudi_BlipTextLMHead_prepare_inputs_for_generation, + gaudi_BlipTextModel_forward, + gaudi_BlipTextSelfAttention_forward, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip.py new file mode 100644 index 00000000000..6545a0662d2 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip.py @@ -0,0 +1,124 @@ +from typing import Optional + +import torch +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + + +@torch.no_grad() +def gaudi_BlipForConditionalGeneration_generate( + self, + pixel_values: torch.FloatTensor, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + **generate_kwargs, +) -> torch.LongTensor: + """ + Copied from BlipForConditionalGeneration.generate: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip.py#L1022 + The only differences are: + - wrap hpu graph for each part + """ + if generate_kwargs.get("hpu_graphs", True): + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + + if not hasattr(self.vision_model, "clear_cache"): + self.vision_model = wrap_in_hpu_graph(self.vision_model) + if not hasattr(self.text_decoder, "clear_cache"): + self.text_decoder = wrap_in_hpu_graph(self.text_decoder) + + batch_size = pixel_values.shape[0] + vision_outputs = self.vision_model(pixel_values=pixel_values) + + image_embeds = vision_outputs[0] + + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image_embeds.device) + + if isinstance(input_ids, list): + input_ids = torch.LongTensor(input_ids) + elif input_ids is None: + input_ids = ( + torch.LongTensor([[self.decoder_input_ids, self.config.text_config.eos_token_id]]) + .repeat(batch_size, 1) + .to(image_embeds.device) + ) + + input_ids[:, 0] = self.config.text_config.bos_token_id + attention_mask = attention_mask[:, :-1] if attention_mask is not None else None + + outputs = self.text_decoder.generate( + input_ids=input_ids[:, :-1], + eos_token_id=self.config.text_config.sep_token_id, + pad_token_id=self.config.text_config.pad_token_id, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, +
**generate_kwargs, + ) + + return outputs + + +@torch.no_grad() +def gaudi_BlipForQuestionAnswering_generate( + self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + attention_mask: Optional[torch.LongTensor] = None, + **generate_kwargs, +) -> torch.LongTensor: + """ + Copied from BlipForQuestionAnswering.generate: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip.py#L1236 + The only differences are: + - wrap hpu graph for each part + - torch.full adds dtype=torch.int64; otherwise the default dtype is torch.float32, which leads to a core dump in the embedding layer + """ + if generate_kwargs.get("hpu_graphs", True): + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + + if not hasattr(self.vision_model, "clear_cache"): + self.vision_model = wrap_in_hpu_graph(self.vision_model) + if not hasattr(self.text_encoder, "clear_cache"): + self.text_encoder = wrap_in_hpu_graph(self.text_encoder) + if not hasattr(self.text_decoder, "clear_cache"): + self.text_decoder = wrap_in_hpu_graph(self.text_decoder) + + vision_outputs = self.vision_model(pixel_values=pixel_values) + + image_embeds = vision_outputs[0] + + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image_embeds.device) + + if isinstance(input_ids, list): + input_ids = torch.LongTensor(input_ids) + + question_outputs = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=False, + ) + + question_embeds = question_outputs[0] + + question_attention_mask = torch.ones(question_embeds.size()[:-1], dtype=torch.long).to(question_embeds.device) + + bos_ids = torch.full( + (question_embeds.size(0), 1), + fill_value=self.decoder_start_token_id, + device=question_embeds.device, + dtype=torch.int64, + ) + + outputs = self.text_decoder.generate( + input_ids=bos_ids, + eos_token_id=self.config.text_config.sep_token_id, + pad_token_id=self.config.text_config.pad_token_id, + encoder_hidden_states=question_embeds, + encoder_attention_mask=question_attention_mask, + **generate_kwargs, + ) + + return outputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip_text.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip_text.py new file mode 100644 index 00000000000..23d4ee3f3c5 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip_text.py @@ -0,0 +1,538 @@ +import math +from typing import List, Optional, Tuple, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, +) +from transformers.modeling_utils import apply_chunking_to_forward +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + + +def gaudi_BlipTextSelfAttention_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + token_idx: Optional[torch.Tensor] = None, +) ->
Tuple[torch.Tensor]: + """ + Copied from BlipTextSelfAttention.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L143 + The only differences are: + - add token_idx + """ + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + if token_idx is not None: + past_key_value[0].index_copy_(2, token_idx - 1, key_layer) + past_key_value[1].index_copy_(2, token_idx - 1, value_layer) + key_layer = past_key_value[0] + value_layer = past_key_value[1] + else: + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BlipTextModel forward() function) + attention_scores = attention_scores + attention_mask.to(attention_scores.device) + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +def gaudi_BlipTextAttention_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + token_idx: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor]: + """ + Copied from BlipTextAttention.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L265 + The only differences are: + - add token_idx + """ + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + token_idx=token_idx, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +def gaudi_BlipTextLayer_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + token_idx: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor]: + """ + Copied from BlipTextLayer.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L333 + The only differences are: + - add token_idx + """ + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + token_idx=token_idx, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + + if encoder_hidden_states is not None: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + +def gaudi_BlipTextEncoder_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = 
None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + token_idx: Optional[torch.Tensor] = None, +) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + """ + Copied from BlipTextEncoder.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L391 + The only differences are: + - add token_idx + """ + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.is_decoder else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + token_idx=token_idx, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +def gaudi_BlipTextModel_forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + is_decoder: Optional[bool] = False, + token_idx: Optional[torch.Tensor] = None, +) -> 
Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + """ + Copied from BlipTextModel.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L666 + The only differences are: + - add token_idx + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + batch_size, seq_length = input_shape + device = input_ids.device + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = inputs_embeds.device + elif encoder_embeds is not None: + input_shape = encoder_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = encoder_embeds.device + else: + raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length))).to(device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
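The broadcastable mask mentioned above is, conceptually, the 2D padding mask of shape [batch_size, seq_length] promoted to an additive 4D mask [batch_size, 1, 1, seq_length] whose masked positions hold the dtype's minimum value. The model relies on Transformers' get_extended_attention_mask for this; the snippet below is only a simplified sketch of the non-causal case with toy values:

import torch

mask = torch.tensor([[1, 1, 1, 0]])                    # [batch, seq]: 1 = attend, 0 = padding
dtype = torch.float32

extended = mask[:, None, None, :].to(dtype)            # [batch, 1, 1, seq], broadcasts over heads
extended = (1.0 - extended) * torch.finfo(dtype).min   # 0 where attended, very negative where masked

scores = torch.zeros(1, 2, 4, 4)                       # toy [batch, heads, q_len, kv_len] scores
probs = torch.softmax(scores + extended, dim=-1)       # the padded key ends up with ~0 probability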
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device, is_decoder + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if isinstance(encoder_hidden_states, list): + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if isinstance(encoder_attention_mask, list): + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + if encoder_embeds is None: + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + else: + embedding_output = encoder_embeds + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +def gaudi_BlipTextLMHead_forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.Tensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + return_logits: Optional[bool] = False, + is_decoder: Optional[bool] = True, + reduction: Optional[str] = "mean", + token_idx: Optional[torch.Tensor] = None, +) -> Union[Tuple[torch.Tensor], 
CausalLMOutputWithCrossAttentions]: + """ + Copied from BlipTextLMHeadModel.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L820 + The only differences are: + - add token_idx + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + token_idx=token_idx, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous().to(shifted_prediction_scores.device) + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=self.label_smoothing) + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + if reduction == "none": + lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +def gaudi_BlipTextLMHead_prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, token_idx=None, **model_kwargs +): + """ + Copied from BlipTextLMHeadModel.prepare_inputs_for_generation: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L910 + The only differences are: + - add token_idx support, add position_ids + """ + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past_key_values is used + if past_key_values is not None: + if token_idx is not None: + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + else: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + position_ids = None + + if token_idx is not None and attention_mask is not None: + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = torch.index_select(position_ids, 1, token_idx - 1) + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "encoder_hidden_states": 
model_kwargs.get("encoder_hidden_states", None), + "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), + "is_decoder": True, + "token_idx": token_idx, + "position_ids": position_ids, + } diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/__init__.py new file mode 100644 index 00000000000..8aa34e41459 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/__init__.py @@ -0,0 +1,9 @@ +from .modeling_bloom import ( + GaudiBloomForCausalLM, + GaudiBloomMLP, + gaudi_bloom_attention_forward, + gaudi_bloom_block_forward, + gaudi_bloom_convert_to_bloom_cache, + gaudi_bloom_convert_to_standard_cache, + gaudi_bloom_model_forward, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py new file mode 100644 index 00000000000..a0a6d8c2d7e --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py @@ -0,0 +1,612 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +# Copyright (C) 2022-2023 Habana Labs, Ltd. an Intel Company +############################################################################### +import math +import os +import warnings +from typing import Optional, Tuple, Union + +import torch +from torch.nn import CrossEntropyLoss +from torch.nn import functional as F +from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions +from transformers.models.bloom.modeling_bloom import BloomForCausalLM, BloomMLP, dropout_add +from transformers.utils import logging + +from ..modeling_attn_mask_utils import _gaudi_prepare_4d_causal_attention_mask + + +logger = logging.get_logger(__name__) + + +def gaudi_bloom_build_alibi_tensor( + attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype, training: bool +) -> torch.Tensor: + """ + Link to paper: https://arxiv.org/abs/2108.12409 Alibi tensor is not causal as the original paper mentions, it + relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value + `softmax(l+a) = softmax(l)`. Based on + https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742 + TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly. + + Args: + Returns tensor shaped (batch_size * num_heads, 1, max_seq_len) + attention_mask (`torch.Tensor`): + Token-wise attention mask, this should be of shape (batch_size, max_seq_len). 
+ num_heads (`int`): + Number of heads. + dtype (`torch.dtype`): + Dtype of the output tensor. + training (`bool`): + Whether the model is being trained or not. + """ + batch_size, seq_length = attention_mask.shape + closest_power_of_2 = 2 ** math.floor(math.log2(num_heads)) + base = torch.tensor( + 2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32 + ) + powers = torch.arange(1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32) + slopes = torch.pow(base, powers) + + if closest_power_of_2 != num_heads: + extra_base = torch.tensor( + 2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32 + ) + num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2) + extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=attention_mask.device, dtype=torch.int32) + slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0) + + # Note: alibi will added to the attention bias that will be applied to the query, key product of attention + # => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length) + # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length) + # => the query_length dimension will then be broadcasted correctly + # This is more or less identical to T5's relative position bias: + # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527 + if training: + arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :] + alibi = slopes[..., None] * arange_tensor + return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype) + else: + # code taken from Megatron transformer.py + alibi = slopes.unsqueeze(1).unsqueeze(1) * torch.arange(seq_length, device=attention_mask.device).unsqueeze( + 0 + ).unsqueeze(0).expand(num_heads, -1, -1) + + # Select the part of the tensor that corresponds to our tensor parallel index. 
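At inference the code below builds the full ALiBi bias once and then keeps only the heads owned by the current tensor-parallel rank: the [num_heads, 1, seq_len] tensor is reshaped to [tp_world_size, heads_per_rank, 1, seq_len], indexed by the rank, and repeated per batch. A toy sketch of that slicing, with illustrative sizes and rank index:

import torch

num_heads, seq_len, batch_size = 8, 16, 2
tp_world_size, tp_index = 2, 0                                      # pretend: rank 0 of a 2-way TP group

alibi = torch.randn(num_heads, 1, seq_len)                          # full bias, one slope per head
per_rank = alibi.reshape(tp_world_size, -1, 1, seq_len)[tp_index]   # [heads_per_rank, 1, seq_len]
per_rank = per_rank.repeat(batch_size, 1, 1)                        # [batch * heads_per_rank, 1, seq_len]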
+ # if inference_tp_size is set use it instead of world size + world = int(os.environ.get("WORLD_SIZE", 1)) + tp_world_size = GaudiBloomForCausalLM.inference_tp_size if GaudiBloomForCausalLM.inference_tp_size else world + tp_index = 0 # if world size == 1 ignore rank and use 0 (for cases where WORLD_SIZE is not equal to tp size) + if tp_world_size > 1: + tp_index = int(os.environ.get("RANK", 0)) + + alibi = alibi.reshape((tp_world_size, -1, *alibi.shape[1:]))[tp_index] + + alibi = alibi.repeat(batch_size, 1, 1) + return alibi.to(dtype) + + +def update(prev, cur, dim, idx): + if idx is not None: + if os.environ.get("WA_INDEX_COPY", "1") == "1": + past_selector, value_selector = idx + if dim == 1: + sel = torch.cat([past_selector, value_selector.unsqueeze(2)], dim=2) + val = torch.cat([prev, cur], dim=1) + return torch.bmm(sel, val) + else: + sel = torch.cat([past_selector, value_selector.unsqueeze(1)], dim=1) + val = torch.cat([prev, cur], dim=2) + return torch.bmm(val, sel) + else: + return prev.index_copy_(dim, idx - 1, cur) + else: + return torch.cat((prev, cur), dim=dim) + + +def gaudi_bloom_attention_forward( + self, + hidden_states: torch.Tensor, + residual: torch.Tensor, + alibi: torch.Tensor, + attention_mask: torch.Tensor, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + token_idx: Optional[torch.Tensor] = None, +): + fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] + + # 3 x [batch_size, seq_length, num_heads, head_dim] + (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) + + batch_size, q_length, _, _ = query_layer.shape + + query_layer = query_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim) + key_layer = key_layer.permute(0, 2, 3, 1).reshape(batch_size * self.num_heads, self.head_dim, q_length) + value_layer = value_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim) + + # Collapse views to improve performance on HPU + query_layer = query_layer.contiguous() + key_layer = key_layer.contiguous() + value_layer = value_layer.contiguous() + + if layer_past is not None: + past_key, past_value = layer_past + # concatenate along seq_length dimension: + # - key: [batch_size * self.num_heads, head_dim, kv_length] + # - value: [batch_size * self.num_heads, kv_length, head_dim] + key_layer = update(past_key, key_layer, 2, token_idx) + value_layer = update(past_value, value_layer, 1, token_idx) + + _, _, kv_length = key_layer.shape + + if use_cache is True: + present = (key_layer, value_layer) + else: + present = None + + # [batch_size * num_heads, q_length, kv_length] + # we use `torch.Tensor.baddbmm` instead of `torch.baddbmm` as the latter isn't supported by TorchScript v1.11 + matmul_result = alibi.baddbmm( + batch1=query_layer, + batch2=key_layer, + beta=self.beta, + alpha=self.inv_norm_factor, + ) + + # change view to [batch_size, num_heads, q_length, kv_length] + attention_scores = matmul_result.view(batch_size, self.num_heads, q_length, kv_length) + + # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] + input_dtype = attention_scores.dtype + attn_weights = torch.masked_fill(attention_scores, attention_mask, torch.finfo(attention_scores.dtype).min) + attention_probs = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(input_dtype) + + # [batch_size, 
num_heads, q_length, kv_length] + attention_probs = self.attention_dropout(attention_probs) + + if head_mask is not None: + attention_probs = attention_probs * head_mask + + # change view [batch_size x num_heads, q_length, kv_length] + attention_probs_reshaped = attention_probs.view(batch_size * self.num_heads, q_length, kv_length) + + # matmul: [batch_size * num_heads, q_length, head_dim] + context_layer = torch.bmm(attention_probs_reshaped, value_layer) + + # change view [batch_size, q_length, num_heads * head_dim] + context_layer = self._merge_heads(context_layer) + + # aggregate results across tp ranks. See here: https://github.com/pytorch/pytorch/issues/76232 + if self.pretraining_tp > 1 and self.slow_but_exact: + slices = self.hidden_size / self.pretraining_tp + output_tensor = torch.zeros_like(context_layer) + for i in range(self.pretraining_tp): + output_tensor = output_tensor + F.linear( + context_layer[:, :, int(i * slices) : int((i + 1) * slices)], + self.dense.weight[:, int(i * slices) : int((i + 1) * slices)], + ) + else: + output_tensor = self.dense(context_layer) + + output_tensor = dropout_add(output_tensor, residual, self.hidden_dropout, self.training) + + outputs = (output_tensor, present) + if output_attentions: + outputs += (attention_probs,) + + return outputs + + +class GaudiBloomMLP(BloomMLP): + def __init__(self, config): + super().__init__(config) + self.gelu_impl = torch.nn.GELU(approximate="tanh") + + +def gaudi_bloom_block_forward( + self, + hidden_states: torch.Tensor, + alibi: torch.Tensor, + attention_mask: torch.Tensor, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + token_idx: Optional[torch.Tensor] = None, +): + # hidden_states: [batch_size, seq_length, hidden_size] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + + # Layer norm post the self attention. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + # Self attention. + attn_outputs = self.self_attention( + layernorm_output, + residual, + layer_past=layer_past, + attention_mask=attention_mask, + alibi=alibi, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + ) + + attention_output = attn_outputs[0] + + outputs = attn_outputs[1:] + + layernorm_output = self.post_attention_layernorm(attention_output) + + # Get residual + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = attention_output + + # MLP. + output = self.mlp(layernorm_output, residual) + + if use_cache: + outputs = (output,) + outputs + else: + outputs = (output,) + outputs[1:] + + return outputs # hidden_states, present, attentions + + +def gaudi_bloom_convert_to_standard_cache( + self, past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]], batch_size: int, training: bool +) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: + """ + Standardizes the format of the cache so as to match most implementations, i.e. 
to tuple(tuple([batch_size, + num_heads, ...])) + """ + batch_size_times_num_heads, head_dim, seq_length = past_key_value[0][0].shape + if training: + num_heads = batch_size_times_num_heads // batch_size + else: + world = int(os.environ.get("WORLD_SIZE", 1)) + tp_world_size = GaudiBloomForCausalLM.inference_tp_size if GaudiBloomForCausalLM.inference_tp_size else world + num_heads = self.config.n_head // tp_world_size + batch_size = batch_size_times_num_heads // num_heads + # key: [batch_size * num_heads, head_dim, seq_length] -> [batch_size, num_heads, head_dim, seq_length] + # value: [batch_size * num_heads, seq_length, head_dim] -> [batch_size, num_heads, seq_length, head_dim] + return tuple( + ( + layer_past[0].view(batch_size, num_heads, head_dim, seq_length), + layer_past[1].view(batch_size, num_heads, seq_length, head_dim), + ) + for layer_past in past_key_value + ) + + +def gaudi_bloom_convert_to_bloom_cache( + self, past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]] +) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: + """ + Converts the cache to the format expected by Bloom, i.e. to tuple(tuple([batch_size * num_heads, ...])) + """ + batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape + batch_size_times_num_heads = batch_size * num_heads + # key: [batch_size, num_heads, head_dim, seq_length] -> [batch_size * num_heads, head_dim, seq_length] + # value: [batch_size, num_heads, seq_length, head_dim] -> [batch_size * num_heads, seq_length, head_dim] + return tuple( + ( + layer_past[0].view(batch_size_times_num_heads, head_dim, seq_length), + layer_past[1].view(batch_size_times_num_heads, seq_length, head_dim), + ) + for layer_past in past_key_value + ) + + +def gaudi_bloom_model_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + **deprecated_arguments, +) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: + if deprecated_arguments.pop("position_ids", False) is not False: + # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` + warnings.warn( + "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" + " passing `position_ids`.", + FutureWarning, + ) + if len(deprecated_arguments) > 0: + raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if past_key_values is None: + past_key_values = tuple([None] * len(self.h)) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape batch_size x num_heads x N x N + # head_mask has shape n_layer x batch x num_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + hidden_states = self.word_embeddings_layernorm(inputs_embeds) + + presents = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + # Compute alibi tensor: check gaudi_bloom_build_alibi_tensor + seq_length_with_past = seq_length + past_key_values_length = 0 + if past_key_values[0] is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + if attention_mask is None: + attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device) + else: + attention_mask = attention_mask.to(hidden_states.device) + + alibi = gaudi_bloom_build_alibi_tensor(attention_mask, self.num_heads, hidden_states.dtype, self.training) + + causal_mask = _gaudi_prepare_4d_causal_attention_mask( + attention_mask, + input_shape=(batch_size, seq_length), + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + causal_mask = causal_mask.bool() + + if token_idx is not None and past_key_values[0] is not None and os.environ.get("WA_INDEX_COPY", "1") == "1": + pkv = past_key_values[0][0] + cur = torch.nn.functional.one_hot(torch.tile(token_idx - 1, (pkv.shape[0],)), pkv.shape[-1]).to(pkv.dtype) + past = torch.diag_embed(1 - cur) + token_idx = (past, cur) + + for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + outputs = self._gradient_checkpointing_func( + block.__call__, + hidden_states, + alibi, + causal_mask, + layer_past, + head_mask[i], + use_cache, + output_attentions, + None, + ) + else: + outputs = block( + hidden_states, + layer_past=layer_past, + attention_mask=causal_mask, + head_mask=head_mask[i], + use_cache=use_cache, + output_attentions=output_attentions, + alibi=alibi, + token_idx=token_idx, + ) + + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + + # Add last hidden state + hidden_states = self.ln_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class GaudiBloomForCausalLM(BloomForCausalLM): + inference_tp_size = None + + def set_tp_for_inference(tp_for_inference: int): + world = int(os.environ.get("WORLD_SIZE", 1)) + assert tp_for_inference == 1 or tp_for_inference == world, "only setting 1 (no tp) or world size is supported" + GaudiBloomForCausalLM.inference_tp_size = tp_for_inference + + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + token_idx: Optional[torch.Tensor] = None, + **kwargs, + ) -> dict: + # only last tokens for input_ids if past is not None + if past_key_values is not None: + if token_idx is None: + input_ids = input_ids[:, -1].unsqueeze(-1) + else: + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + + # the cache may be in the stardard format (e.g. 
in contrastive search), convert to bloom's format if needed + if past_key_values[0][0].shape[0] == input_ids.shape[0]: + past_key_values = self._convert_to_bloom_cache(past_key_values) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "token_idx": token_idx, + } + ) + return model_inputs + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + **deprecated_arguments, + ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + if deprecated_arguments.pop("position_ids", False) is not False: + # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` + warnings.warn( + "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" + " passing `position_ids`.", + FutureWarning, + ) + if len(deprecated_arguments) > 0: + raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + batch_size, seq_length, vocab_size = shift_logits.shape + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length) + ) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def _reorder_cache( + self, past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor + ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + + Output shares the same memory storage as `past`. + """ + standardized_past = self._convert_to_standard_cache(past, batch_size=len(beam_idx), training=self.training) + + # Get a copy of `beam_idx` on all the devices where we need those indices. 
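_reorder_cache below gathers each layer's standardized [batch, heads, ...] key/value along the batch dimension with index_select and beam_idx, so every surviving beam continues from the cache of the hypothesis it branched from. A small self-contained illustration with toy tensors:

import torch

num_beams, num_heads, head_dim, seq_len = 3, 2, 4, 5
key = torch.randn(num_beams, num_heads, head_dim, seq_len)     # standard-format key cache
beam_idx = torch.tensor([0, 0, 2])                             # beams 0 and 1 both continue beam 0

reordered = key.index_select(0, beam_idx.to(key.device))       # one row per surviving beam
assert torch.equal(reordered[1], key[0])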
+ device_to_beam_idx = { + past_state.device: beam_idx.to(past_state.device) for layer_past in past for past_state in layer_past + } + reordered_past = tuple( + ( + layer_past[0].index_select(0, device_to_beam_idx[layer_past[0].device]), + layer_past[1].index_select(0, device_to_beam_idx[layer_past[0].device]), + ) + for layer_past in standardized_past + ) + return self._convert_to_bloom_cache(reordered_past) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/__init__.py new file mode 100644 index 00000000000..d433e24c8db --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/__init__.py @@ -0,0 +1,6 @@ +from .modeling_codegen import ( + GaudiCodeGenAttention, + GaudiCodeGenForCausalLM, + gaudi_codegen_block_forward, + gaudi_codegen_model_forward, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/modeling_codegen.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/modeling_codegen.py new file mode 100644 index 00000000000..b5680859710 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/modeling_codegen.py @@ -0,0 +1,420 @@ +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from transformers.models.codegen.modeling_codegen import ( + CodeGenAttention, + CodeGenForCausalLM, + apply_rotary_pos_emb, + logger, +) + + +class GaudiCodeGenAttention(CodeGenAttention): + def forward( + self, + hidden_states: Optional[torch.FloatTensor], + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + token_idx: Optional[torch.Tensor] = None, + ) -> Union[ + Tuple[torch.Tensor, Tuple[torch.Tensor]], + Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]], + ]: + """ + Copied from CodeGenAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/codegen/modeling_codegen.py + The only differences are: + - add new args token_idx + - optimize KV cache + """ + qkv = self.qkv_proj(hidden_states) + # TODO(enijkamp): factor out number of logical TPU-v4 cores or make forward pass agnostic + mp_num = 4 + qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1)) + local_dim = self.head_dim * self.num_attention_heads // mp_num + query, value, key = torch.split(qkv_split, local_dim, dim=-1) + query = self._split_heads(query, self.num_attention_heads, self.head_dim, mp_num=mp_num) + key = self._split_heads(key, self.num_attention_heads, self.head_dim, mp_num=mp_num) + + value = self._split_heads(value, self.num_attention_heads, self.head_dim, mp_num=mp_num) + value = value.permute(0, 2, 1, 3) + + embed_positions = self.embed_positions + if embed_positions.device != position_ids.device: + embed_positions = embed_positions.to(position_ids.device) + self.embed_positions = embed_positions + + sincos = embed_positions[position_ids] + sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1) + + if self.rotary_dim is not None: + k_rot = 
key[:, :, :, : self.rotary_dim] + k_pass = key[:, :, :, self.rotary_dim :] + + q_rot = query[:, :, :, : self.rotary_dim] + q_pass = query[:, :, :, self.rotary_dim :] + + k_rot = apply_rotary_pos_emb(k_rot, sin, cos) + q_rot = apply_rotary_pos_emb(q_rot, sin, cos) + + key = torch.cat([k_rot, k_pass], dim=-1) + query = torch.cat([q_rot, q_pass], dim=-1) + else: + key = apply_rotary_pos_emb(key, sin, cos) + query = apply_rotary_pos_emb(query, sin, cos) + + key = key.permute(0, 2, 1, 3) + query = query.permute(0, 2, 1, 3) + + if layer_past is not None: + past_key = layer_past[0] + past_value = layer_past[1] + if token_idx is not None: + key = past_key.index_add_(2, token_idx - 1, key - torch.index_select(past_key, 2, token_idx - 1)) + value = past_value.index_add_( + 2, token_idx - 1, value - torch.index_select(past_value, 2, token_idx - 1) + ) + else: + key = torch.cat((past_key, key), dim=-2) + value = torch.cat((past_value, value), dim=-2) + + if use_cache is True: + present = (key, value) + else: + present = None + + # compute self-attention: V x Softmax(QK^T) + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim) + attn_output = self.out_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs # a, present, (attentions) + + +def gaudi_codegen_block_forward( + self, + hidden_states: Optional[torch.FloatTensor], + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + token_idx: Optional[torch.Tensor] = None, +) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: + """ + Copied from CodeGenBlock.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/codegen/modeling_codegen.py + The only differences are: + - add new args token_idx + """ + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_outputs = self.attn( + hidden_states=hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + ) + attn_output = attn_outputs[0] # output_attn: a, present, (attentions) + outputs = attn_outputs[1:] + + feed_forward_hidden_states = self.mlp(hidden_states) + hidden_states = attn_output + feed_forward_hidden_states + residual + + if use_cache: + outputs = (hidden_states,) + outputs + else: + outputs = (hidden_states,) + outputs[1:] + + return outputs # hidden_states, present, (attentions) + + +def gaudi_codegen_model_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = 
None, +) -> Union[Tuple, BaseModelOutputWithPast]: + """ + Copied from CodeGenBlock.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/codegen/modeling_codegen.py + The only differences are: + - add new args token_idx + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size = inputs_embeds.shape[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + + if past_key_values is None: + past_length = 0 + past_key_values = tuple([None] * len(self.h)) + else: + past_length = past_key_values[0][0].size(-2) + + if position_ids is None: + position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0) + + # Attention mask. + if attention_mask is not None: + if batch_size <= 0: + raise ValueError("batch_size has to be defined and > 0") + attention_mask = attention_mask.view(batch_size, -1) + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask[:, None, None, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
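GaudiCodeGenAttention earlier in this file updates its cache with index_add_ rather than index_copy_: adding new - past_slice at position token_idx - 1 overwrites that slot in place while keeping the cache shape fixed. A toy check of that identity (shapes and names illustrative only):

import torch

past = torch.randn(1, 2, 8, 4)                                 # [batch, heads, max_seq_len, head_dim]
new = torch.randn(1, 2, 1, 4)                                  # current step's key (or value)
token_idx = torch.tensor([3])

old_slice = torch.index_select(past, 2, token_idx - 1)
updated = past.index_add_(2, token_idx - 1, new - old_slice)   # in-place, returns `past`

assert torch.allclose(updated[:, :, token_idx - 1], new)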
+ attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x num_attention_heads x N x N + # head_mask has shape n_layer x batch x num_attention_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + + hidden_states = inputs_embeds + + if token_type_ids is not None: + token_type_embeds = self.wte(token_type_ids) + hidden_states = hidden_states + token_type_embeds + + hidden_states = self.drop(hidden_states) + + output_shape = input_shape + (hidden_states.size(-1),) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " + "`use_cache=False`..." + ) + use_cache = False + + presents = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + outputs = self._gradient_checkpointing_func( + block.__call__, + hidden_states, + None, + attention_mask, + position_ids, + head_mask[i], + use_cache, + output_attentions, + None, + ) + else: + outputs = block( + hidden_states=hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask[i], + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + ) + + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + + hidden_states = self.ln_f(hidden_states) + + hidden_states = hidden_states.view(output_shape) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class GaudiCodeGenForCausalLM(CodeGenForCausalLM): + """ + Inherits from CodeGenForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/codegen/modeling_codegen.py + The only differences are: + - add new args token_idx + - add token_idx into model_inputs + - when KV cache is enabled, slice next_input_ids from input_ids based on the token_idx + - when KV cache is enabled, slice next_position_ids from position_ids based on the token_idx + """ + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, token_idx=None, **kwargs): + token_type_ids = kwargs.get("token_type_ids", None) + # Omit tokens covered by past_key_values + if past_key_values: + if token_idx is not None: + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + if token_type_ids is not None: + token_type_ids = torch.index_select(token_type_ids, 1, token_idx - 1) + else: + input_ids = input_ids[:, -1] + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1] + + 
attention_mask = kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + if token_idx is not None: + position_ids = torch.index_select(position_ids, 1, token_idx - 1) + else: + position_ids = position_ids[:, -1] + + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + "token_idx": token_idx, + } + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) + hidden_states = transformer_outputs[0] + + # make sure sampling in fp16 works correctly and + # compute loss in fp32 to match with mesh-tf version + # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179 + lm_logits = self.lm_head(hidden_states).to(torch.float32) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + loss = loss.to(hidden_states.dtype) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git 
a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/__init__.py new file mode 100644 index 00000000000..ca83d982924 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/__init__.py @@ -0,0 +1,6 @@ +from .modeling_esmfold import ( + gaudi_esm_for_protein_folding_forward, + gaudi_esmfolding_trunk_forward, + gaudi_rot_matmul, + gaudi_rot_vec_mul, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/modeling_esmfold.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/modeling_esmfold.py new file mode 100644 index 00000000000..88b6dac0d24 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/modeling_esmfold.py @@ -0,0 +1,347 @@ +# coding=utf-8 +# Copyright 2022 Meta and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional + +import torch +from transformers.models.esm.modeling_esmfold import EsmForProteinFoldingOutput, categorical_lddt +from transformers.models.esm.openfold_utils import ( + compute_predicted_aligned_error, + compute_tm, + make_atom14_masks, +) +from transformers.utils import ( + ContextManagers, +) + + +def gaudi_esmfolding_trunk_forward(self, seq_feats, pair_feats, true_aa, residx, mask, no_recycles): + """ + Inputs: + seq_feats: B x L x C tensor of sequence features pair_feats: B x L x L x C tensor of pair features residx: B + x L long tensor giving the position in the sequence mask: B x L boolean tensor indicating valid residues + + Output: + predicted_structure: B x L x (num_atoms_per_residue * 3) tensor wrapped in a Coordinates object + + Copied from EsmFoldingTrunk.forward: + https://github.com/huggingface/transformers/blob/main/src/transformers/models/esm/modeling_esmfold.py + The change is: + - Add extra mark_step in trunk_iter for each block. + """ + + device = seq_feats.device + s_s_0 = seq_feats + s_z_0 = pair_feats + + if no_recycles is None: + no_recycles = self.config.max_recycles + else: + if no_recycles < 0: + raise ValueError("Number of recycles must not be negative.") + no_recycles += 1 # First 'recycle' is just the standard forward pass through the model. 
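+ # The Gaudi-specific change in this function is the htcore.mark_step() call after each
+ # trunk block in trunk_iter below: in HPU lazy mode it flushes the ops accumulated so
+ # far, so every block is executed as its own graph instead of the whole trunk being
+ # built up as one large graph.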
+ + def trunk_iter(s, z, residx, mask): + z = z + self.pairwise_positional_embedding(residx, mask=mask) + + for block in self.blocks: + s, z = block(s, z, mask=mask, residue_index=residx, chunk_size=self.chunk_size) + if s.device.type == "hpu": + import habana_frameworks.torch.core as htcore + + htcore.mark_step() + return s, z + + s_s = s_s_0 + s_z = s_z_0 + recycle_s = torch.zeros_like(s_s) + recycle_z = torch.zeros_like(s_z) + recycle_bins = torch.zeros(*s_z.shape[:-1], device=device, dtype=torch.int64) + + for recycle_idx in range(no_recycles): + with ContextManagers([] if recycle_idx == no_recycles - 1 else [torch.no_grad()]): + # === Recycling === + recycle_s = self.recycle_s_norm(recycle_s.detach()).to(device) + recycle_z = self.recycle_z_norm(recycle_z.detach()).to(device) + recycle_z += self.recycle_disto(recycle_bins.detach()).to(device) + + s_s, s_z = trunk_iter(s_s_0 + recycle_s, s_z_0 + recycle_z, residx, mask) + + # === Structure module === + structure = self.structure_module( + {"single": self.trunk2sm_s(s_s), "pair": self.trunk2sm_z(s_z)}, + true_aa, + mask.float(), + ) + + recycle_s = s_s + recycle_z = s_z + # Distogram needs the N, CA, C coordinates, and bin constants same as alphafold. + recycle_bins = self.distogram( + structure["positions"][-1][:, :, :3], + 3.375, + 21.375, + self.recycle_bins, + ) + + structure["s_s"] = s_s + structure["s_z"] = s_z + + return structure + + +def gaudi_esm_for_protein_folding_forward( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + masking_pattern: Optional[torch.Tensor] = None, + num_recycles: Optional[int] = None, +) -> EsmForProteinFoldingOutput: + r""" + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, EsmForProteinFolding + + >>> model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1") + >>> inputs = tokenizer(["MLKNVQVQLV"], return_tensors="pt", add_special_tokens=False) # A tiny random peptide + >>> outputs = model(**inputs) + >>> folded_positions = outputs.positions + ``` + + Copied from EsmForProteinFolding.forward: + https://github.com/huggingface/transformers/blob/main/src/transformers/models/esm/modeling_esmfold.py + The change is: + - rewrite (softmax().unsqueeze() @ esm_s).squeeze() with equivalent but less dims algorithm on HPU. + + """ + cfg = self.config.esmfold_config + + aa = input_ids # B x L + B = aa.shape[0] + L = aa.shape[1] + device = input_ids.device + if attention_mask is None: + attention_mask = torch.ones_like(aa, device=device) + if position_ids is None: + position_ids = torch.arange(L, device=device).expand_as(input_ids) + + # === ESM === + esmaa = self.af2_idx_to_esm_idx(aa, attention_mask) + + if masking_pattern is not None: + masked_aa, esmaa, mlm_targets = self.bert_mask(aa, esmaa, attention_mask, masking_pattern) + else: + masked_aa = aa + mlm_targets = None + + # We get sequence and pair representations from whatever version of ESM / + # configuration we are using. The sequence representation esm_s is always + # present. The pair embedding esm_z may be present depending on the + # configuration of the model. If esm_z is not used by the model then it + # is returned as None here. + esm_s = self.compute_language_model_representations(esmaa) + + # Convert esm_s and esm_z, if present, to the precision used by the trunk and + # the structure module. 
These tensors may be a lower precision if, for example, + # we're running the language model in fp16 precision. + esm_s = esm_s.to(self.esm_s_combine.dtype) + + if cfg.esm_ablate_sequence: + esm_s = esm_s * 0 + + esm_s = esm_s.detach() + + # === preprocessing === + if esm_s.device.type == "hpu": + dims = esm_s.shape + esm_s = esm_s.reshape(-1, dims[-2], dims[-1]) # combine first 2 dims + esm_s = self.esm_s_combine.softmax(0).unsqueeze(0) @ esm_s + esm_s = esm_s.reshape(dims[0], dims[1], esm_s.shape[-2], esm_s.shape[-1]) # split back 1st dim + esm_s = esm_s.squeeze(2) + else: + esm_s = (self.esm_s_combine.softmax(0).unsqueeze(0) @ esm_s).squeeze(2) + s_s_0 = self.esm_s_mlp(esm_s) + + s_z_0 = s_s_0.new_zeros(B, L, L, cfg.trunk.pairwise_state_dim) + + if self.config.esmfold_config.embed_aa: + s_s_0 += self.embedding(masked_aa) + + structure: dict = self.trunk(s_s_0, s_z_0, aa, position_ids, attention_mask, no_recycles=num_recycles) + # Documenting what we expect: + structure = { + k: v + for k, v in structure.items() + if k + in [ + "s_z", + "s_s", + "frames", + "sidechain_frames", + "unnormalized_angles", + "angles", + "positions", + "states", + ] + } + + # Add BERT mask for the loss to use, if available. + if mlm_targets: + structure["mlm_targets"] = mlm_targets + + disto_logits = self.distogram_head(structure["s_z"]) + disto_logits = (disto_logits + disto_logits.transpose(1, 2)) / 2 + structure["distogram_logits"] = disto_logits + + lm_logits = self.lm_head(structure["s_s"]) + structure["lm_logits"] = lm_logits + + structure["aatype"] = aa + make_atom14_masks(structure) + # Of course, this doesn't respect the true mask because it doesn't know about it... + # We're not going to properly mask change of index tensors: + # "residx_atom14_to_atom37", + # "residx_atom37_to_atom14", + for k in [ + "atom14_atom_exists", + "atom37_atom_exists", + ]: + structure[k] *= attention_mask.unsqueeze(-1) + structure["residue_index"] = position_ids + + lddt_head = self.lddt_head(structure["states"]).reshape(structure["states"].shape[0], B, L, -1, self.lddt_bins) + structure["lddt_head"] = lddt_head + plddt = categorical_lddt(lddt_head[-1], bins=self.lddt_bins) + structure["plddt"] = plddt + + ptm_logits = self.ptm_head(structure["s_z"]) + structure["ptm_logits"] = ptm_logits + structure["ptm"] = compute_tm(ptm_logits, max_bin=31, no_bins=self.distogram_bins) + structure.update(compute_predicted_aligned_error(ptm_logits, max_bin=31, no_bins=self.distogram_bins)) + + return EsmForProteinFoldingOutput(**structure) + + +def gaudi_rot_vec_mul(r: torch.Tensor, t: torch.Tensor) -> torch.Tensor: + """ + Applies a rotation to a vector. Written out by hand to avoid transfer to avoid AMP downcasting. + + Args: + r: [*, 3, 3] rotation matrices + t: [*, 3] coordinate tensors + Returns: + [*, 3] rotated coordinates + + Copied from rot_vec_mul: + https://github.com/huggingface/transformers/blob/main/src/transformers/models/esm/openfold_utils/rigid_utils.py + The change is: + - Using matmul when possible on HPU to get better performance. + """ + # Do matmal on HPU directly when possible to get better performance. 
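+ # The HPU branch below rewrites rot_vec_mul as a matmul:
+ #     (t.unsqueeze(-2) @ r.transpose(-2, -1)).squeeze(-2)
+ # which is mathematically identical to the hand-written row sums at the end of this
+ # function; for 5-D inputs the two middle dims are merged first and split back afterwards,
+ # presumably because broadcasting that case directly is problematic on HPU (see the
+ # analogous note in gaudi_rot_matmul below).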
+ if r.device.type == "hpu": + if t.dim() > 5: + pass + elif t.dim() == 5: + # Combine shape[2] and shape[3] on HPU + shape_t = t.shape + shape_r = r.shape + t = t.reshape(shape_t[0], shape_t[1], shape_t[2] * shape_t[3], shape_t[4]) + r = r.reshape(shape_r[0], shape_r[1], shape_r[2] * shape_r[3], shape_r[4], shape_r[5]) + t = t.unsqueeze(-2) + r = r.transpose(-2, -1) + out = t @ r + shape_out = out.shape + out = out.reshape( + shape_out[0], + shape_out[1], + max(shape_r[2], shape_t[2]), + max(shape_r[3], shape_t[3]), + shape_out[3], + shape_out[4], + ) + out = out.squeeze(-2) + return out + else: + t = t.unsqueeze(-2) + r = r.transpose(-2, -1) + out = t @ r + out = out.squeeze(-2) + return out + + x, y, z = torch.unbind(t, dim=-1) + return torch.stack( + [ + r[..., 0, 0] * x + r[..., 0, 1] * y + r[..., 0, 2] * z, + r[..., 1, 0] * x + r[..., 1, 1] * y + r[..., 1, 2] * z, + r[..., 2, 0] * x + r[..., 2, 1] * y + r[..., 2, 2] * z, + ], + dim=-1, + ) + + +def gaudi_rot_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + """ + Performs matrix multiplication of two rotation matrix tensors. Written out by hand to avoid AMP downcasting. + + Args: + a: [*, 3, 3] left multiplicand + b: [*, 3, 3] right multiplicand + Returns: + The product ab + + Copied from rot_matmul: + https://github.com/huggingface/transformers/blob/main/src/transformers/models/esm/openfold_utils/rigid_utils.py + The change is: + - Using matmul when possible on HPU to get better performance. + """ + + # Do matmal on HPU directly when possible to get better performance. + if a.device.type == "hpu": + if a.shape == b.shape or a.dim() < 5: + out = a @ b + return out + elif a.dim() == 5 and a.shape[2] == 1: + # HPU does not handle dim==5 with below broadcast correctly. + # a.shape = torch.Size([1, 512, 1, 3, 3]), b.shape = torch.Size([1, 512, 8, 3, 3]) + a = a.permute(0, 1, 2, 4, 3) + b = b.permute(0, 1, 2, 4, 3) + out = b @ a + out = out.permute(0, 1, 2, 4, 3) + return out + else: + pass + + def row_mul(i: int) -> torch.Tensor: + return torch.stack( + [ + a[..., i, 0] * b[..., 0, 0] + a[..., i, 1] * b[..., 1, 0] + a[..., i, 2] * b[..., 2, 0], + a[..., i, 0] * b[..., 0, 1] + a[..., i, 1] * b[..., 1, 1] + a[..., i, 2] * b[..., 2, 1], + a[..., i, 0] * b[..., 0, 2] + a[..., i, 1] * b[..., 1, 2] + a[..., i, 2] * b[..., 2, 2], + ], + dim=-1, + ) + + return torch.stack( + [ + row_mul(0), + row_mul(1), + row_mul(2), + ], + dim=-2, + ) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/__init__.py new file mode 100644 index 00000000000..44ac5451f6f --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/__init__.py @@ -0,0 +1,7 @@ +from .modeling_falcon import ( + GaudiFalconForCausalLM, + GaudiFalconModel, + gaudi_falcon_attention_forward, + gaudi_falcon_attention_split_heads, + gaudi_falcon_decoder_layer_forward, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py new file mode 100644 index 00000000000..a10874bbc7b --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py @@ -0,0 +1,679 @@ +import contextlib +import math +import warnings +from typing import Optional, Tuple, Union + +import torch + + +try: + from 
habana_frameworks.torch.hpex.kernels import FusedSDPA +except ImportError: + print("Not using HPU fused kernel for scaled_dot_product_attention") + FusedSDPA = None + +try: + from habana_frameworks.torch.hpu import sdp_kernel + + SDPContext = True +except ImportError: + SDPContext = False + +try: + from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 as FusedRoPE +except ImportError: + print("Not using HPU fused kernel for apply_rotary_pos_emb") + FusedRoPE = None + + +import habana_frameworks.torch.core as htcore +from torch.nn import CrossEntropyLoss +from torch.nn import functional as F +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, +) +from transformers.models.falcon.modeling_falcon import ( + FalconForCausalLM, + FalconModel, + apply_rotary_pos_emb, + build_alibi_tensor, + dropout_add, +) +from transformers.utils import logging + +from ..modeling_attn_mask_utils import ( + GaudiAttentionMaskConverter, + _gaudi_prepare_4d_causal_attention_mask, +) + + +logger = logging.get_logger(__name__) + + +def apply_customized_rope(q, k, cos, sin, position_ids): + if q.device.type == "hpu" and FusedRoPE: + # TODO: remove `.clone()` when it is fixed in SynapseAI + return FusedRoPE.apply( + q, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids + ), FusedRoPE.apply( + k, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids + ) + else: + return apply_rotary_pos_emb(q, k, cos, sin, position_ids) + + +def gaudi_falcon_attention_split_heads( + self, fused_qkv: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Copied from FalconAttention._split_heads https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/falcon/modeling_falcon.py + Changing index operation of qkv[:::] to use torch.index_select to work around gradient accuracy issue and improve performance. 
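+ For example, with the new decoder architecture the original slices
+ query = qkv[:, :, :, :-2], key = qkv[:, :, :, [-2]] and value = qkv[:, :, :, [-1]]
+ become index_select calls along dim 3 with indices arange(d3), [d3] and [d3 + 1],
+ where d3 = qkv.shape[3] - 2.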
+ """ + if self.new_decoder_architecture: + batch, seq_len, _ = fused_qkv.shape + + if self.config.num_attention_heads != self.num_heads: # When DS divides heads for TP + num_heads = self.config.num_attention_heads + num_kv_heads = self.config.num_kv_heads + else: # When DS not in use + num_heads = self.num_heads + num_kv_heads = self.num_kv_heads + + qkv = fused_qkv.view(batch, seq_len, -1, num_heads // num_kv_heads + 2, self.head_dim) + # query = qkv[:, :, :, :-2] + # key = qkv[:, :, :, [-2]] + # value = qkv[:, :, :, [-1]] + d3 = qkv.shape[3] - 2 + query = torch.index_select(qkv, 3, index=torch.arange(d3, device=qkv.device)) + key = torch.index_select(qkv, 3, index=torch.tensor([d3], device=qkv.device)) + value = torch.index_select(qkv, 3, index=torch.tensor([d3 + 1], device=qkv.device)) + + key = torch.broadcast_to(key, query.shape) + value = torch.broadcast_to(value, query.shape) + + query, key, value = [x.flatten(2, 3) for x in (query, key, value)] + return query, key, value + elif not self.multi_query: + batch_size, seq_length, three_times_hidden_size = fused_qkv.shape + fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim) + # TODO : Need to be fixed to use index_select() + return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :] + else: + batch_size, seq_length, three_times_hidden_size = fused_qkv.shape + fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads + 2, self.head_dim) + # return fused_qkv[..., :-2, :], fused_qkv[..., [-2], :], fused_qkv[..., [-1], :] + d2 = fused_qkv.shape[2] - 2 + query = torch.index_select(fused_qkv, 2, index=torch.arange(d2, device=fused_qkv.device)) + key = torch.index_select(fused_qkv, 2, index=torch.tensor([d2], device=fused_qkv.device)) + value = torch.index_select(fused_qkv, 2, index=torch.tensor([d2 + 1], device=fused_qkv.device)) + return query, key, value + + +def gaudi_falcon_attention_forward( + self, + hidden_states: torch.Tensor, + alibi: Optional[torch.Tensor], + attention_mask: torch.Tensor, + position_ids: Optional[torch.LongTensor] = None, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + token_idx: Optional[torch.Tensor] = None, + **kwargs, +): + """ + Copied from FalconAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py + The only differences are: + - add new args token_idx and position_ids + - replace F.scaled_dot_product_attention with Habana torch's version + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] + # 3 x [batch_size, seq_length, num_heads, head_dim] + (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) + + batch_size, query_length, _, _ = query_layer.shape + + query_layer = query_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) + key_layer = key_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) + value_layer = value_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) + + kv_seq_len = key_layer.shape[-2] + if layer_past is not None: + if token_idx is not None: + # When token_idx is used, + # past_kv_length = 0 + # static seq len = (input token len + max output token len) + kv_seq_len = layer_past[0].shape[-2] + else: + kv_seq_len += layer_past[0].shape[-2] + if alibi is None: + cos, sin = self.rotary_emb(value_layer, seq_len=kv_seq_len) + query_layer, key_layer = apply_customized_rope(query_layer, key_layer, cos, sin, position_ids) + + if layer_past is not None: + past_key, past_value = layer_past + if token_idx is not None: + past_key.index_copy_(-2, token_idx - 1, key_layer) + past_value.index_copy_(-2, token_idx - 1, value_layer) + key_layer = past_key + value_layer = past_value + else: + # concatenate along seq_length dimension: + # - key: [batch_size, self.num_heads, kv_length, head_dim] + # - value: [batch_size, self.num_heads, kv_length, head_dim] + key_layer = torch.cat((past_key, key_layer), dim=-2) + value_layer = torch.cat((past_value, value_layer), dim=-2) + + kv_length = key_layer.shape[-2] + if use_cache: + present = (key_layer, value_layer) + else: + present = None + + if alibi is None: + if output_attentions: + attention_scores = query_layer @ key_layer.transpose(-1, -2) + attention_scores /= math.sqrt(self.head_dim) + + attention_scores = F.softmax(attention_scores + attention_mask, dim=-1, dtype=hidden_states.dtype) + # It is unclear why neither dropout nor head_mask is applied here (while it is with alibi). + attn_output = attention_scores @ value_layer + else: + if FusedSDPA: + with sdp_kernel(enable_recompute=False) if SDPContext else contextlib.nullcontext(): + attn_output = FusedSDPA.apply( + query_layer, + key_layer, + value_layer, + attention_mask, + 0.0, + # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1. + self.is_causal and attention_mask is None and query_length > 1, + ) + else: + # Workaround util scaled_dot_product_attention support broadcast. + if self.training is True and query_layer.shape != key_layer.shape: + key_layer = torch.broadcast_to(key_layer, query_layer.shape) + value_layer = torch.broadcast_to(value_layer, query_layer.shape) + attn_output = F.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + attention_mask, + 0.0, + # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1. 
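+ # (positional arguments above: attn_mask=attention_mask, dropout_p=0.0)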
+ is_causal=self.is_causal and attention_mask is None and query_length > 1, + ) + # Performance improvement for HPU + if self.training is True and htcore: + htcore.mark_step() + attention_scores = None + + attn_output = attn_output.view(batch_size, -1, query_length, self.head_dim) + attn_output = attn_output.permute(0, 2, 1, 3) + attn_output = attn_output.reshape(batch_size, query_length, -1) + + attn_output = self.dense(attn_output) + + if output_attentions: + return attn_output, present, attention_scores + else: + return attn_output, present + + else: + if self._use_sdpa and not output_attentions and head_mask is None: + if FusedSDPA: + with sdp_kernel(enable_recompute=False) if SDPContext else contextlib.nullcontext(): + attn_output = FusedSDPA.apply( + query_layer, + key_layer, + value_layer, + attention_mask, + self.attention_dropout.p if self.training else 0.0, + self.is_causal and attention_mask is None and query_length > 1, + ) + else: + attn_output = F.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + attn_mask=attention_mask, + dropout_p=self.attention_dropout.p if self.training else 0.0, + is_causal=self.is_causal and attention_mask is None and query_length > 1, + ) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) + + attn_output = self.dense(attn_output) + else: + matmul_result = query_layer @ key_layer.transpose(-1, -2) + + # change view to [batch_size, num_heads, q_length, kv_length] + attention_scores = matmul_result.view(batch_size, self.num_heads, query_length, kv_length) + + # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] + input_dtype = attention_scores.dtype + # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38` + if input_dtype == torch.float16 or input_dtype == torch.bfloat16: + attention_scores = attention_scores.to(torch.float32) + + attention_logits = attention_scores + alibi.view(batch_size, self.num_heads, 1, -1) + attention_logits *= self.inv_norm_factor + attention_probs = F.softmax(attention_logits + attention_mask, dim=-1, dtype=hidden_states.dtype) + # [batch_size, num_heads, q_length, kv_length] + attention_probs = self.attention_dropout(attention_probs) + + if head_mask is not None: + attention_probs = attention_probs * head_mask + + # change view [batch_size, num_heads, q_length, kv_length] + attention_probs_reshaped = attention_probs.view(batch_size, self.num_heads, query_length, kv_length) + + # matmul: [batch_size * num_heads, q_length, head_dim] + attn_output = (attention_probs_reshaped @ value_layer).flatten(0, 1) + + # change view [batch_size, q_length, num_heads * head_dim] + attn_output = self._merge_heads(attn_output) + + attn_output = self.dense(attn_output) + + if output_attentions: + return attn_output, present, attention_probs + else: + return attn_output, present + + +def gaudi_falcon_decoder_layer_forward( + self, + hidden_states: torch.Tensor, + alibi: Optional[torch.Tensor], + attention_mask: torch.Tensor, + position_ids: Optional[torch.LongTensor] = None, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + token_idx: Optional[torch.Tensor] = None, + **kwargs, +): + """ + Copied from FalconDecoderLayer.forward: 
https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py + The only differences are: + - add new args token_idx and position_ids + - add token_idx and position_ids into attention inputs + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + residual = hidden_states + + if self.config.new_decoder_architecture: + attention_layernorm_out = self.ln_attn(hidden_states) + mlp_layernorm_out = self.ln_mlp(hidden_states) + else: + attention_layernorm_out = self.input_layernorm(hidden_states) + + # Self attention. + attn_outputs = self.self_attention( + attention_layernorm_out, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + alibi=alibi, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + **kwargs, + ) + + attention_output = attn_outputs[0] + + if not self.config.new_decoder_architecture: + if self.config.parallel_attn: + mlp_layernorm_out = attention_layernorm_out + else: + residual = dropout_add(attention_output, residual, self.config.attention_dropout, training=self.training) + mlp_layernorm_out = self.post_attention_layernorm(residual) + + outputs = attn_outputs[1:] + + # MLP. + mlp_output = self.mlp(mlp_layernorm_out) + + if self.config.new_decoder_architecture or self.config.parallel_attn: + mlp_output += attention_output + + output = dropout_add(mlp_output, residual, self.config.hidden_dropout, training=self.training) + + if use_cache: + outputs = (output,) + outputs + else: + outputs = (output,) + outputs[1:] + + return outputs # hidden_states, present, attentions + + +class GaudiFalconModel(FalconModel): + """ + Inherits from FalconModel: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py + The only differences are: + - add new args token_idx and position_ids + - add token_idx and position_ids into decoder inputs + - set past_key_values_length=0 when token_idx is used (with static input shape) + - add new arg tgt_len to _expand_mask because past_key_values_length is no longer valid with token_idx + - use old version of _make_causal_mask to workaround toch.triu that is not supported in Synapse + """ + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same 
time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if past_key_values is None: + past_key_values = tuple([None] * len(self.h)) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + presents = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + # Compute alibi tensor: check build_alibi_tensor documentation + past_key_values_length = 0 + if past_key_values[0] is not None and token_idx is None: + past_key_values_length = past_key_values[0][0].shape[-2] + + if self.use_alibi: + mask = ( + torch.ones( + (batch_size, seq_length + past_key_values_length), device=inputs_embeds.device, dtype=torch.long + ) + if attention_mask is None + else attention_mask + ) + alibi = build_alibi_tensor(mask, self.num_heads, dtype=hidden_states.dtype) + else: + alibi = None + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0) + + # TODO: Due to perf degradation, disable spda_attn_mask + use_sdpa_attn_mask = False + + if self._use_sdpa and not output_attentions and use_sdpa_attn_mask: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + if alibi is None: + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + elif head_mask is None: + alibi = alibi.reshape(batch_size, -1, *alibi.shape[1:]) + + attention_mask_2d = attention_mask + # We don't call _prepare_4d_causal_attention_mask_for_sdpa as we need to mask alibi using the 4D attention_mask untouched. + attention_mask = _gaudi_prepare_4d_causal_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + # We take care to integrate alibi bias in the attention_mask here. + if attention_mask_2d is None: + attention_mask = alibi / math.sqrt(self.config.hidden_size // self.num_heads) + else: + attention_mask = torch.masked_fill( + alibi / math.sqrt(self.config.hidden_size // self.num_heads), + attention_mask < -1, + torch.finfo(alibi.dtype).min, + ) + + # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with the memory-efficient attention backend + # produces nans if sequences are completely unattended in the attention mask. Details: https://github.com/pytorch/pytorch/issues/110213 + if seq_length > 1: + attention_mask = GaudiAttentionMaskConverter._unmask_unattended( + attention_mask, attention_mask_2d, unmasked_value=0.0 + ) + else: + # PyTorch SDPA does not support head_mask, we fall back on the eager implementation in this case. 
+ attention_mask = _gaudi_prepare_4d_causal_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + else: + # 4d mask is passed through the layers + attention_mask = _gaudi_prepare_4d_causal_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape batch_size x num_heads x N x N + # head_mask has shape n_layer x batch x num_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + outputs = self._gradient_checkpointing_func( + block.__call__, + hidden_states, + alibi, + attention_mask, + position_ids, + head_mask[i], + layer_past, + use_cache, + output_attentions, + None, + ) + else: + outputs = block( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask[i], + use_cache=use_cache, + output_attentions=output_attentions, + alibi=alibi, + token_idx=token_idx, + ) + + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + + # Add last hidden state + hidden_states = self.ln_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class GaudiFalconForCausalLM(FalconForCausalLM): + """ + Inherits from FalconForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py + The only differences are: + - add new args token_idx and position_ids + - add token_idx and position_ids into model inputs + - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx + - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx + """ + + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + token_idx: Optional[torch.Tensor] = None, + **kwargs, + ) -> dict: + if past_key_values is not None: + if token_idx is not None: + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + else: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + # Note: versions of Falcon with alibi do not use position_ids. It is used with RoPE. 
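+ # Worked example for the block below: attention_mask [[0, 1, 1, 1]] gives
+ # cumsum(-1) - 1 == [[-1, 0, 1, 2]], masked_fill on the padded slot yields
+ # position_ids [[1, 0, 1, 2]], and with token_idx only that single column is kept.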
+ if ( + not self.transformer.use_alibi + and attention_mask is not None + and position_ids is None + and token_idx is not None + ): + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + if token_idx is not None: + position_ids = torch.index_select(position_ids, 1, token_idx - 1) + else: + position_ids = position_ids[:, -input_ids.shape[1] :] + + return { + "input_ids": input_ids, + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "token_idx": token_idx, + } + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + batch_size, seq_length, vocab_size = shift_logits.shape + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length) + ) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/__init__.py new file mode 100644 index 00000000000..7a23f947267 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/__init__.py @@ -0,0 +1 @@ +from .modeling_gpt2 import GaudiGPT2Attention, GaudiGPT2LMHeadModel, 
gaudi_gpt2_block_forward, gaudi_gpt2_forward diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/modeling_gpt2.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/modeling_gpt2.py new file mode 100644 index 00000000000..c48c71199b1 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/modeling_gpt2.py @@ -0,0 +1,573 @@ +from typing import Optional, Tuple, Union + +import torch +from torch.nn import CrossEntropyLoss +from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions +from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2LMHeadModel, logger + + +class GaudiGPT2Attention(GPT2Attention): + """ + Copied from GPT2Attention: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py + The only differences are: + - optimize KV cache + """ + + def _attn(self, query, key, value, attention_mask=None, head_mask=None): + key = key.contiguous() + value = value.contiguous() + attn_weights = torch.matmul(query, key.transpose(-1, -2)) + + if self.scale_attn_weights: + attn_weights = attn_weights / torch.full( + [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device + ) + + # Layer-wise attention scaling + if self.scale_attn_by_inverse_layer_idx: + attn_weights = attn_weights / float(self.layer_idx + 1) + + if not self.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length] + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
+ # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1) + + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise + attn_weights = attn_weights.type(value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + + return attn_output, attn_weights + + def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None): + key = key.contiguous() + value = value.contiguous() + # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM) + bsz, num_heads, q_seq_len, dk = query.size() + _, _, k_seq_len, _ = key.size() + + # Preallocate attn_weights for `baddbmm` + attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device) + + # Compute Scale Factor + scale_factor = 1.0 + if self.scale_attn_weights: + scale_factor /= float(value.size(-1)) ** 0.5 + + if self.scale_attn_by_inverse_layer_idx: + scale_factor /= float(self.layer_idx + 1) + + q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len) + attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor) + attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len) + + if not self.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length] + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
+ # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights, mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1) + + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise + if attn_weights.dtype != torch.float32: + raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32") + attn_weights = attn_weights.type(value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + + return attn_output, attn_weights + + def forward( + self, + hidden_states: Optional[Tuple[torch.FloatTensor]], + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + token_idx: Optional[torch.Tensor] = None, + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: + if encoder_hidden_states is not None: + if not hasattr(self, "q_attn"): + raise ValueError( + "If class is used as cross attention, the weights `q_attn` have to be defined. " + "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`." + ) + + query = self.q_attn(hidden_states) + key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) + attention_mask = encoder_attention_mask + else: + query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) + + query = self._split_heads(query, self.num_heads, self.head_dim).contiguous() + key = self._split_heads(key, self.num_heads, self.head_dim).contiguous() + value = self._split_heads(value, self.num_heads, self.head_dim).contiguous() + + if layer_past is not None: + past_key, past_value = layer_past + if token_idx is not None: + past_key.index_copy_(2, token_idx - 1, key) + past_value.index_copy_(2, token_idx - 1, value) + key = past_key + value = past_value + else: + key = torch.cat((past_key, key), dim=-2) + value = torch.cat((past_value, value), dim=-2) + + if use_cache is True: + present = (key, value) + else: + present = None + + if self.reorder_and_upcast_attn: + attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask) + else: + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) + attn_output = self.c_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs # a, present, (attentions) + + +def gaudi_gpt2_block_forward( + self, + hidden_states: Optional[Tuple[torch.FloatTensor]], + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: 
Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + token_idx: Optional[torch.Tensor] = None, +) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: + """ + Copied from GPT2Block.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py + The only differences are: + - add new args token_idx + """ + + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + + attn_outputs = self.attn( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + ) + attn_output = attn_outputs[0] # output_attn: a, present, (attentions) + outputs = attn_outputs[1:] + # residual connection + hidden_states = attn_output + residual + + if encoder_hidden_states is not None: + # add one self-attention block for cross-attention + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " + "cross-attention layers by setting `config.add_cross_attention=True`" + ) + residual = hidden_states + hidden_states = self.ln_cross_attn(hidden_states) + cross_attn_outputs = self.crossattention( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + attn_output = cross_attn_outputs[0] + # residual connection + hidden_states = residual + attn_output + outputs = outputs + cross_attn_outputs[2:] # add cross attentions if we output attention weights + + residual = hidden_states + hidden_states = self.ln_2(hidden_states) + + feed_forward_hidden_states = self.mlp(hidden_states) + # residual connection + hidden_states = residual + feed_forward_hidden_states + + if use_cache: + outputs = (hidden_states,) + outputs + else: + outputs = (hidden_states,) + outputs[1:] + + return outputs # hidden_states, present, (attentions, cross_attentions) + + +def gaudi_gpt2_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, +) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + """ + Copied from GPT2Model.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py + The only differences are: + - disable autocast for attention_mask + - add new args token_idx + """ + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is 
not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size = inputs_embeds.shape[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + + if past_key_values is None: + past_length = 0 + past_key_values = tuple([None] * len(self.h)) + else: + past_length = past_key_values[0][0].size(-2) + if position_ids is None: + position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0) + + # GPT2Attention mask. + if attention_mask is not None: + if batch_size <= 0: + raise ValueError("batch_size has to be defined and > 0") + attention_mask = attention_mask.view(batch_size, -1) + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask[:, None, None, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
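+ # Gaudi-specific change (see docstring): the additive mask below is computed with autocast
+ # disabled on HPU, presumably so that (1.0 - attention_mask) * torch.finfo(self.dtype).min
+ # keeps the full self.dtype range instead of being downcast under mixed precision.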
+ attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility + + with torch.autocast(enabled=False, device_type="hpu"): + attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.add_cross_attention and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + position_embeds = self.wpe(position_ids) + hidden_states = inputs_embeds + position_embeds + + if token_type_ids is not None: + token_type_embeds = self.wte(token_type_ids) + hidden_states = hidden_states + token_type_embeds + + hidden_states = self.drop(hidden_states) + + output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + presents = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + all_hidden_states = () if output_hidden_states else None + for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + # Model parallel + if self.model_parallel: + torch.cuda.set_device(hidden_states.device) + # Ensure layer_past is on same device as hidden_states (might not be correct) + if layer_past is not None: + layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past) + # Ensure that attention_mask is always on the same device as hidden_states + if attention_mask is not None: + attention_mask = attention_mask.to(hidden_states.device) + if isinstance(head_mask, torch.Tensor): + head_mask = head_mask.to(hidden_states.device) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + outputs = self._gradient_checkpointing_func( + block.__call__, + hidden_states, + None, + attention_mask, + head_mask[i], + encoder_hidden_states, + encoder_attention_mask, + use_cache, + output_attentions, + None, + ) + else: + outputs = block( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask[i], + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + ) + + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache 
else 2],) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and "cuda:" + str(k) != self.last_device: + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.ln_f(hidden_states) + + hidden_states = hidden_states.view(output_shape) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions] + if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class GaudiGPT2LMHeadModel(GPT2LMHeadModel): + """ + Copied from GPT2LMHeadModel: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py + The only differences are: + - add new args token_idx + """ + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, inputs_embeds=None, token_idx=None, **kwargs + ): + token_type_ids = kwargs.get("token_type_ids", None) + # Omit tokens covered by past_key_values + if past_key_values: + if token_idx is not None: + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + else: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -input_ids.shape[1] :] + + attention_mask = kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + if token_idx is not None: + position_ids = torch.index_select(position_ids, 1, token_idx - 1) + else: + position_ids = position_ids[:, -input_ids.shape[1] :] + + else: + position_ids = None + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + model_inputs.update( + { + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + "token_idx": token_idx, + } + ) + return model_inputs + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: 
Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) + hidden_states = transformer_outputs[0] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.transformer.first_device) + hidden_states = hidden_states.to(self.lm_head.weight.device) + + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + cross_attentions=transformer_outputs.cross_attentions, + ) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/__init__.py new file mode 100644 index 00000000000..556f61f8c71 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/__init__.py @@ -0,0 +1,6 @@ +from .modeling_gpt_bigcode import ( + GaudiGPTBigCodeForCausalLM, + gaudi_gpt_bigcode_attention_forward, + gaudi_gpt_bigcode_block_forward, + gaudi_gpt_bigcode_model_forward, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py new file mode 100644 index 00000000000..8059e338062 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -0,0 +1,494 @@ +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss +from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions +from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeForCausalLM + +from ..modeling_attn_mask_utils import GaudiAttentionMaskConverter + + +def gaudi_gpt_bigcode_attention_forward( + self, + hidden_states: torch.Tensor, + layer_past: Optional[torch.Tensor] = None, + attention_mask: 
Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + token_idx: Optional[torch.Tensor] = None, +) -> Union[ + Tuple[torch.Tensor, Optional[torch.Tensor]], + Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]], +]: + """ + Copied from GPTBigCodeAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py + The only differences are: + - add new args token_idx + - optimize KV cache + """ + if encoder_hidden_states is not None: + if not hasattr(self, "q_attn") or not self.is_cross_attention: + raise ValueError( + "If class is used as cross attention, the weights `q_attn` have to be defined. " + "Please make sure to instantiate class with `GPTBigCodeAttention(..., is_cross_attention=True)`." + ) + + query = self.q_attn(hidden_states) + key_value = self.c_attn(encoder_hidden_states) + attention_mask = encoder_attention_mask + elif self.multi_query: + query, key_value = self.c_attn(hidden_states).split((self.embed_dim, 2 * self.kv_dim), dim=2) + else: + # Note: We split as (self.num_heads, 3, self.head_dim) instead of (3, self.num_heads, self.head_dim), + # i.e., the memory layout is not the same as GPT2. + # This makes the concatenation with past_key_value more efficient. + query, key_value = ( + self.c_attn(hidden_states) + .view(*hidden_states.shape[:2], self.num_heads, 3 * self.head_dim) + .transpose(1, 2) + .split((self.head_dim, 2 * self.head_dim), dim=3) + ) + + key, value = key_value.split((self.head_dim, self.head_dim), dim=-1) + + if layer_past is not None: + past_key, past_value = layer_past.split((self.head_dim, self.head_dim), dim=-1) + if token_idx is not None: + key = past_key.index_add_(1, token_idx - 1, key - torch.index_select(past_key, 1, token_idx - 1)) + value = past_value.index_add_(1, token_idx - 1, value - torch.index_select(past_value, 1, token_idx - 1)) + else: + key = torch.cat((past_key, key), dim=-2) + value = torch.cat((past_value, value), dim=-2) + present = torch.cat((key, value), dim=-1) if use_cache else None + + attn_output, attn_weights = self._attn(query, key.transpose(-1, -2), value, attention_mask, head_mask) + + if not self.multi_query: + attn_output = attn_output.transpose(1, 2).reshape(hidden_states.shape) + attn_output = self.c_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + outputs = (attn_output, present) + if output_attentions: + if self.multi_query: + # Transpose to return weights in the usual format (batch_size, num_heads, query_length, key_length) + attn_weights = attn_weights.transpose(1, 2) + outputs += (attn_weights,) + + return outputs # a, present, (attentions) + + +def gaudi_gpt_bigcode_block_forward( + self, + hidden_states: Optional[Tuple[torch.Tensor]], + layer_past: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + token_idx: Optional[torch.Tensor] = None, +) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: + """ + Copied from GPTBigCodeBlock.forward: 
https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py + The only differences are: + - add new args token_idx + """ + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_outputs = self.attn( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + ) + attn_output = attn_outputs[0] # output_attn: a, present, (attentions) + outputs = attn_outputs[1:] + # residual connection + hidden_states = attn_output + residual + + if encoder_hidden_states is not None: + # add one self-attention block for cross-attention + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " + "cross-attention layers by setting `config.add_cross_attention=True`" + ) + residual = hidden_states + hidden_states = self.ln_cross_attn(hidden_states) + cross_attn_outputs = self.crossattention( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + attn_output = cross_attn_outputs[0] + # residual connection + hidden_states = residual + attn_output + outputs = outputs + cross_attn_outputs[2:] # add cross attentions if we output attention weights + + residual = hidden_states + hidden_states = self.ln_2(hidden_states) + feed_forward_hidden_states = self.mlp(hidden_states) + # residual connection + hidden_states = residual + feed_forward_hidden_states + + if use_cache: + outputs = (hidden_states,) + outputs + else: + outputs = (hidden_states,) + outputs[1:] + + return outputs # hidden_states, present, (attentions, cross_attentions) + + +def gaudi_gpt_bigcode_model_forward( + self, + input_ids: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, +) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + """ + Copied from GPTBigCodeModel.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py + The only differences are: + - add new args token_idx + - if token_idx and past_key_values are passed, set self_attention_mask based on the static shape of past_key_values + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not 
None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size = inputs_embeds.shape[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if batch_size <= 0: + raise ValueError("batch_size has to be defined and > 0") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + + if past_key_values is None: + past_length = 0 + past_key_values = tuple([None] * len(self.h)) + else: + past_length = past_key_values[0].size(-2) + + if attention_mask is not None and len(attention_mask.shape) == 2 and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_length > 0: + position_ids = position_ids[:, past_length : input_shape[-1] + past_length :] + elif position_ids is None: + position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0) + + # Self-attention mask. + query_length = input_shape[-1] + key_length = past_length + query_length + if past_length > 0 and token_idx is not None: + self_attention_mask = self.bias[None, past_length - 1 : past_length, :past_length] + else: + self_attention_mask = self.bias[None, key_length - query_length : key_length, :key_length] + + if attention_mask is not None: + self_attention_mask = self_attention_mask * attention_mask.view(batch_size, 1, -1).to( + dtype=torch.bool, device=self_attention_mask.device + ) + + # MQA models: (batch_size, query_length, n_heads, key_length) + # MHA models: (batch_size, n_heads, query_length, key_length) + self_attention_mask = self_attention_mask.unsqueeze(2 if self.multi_query else 1) + + if self._use_sdpa and head_mask is None and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + if self.multi_query: + # gpt_bigcode using MQA has the bad taste to use a causal mask with shape + # [batch_size, target_length, 1, source_length], not compatible with SDPA, hence this transpose. + self_attention_mask = self_attention_mask.transpose(1, 2) + + if query_length > 1 and attention_mask is not None: + # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with the memory-efficient attention backend + # produces nans if sequences are completely unattended in the attention mask. Details: https://github.com/pytorch/pytorch/issues/110213 + self_attention_mask = GaudiAttentionMaskConverter._unmask_unattended( + self_attention_mask, attention_mask, unmasked_value=True + ) + + # SDPA with a custom mask is much faster in fp16/fp32 dtype rather than bool. Cast here to floating point instead of at every layer. 
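# The mask handling just below converts the boolean self-attention mask into an additive floating-point
# mask once, instead of re-casting it inside every layer. A minimal standalone sketch of that conversion
# (toy shapes and an assumed bfloat16 dtype; illustrative only, not the patch's own tensors):
import torch

keep = torch.tensor([[True, True, False]])                    # True = position may be attended to
dtype = torch.bfloat16                                        # assumed to match the model weights
additive_mask = torch.where(
    keep,
    torch.zeros((), dtype=dtype),                             # 0.0 where attention is allowed
    torch.full((), torch.finfo(dtype).min, dtype=dtype),      # large negative value where it is masked
)
# `additive_mask` is then simply added to the attention scores before the softmax.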
+ dtype = self.wte.weight.dtype + self_attention_mask = torch.where( + self_attention_mask, + torch.full([], 0.0, dtype=dtype, device=self_attention_mask.device), + torch.full([], torch.finfo(self.wte.weight.dtype).min, dtype=dtype, device=self_attention_mask.device), + ) + + attention_mask = self_attention_mask + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.add_cross_attention and encoder_hidden_states is not None and encoder_attention_mask is not None: + if encoder_attention_mask.dim() == 2: + encoder_attention_mask.unsqueeze(1) + assert encoder_attention_mask.dim() == 3 + encoder_attention_mask = encoder_attention_mask.bool().unsqueeze(2 if self.multi_query else 1) + else: + encoder_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + position_embeds = self.wpe(position_ids) + hidden_states = inputs_embeds + position_embeds + + if token_type_ids is not None: + token_type_embeds = self.wte(token_type_ids) + hidden_states = hidden_states + token_type_embeds + + hidden_states = self.drop(hidden_states) + + output_shape = input_shape + (hidden_states.size(-1),) + + presents = [] if use_cache else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + all_hidden_states = () if output_hidden_states else None + for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + outputs = self._gradient_checkpointing_func( + block.__call__, + hidden_states, + None, + attention_mask, + head_mask[i], + encoder_hidden_states, + encoder_attention_mask, + use_cache, + output_attentions, + None, + ) + else: + outputs = block( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask[i], + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + ) + + hidden_states = outputs[0] + if use_cache: + presents.append(outputs[1]) + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],) + + hidden_states = self.ln_f(hidden_states) + + hidden_states = hidden_states.view(output_shape) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions] + if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class GaudiGPTBigCodeForCausalLM(GPTBigCodeForCausalLM): + """ + Inherits from GPTBigCodeForCausalLM: 
https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py + The only differences are: + - add new args token_idx + - add token_idx into model_inputs + - when KV cache is enabled, slice next_input_ids from input_ids based on the token_idx + - when KV cache is enabled, slice next_position_ids from position_ids based on the token_idx + """ + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, inputs_embeds=None, token_idx=None, **kwargs + ): + token_type_ids = kwargs.get("token_type_ids", None) + # Omit tokens covered by past_key_values + if past_key_values: + if token_idx is not None: + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + if token_type_ids is not None: + token_type_ids = torch.index_select(token_type_ids, 1, token_idx - 1) + else: + if self.config.multi_query: + past_length = past_key_values[0].shape[1] + else: + past_length = past_key_values[0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -input_ids.shape[1] :] + + attention_mask = kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + if token_idx is not None: + position_ids = torch.index_select(position_ids, 1, token_idx - 1) + else: + position_ids = position_ids[:, -input_ids.shape[1] :] + else: + position_ids = None + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + "token_idx": token_idx, + } + ) + return model_inputs + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + r""" + labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous().to(shift_logits.device) + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + cross_attentions=transformer_outputs.cross_attentions, + ) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/__init__.py new file mode 100644 index 00000000000..cceb114b826 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/__init__.py @@ -0,0 +1,7 @@ +from .modeling_gpt_neox import ( + GaudiGPTNeoXForCausalLM, + gaudi_gpt_neox_attention_forward, + gaudi_gpt_neox_layer_forward, + gaudi_gpt_neox_model_forward, + gaudi_gpt_neox_rotary_embedding_set_cos_sin_cache, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/modeling_gpt_neox.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/modeling_gpt_neox.py new file mode 100644 index 00000000000..08f34333777 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/modeling_gpt_neox.py @@ -0,0 +1,429 @@ +from typing import Optional, Tuple, Union + +import torch +from torch.nn import CrossEntropyLoss +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM, apply_rotary_pos_emb, logger + + +try: + from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 as FusedRoPE +except ImportError: + print("Not using HPU fused kernel for apply_rotary_pos_emb") + FusedRoPE = None + + +def gaudi_gpt_neox_attention_forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: torch.FloatTensor, + position_ids: torch.LongTensor, + head_mask: Optional[torch.FloatTensor] = None, + layer_past: Optional[Tuple[torch.Tensor]] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + 
padding_mask: Optional[torch.Tensor] = None, + token_idx: Optional[torch.Tensor] = None, +): + """ + Copied from GPTNeoXAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py + The only differences are: + - add new args token_idx + - optimize KV cache + """ + # Workaround till FusedRoPE is fixed + global FusedRoPE + if self.training and FusedRoPE is not None: + FusedRoPE = None + + has_layer_past = layer_past is not None + + # Compute QKV + # Attention heads [batch, seq_len, hidden_size] + # --> [batch, seq_len, (np * 3 * head_size)] + qkv = self.query_key_value(hidden_states) + + # [batch, seq_len, (num_heads * 3 * head_size)] + # --> [batch, seq_len, num_heads, 3 * head_size] + new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size) + qkv = qkv.view(*new_qkv_shape) + + # [batch, seq_len, num_attention_heads, 3 * head_size] --> 3 [batch, num_attention_heads, seq_len, head_size] + query = qkv[..., : self.head_size].permute(0, 2, 1, 3) + key = qkv[..., self.head_size : 2 * self.head_size].permute(0, 2, 1, 3) + value = qkv[..., 2 * self.head_size :].permute(0, 2, 1, 3) + + # Compute rotary embeddings on rotary_ndims + query_rot = query[..., : self.rotary_ndims] + query_pass = query[..., self.rotary_ndims :] + key_rot = key[..., : self.rotary_ndims] + key_pass = key[..., self.rotary_ndims :] + + # Compute token offset for rotary embeddings (when decoding) + seq_len = key.shape[-2] + if has_layer_past: + seq_len += layer_past[0].shape[-2] + cos, sin = self.rotary_emb(value, seq_len=seq_len) + query, key = apply_customized_rope(query_rot, key_rot, cos, sin, position_ids) + query = torch.cat((query, query_pass), dim=-1).contiguous() + key = torch.cat((key, key_pass), dim=-1).contiguous() + value = value.contiguous() + + # Cache QKV values + if has_layer_past: + past_key = layer_past[0] + past_value = layer_past[1] + if token_idx is not None: + past_key.index_copy_(2, token_idx - 1, key) + past_value.index_copy_(2, token_idx - 1, value) + key = past_key + value = past_value + else: + key = torch.cat((past_key, key), dim=-2) + value = torch.cat((past_value, value), dim=-2) + + present = (key, value) if use_cache else None + + # Compute attention + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + # Reshape outputs + attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_size) + attn_output = self.dense(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +def gaudi_gpt_neox_layer_forward( + self, + hidden_states: Optional[torch.FloatTensor], + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + layer_past: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + token_idx: Optional[torch.Tensor] = None, +): + """ + Copied from GPTNeoxLayer.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py + The only differences are: + - add new args token_idx + """ + attention_layer_outputs = self.attention( + self.input_layernorm(hidden_states), + attention_mask=attention_mask, + position_ids=position_ids, + layer_past=layer_past, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + ) + 
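# The attention call above uses the token_idx scheme shared by these Gaudi ports: during decoding, the new
# key/value for the current step is written in place into a pre-allocated cache (index_copy_ at
# token_idx - 1) rather than concatenated onto a growing one, so tensor shapes stay static on HPU.
# A minimal sketch with assumed toy shapes (hypothetical tensors, not the function's own variables):
import torch

bsz, heads, max_len, head_dim = 1, 8, 128, 64
past_key = torch.zeros(bsz, heads, max_len, head_dim)   # pre-allocated cache whose shape never changes
new_key = torch.randn(bsz, heads, 1, head_dim)          # key of the single token being decoded
token_idx = torch.tensor([17])                          # 1-based position of the current token
past_key.index_copy_(2, token_idx - 1, new_key)         # in-place write at slot 16 along the sequence dim
# A torch.cat-based cache would change shape on every step; keeping the shape fixed is what allows HPU
# graphs to be reused across decoding steps.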
attn_output = attention_layer_outputs[0] # output_attn: attn_output, present, (attn_weights) + attn_output = self.post_attention_dropout(attn_output) + outputs = attention_layer_outputs[1:] + + if self.use_parallel_residual: + # pseudocode: + # x = x + attn(ln1(x)) + mlp(ln2(x)) + mlp_output = self.mlp(self.post_attention_layernorm(hidden_states)) + mlp_output = self.post_mlp_dropout(mlp_output) + hidden_states = mlp_output + attn_output + hidden_states + else: + # pseudocode: + # x = x + attn(ln1(x)) + # x = x + mlp(ln2(x)) + attn_output = attn_output + hidden_states + mlp_output = self.mlp(self.post_attention_layernorm(attn_output)) + mlp_output = self.post_mlp_dropout(mlp_output) + hidden_states = mlp_output + attn_output + + if use_cache: + outputs = (hidden_states,) + outputs # hidden_states, present, (attn_weights) + else: + outputs = (hidden_states,) + outputs[1:] # hidden_states, (attn_weights) + + return outputs + + +def gaudi_gpt_neox_model_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, +) -> Union[Tuple, BaseModelOutputWithPast]: + """ + Copied from GPTNeoxModel.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py + The only differences are: + - add new args token_idx + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + + if past_key_values is None: + past_length = 0 + past_key_values = tuple([None] * self.config.num_hidden_layers) + else: + past_length = past_key_values[0][0].size(-2) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange(past_length, seq_length + past_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0) + + # Attention mask. + if attention_mask is not None: + assert batch_size > 0, "batch_size has to be defined and > 0" + attention_mask = attention_mask.view(batch_size, -1) + # We create a 3D attention mask from a 2D tensor mask. 
+ # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask[:, None, None, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + if inputs_embeds is None: + inputs_embeds = self.embed_in(input_ids) + + hidden_states = self.emb_dropout(inputs_embeds) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + presents = () if use_cache else None + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + for i, (layer, layer_past) in enumerate(zip(self.layers, past_key_values)): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + outputs = self._gradient_checkpointing_func( + layer.__call__, + hidden_states, + attention_mask, + position_ids, + head_mask[i], + use_cache, + None, + output_attentions, + None, + ) + else: + outputs = layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask[i], + layer_past=layer_past, + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + ) + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + if output_attentions: + all_attentions = all_attentions + (outputs[2 if use_cache else 1],) + + hidden_states = self.final_layer_norm(hidden_states) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) + + +class GaudiGPTNeoXForCausalLM(GPTNeoXForCausalLM): + """ + Inherits from GPTNeoXForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt_neox/modeling_gpt_neox.py + The only differences are: + - add new args token_idx + - add token_idx into model_inputs + - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx + - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx + """ + + def forward( + 
self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.gpt_neox( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) + + hidden_states = outputs[0] + lm_logits = self.embed_out(hidden_states) + + lm_loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + # we are doing next-token prediction; shift prediction scores and input ids by one + shift_logits = lm_logits[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithPast( + loss=lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, token_idx=None, **kwargs + ): + input_shape = input_ids.shape + + # cut decoder_input_ids if past is used + if past_key_values is not None: + if token_idx is not None: + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + else: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + if token_idx is not None: + position_ids = torch.index_select(position_ids, 1, token_idx - 1) + else: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + model_inputs.update( + { + "attention_mask": 
attention_mask, + "past_key_values": past_key_values, + "position_ids": position_ids, + "token_idx": token_idx, + } + ) + + return model_inputs + + +def gaudi_gpt_neox_rotary_embedding_set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.cos_cached = emb.cos() + self.sin_cached = emb.sin() + + +def apply_customized_rope(q, k, cos, sin, position_ids): + if q.device.type == "hpu" and FusedRoPE: + return FusedRoPE.apply( + q, cos.unsqueeze(0).unsqueeze(0), sin.unsqueeze(0).unsqueeze(0), position_ids + ), FusedRoPE.apply(k, cos.unsqueeze(0).unsqueeze(0), sin.unsqueeze(0).unsqueeze(0), position_ids) + else: + return apply_rotary_pos_emb(q, k, cos, sin, position_ids) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/__init__.py new file mode 100644 index 00000000000..9b3b6a64340 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/__init__.py @@ -0,0 +1,6 @@ +from .modeling_gptj import ( + GaudiGPTJAttention, + GaudiGPTJForCausalLM, + gaudi_gptj_block_forward, + gaudi_gptj_model_forward, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py new file mode 100644 index 00000000000..cc08d4d2c87 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py @@ -0,0 +1,530 @@ +from typing import Optional, Tuple, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from transformers.models.gptj.modeling_gptj import ( + GPTJAttention, + GPTJForCausalLM, + apply_rotary_pos_emb, + create_sinusoidal_positions, + logger, +) + + +class GaudiGPTJAttention(GPTJAttention): + def _attn( + self, + query, + key, + value, + attention_mask=None, + head_mask=None, + ): + # compute causal mask from causal mask buffer + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length] + + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() + + attn_weights = torch.matmul(query, key.transpose(-1, -2)) + + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
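# The two comments around this point note that the fill value for torch.where must be a tensor with the
# same dtype and device as the attention scores. A self-contained toy version of the same causal masking
# (hypothetical sizes; the real module takes the causal pattern from its registered `bias` buffer):
import torch

q_len = k_len = 4
scores = torch.randn(1, 1, q_len, k_len)                               # toy attention scores, fp32
causal_mask = torch.tril(torch.ones(q_len, k_len, dtype=torch.bool))   # True where attending is allowed
mask_value = torch.tensor(torch.finfo(scores.dtype).min,
                          dtype=scores.dtype, device=scores.device)    # a tensor, not a Python float
masked_scores = torch.where(causal_mask, scores, mask_value)           # future positions get dtype-min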
+ # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype, device=attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights, mask_value) + + attn_weights = attn_weights / self.scale_attn + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + attn_weights = attn_weights.to(value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + + return attn_output, attn_weights + + def forward( + self, + hidden_states: torch.FloatTensor, + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + token_idx: Optional[torch.Tensor] = None, + sin: Optional[torch.Tensor] = None, + cos: Optional[torch.Tensor] = None, + ) -> Union[ + Tuple[torch.Tensor, Tuple[torch.Tensor]], + Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]], + ]: + """ + Copied from GPTJAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gptj/modeling_gptj.py + The only differences are: + - add new args token_idx + - remove is_torch_fx_proxy + - optimize KV cache + - pass sin and cos from upper level as they are identical for each attn block + """ + query = self.q_proj(hidden_states) + key = self.k_proj(hidden_states) + value = self.v_proj(hidden_states) + + query = self._split_heads(query, self.num_attention_heads, self.head_dim, True).contiguous() + key = self._split_heads(key, self.num_attention_heads, self.head_dim, True).contiguous() + value = self._split_heads(value, self.num_attention_heads, self.head_dim, False).contiguous() + + if self.rotary_dim is not None: + k_rot = key[:, :, :, : self.rotary_dim] + k_pass = key[:, :, :, self.rotary_dim :] + + q_rot = query[:, :, :, : self.rotary_dim] + q_pass = query[:, :, :, self.rotary_dim :] + # Note: it appears that if we use bf16 RoPE(whether use fused kernel or not), there could be acc issue, hence use fp32 RoPE here Fused kernel feasibility needs to be confirmed in the future + k_rot = apply_rotary_pos_emb(k_rot.to(torch.float32), sin, cos).to(torch.bfloat16) + q_rot = apply_rotary_pos_emb(q_rot.to(torch.float32), sin, cos).to(torch.bfloat16) + + key = torch.cat([k_rot, k_pass], dim=-1) + query = torch.cat([q_rot, q_pass], dim=-1) + else: + key = apply_rotary_pos_emb(key.to(torch.float32), sin, cos).to(torch.bfloat16) + query = apply_rotary_pos_emb(query.to(torch.float32), sin, cos).to(torch.bfloat16) + + key = key.permute(0, 2, 1, 3).contiguous() + query = query.permute(0, 2, 1, 3).contiguous() + + if layer_past is not None: + past_key = layer_past[0] + past_value = layer_past[1] + + if token_idx is not None: + past_key.index_copy_(2, token_idx - 1, key) + past_value.index_copy_(2, token_idx - 1, value) + key = past_key + value = past_value + else: + key = torch.cat([past_key, key], dim=-2) + value = torch.cat([past_value, value], dim=-2) + + if use_cache is True: + # Note that this cast is quite ugly, but is not implemented before ROPE as the original codebase keeps the key in float32 all along 
the computation. + # Reference: https://github.com/kingoflolz/mesh-transformer-jax/blob/f8315e3003033b23f21d78361b288953064e0e76/mesh_transformer/layers.py#L128 + present = (key.to(hidden_states.dtype), value) + else: + present = None + + # compute self-attention: V x Softmax(QK^T) + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim) + attn_output = self.out_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs # a, present, (attentions) + + +def gaudi_gptj_block_forward( + self, + hidden_states: Optional[torch.FloatTensor], + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + token_idx: Optional[torch.Tensor] = None, + sin: Optional[torch.Tensor] = None, + cos: Optional[torch.Tensor] = None, +) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: + """ + Copied from GPTJBlock.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gptj/modeling_gptj.py + The only differences are: + - add new args token_idx + - pass sin and cos from upper level as they are identical for each attn block + """ + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_outputs = self.attn( + hidden_states=hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + sin=sin, + cos=cos, + ) + attn_output = attn_outputs[0] # output_attn: a, present, (attentions) + outputs = attn_outputs[1:] + + feed_forward_hidden_states = self.mlp(hidden_states) + hidden_states = attn_output + feed_forward_hidden_states + residual + + if use_cache: + outputs = (hidden_states,) + outputs + else: + outputs = (hidden_states,) + outputs[1:] + + return outputs # hidden_states, present, (attentions) + + +def gaudi_gptj_model_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + sin: Optional[torch.Tensor] = None, + cos: Optional[torch.Tensor] = None, +) -> Union[Tuple, BaseModelOutputWithPast]: + """ + Copied from GPTJModel.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gptj/modeling_gptj.py + The only differences are: + - add new args token_idx + - pass sin and cos from upper level as they are identical for each attn block + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else 
self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size = inputs_embeds.shape[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + + if past_key_values is None: + past_length = 0 + past_key_values = tuple([None] * len(self.h)) + else: + past_length = past_key_values[0][0].size(-2) + + if position_ids is None: + position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0) + + # Attention mask. + if attention_mask is not None: + if batch_size <= 0: + raise ValueError("batch_size has to be defined and > 0") + attention_mask = attention_mask.view(batch_size, -1) + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask[:, None, None, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x num_attention_heads x N x N + # head_mask has shape n_layer x batch x num_attention_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + + hidden_states = inputs_embeds + + if token_type_ids is not None: + token_type_embeds = self.wte(token_type_ids) + hidden_states = hidden_states + token_type_embeds + + hidden_states = self.drop(hidden_states) + + output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + presents = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + # replace original `_get_embed_positions` method and sin cos calculation in the attn block here to improve perf + rotary_dim = self.config.rotary_dim + embed_dim = self.config.hidden_size + pos_embd_dim = rotary_dim or embed_dim + max_positions = self.config.max_position_embeddings + embed_positions = create_sinusoidal_positions(max_positions, pos_embd_dim).to(torch.bfloat16) + embed_positions = embed_positions.repeat(position_ids.shape[0], 1, 1) + if embed_positions.device != position_ids.device: + embed_positions = embed_positions.to(position_ids.device) + repeated_position_ids = position_ids.unsqueeze(-1).repeat(1, 1, embed_positions.shape[-1]) + sincos = torch.gather(embed_positions, 1, repeated_position_ids) + sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1) + sin = sin.contiguous() + cos = cos.contiguous() + + for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + # Model parallel + if self.model_parallel: + torch.cuda.set_device(hidden_states.device) + # Ensure layer_past is on same device as hidden_states (might not be correct) + if layer_past is not None: + layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past) + # Ensure that attention_mask is always on the same device as hidden_states + if attention_mask is not None: + attention_mask = attention_mask.to(hidden_states.device) + if isinstance(head_mask, torch.Tensor): + head_mask = head_mask.to(hidden_states.device) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + outputs = self._gradient_checkpointing_func( + block.__call__, + hidden_states, + None, + attention_mask, + position_ids, + head_mask[i], + use_cache, + output_attentions, + None, + sin, + cos, + ) + else: + outputs = block( + hidden_states=hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask[i], + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + sin=sin, + cos=cos, + ) + + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and "cuda:" + str(k) != self.last_device: + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.ln_f(hidden_states) + + hidden_states = hidden_states.view(output_shape) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class GaudiGPTJForCausalLM(GPTJForCausalLM): + """ + Inherits from GPTJForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gptj/modeling_gptj.py + The only differences are: + - add new args token_idx + - add token_idx into model_inputs + - from step2 when 
enable KV cache, slice next_input_ids from input_ids base on the token_idx + - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx + - from step2 when enable KV cache, slice next_token_type_ids from token_type_ids base on the token_idx + """ + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, inputs_embeds=None, token_idx=None, **kwargs + ): + token_type_ids = kwargs.get("token_type_ids", None) + # Omit tokens covered by past_key_values + if past_key_values: + if token_idx is not None: + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + else: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + if token_type_ids is not None: + if token_idx is not None: + token_type_ids = torch.index_select(token_type_ids, 1, token_idx - 1) + else: + token_type_ids = token_type_ids[:, -input_ids.shape[1] :] + + attention_mask = kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + if token_idx is not None: + position_ids = torch.index_select(position_ids, 1, token_idx - 1) + else: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + "token_idx": token_idx, + } + ) + + return model_inputs + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) + hidden_states = transformer_outputs[0] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.transformer.first_device) + hidden_states = hidden_states.to(self.lm_head.weight.device) + + # make sure sampling in fp16 works correctly and + # compute loss in fp32 to match with mesh-tf version + # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179 + lm_logits = self.lm_head(hidden_states).to(torch.float32) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + loss = loss.to(hidden_states.dtype) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/__init__.py new file mode 100644 index 00000000000..20703ffd095 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/__init__.py @@ -0,0 +1,11 @@ +from .modeling_llama import ( + GaudiLlamaAttention, + GaudiLlamaDecoderLayer, + GaudiLlamaDynamicNTKScalingRotaryEmbedding, + GaudiLlamaForCausalLM, + GaudiLlamaLinearScalingRotaryEmbedding, + GaudiLlamaMLP, + GaudiLlamaModel, + GaudiLlamaRotaryEmbedding, + gaudi_llama_rmsnorm_forward, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py new file mode 100755 index 00000000000..fdf6d6c8639 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py @@ -0,0 +1,1029 @@ +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from transformers.cache_utils import Cache, DynamicCache, StaticCache +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from transformers.models.llama.configuration_llama import LlamaConfig +from transformers.models.llama.modeling_llama import ( + 
LlamaAttention, + LlamaDecoderLayer, + LlamaForCausalLM, + LlamaMLP, + LlamaModel, + LlamaRMSNorm, + apply_rotary_pos_emb, + logger, +) + +from ..modeling_attn_mask_utils import ( + _gaudi_prepare_4d_causal_attention_mask, +) + + +try: + from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 as FusedRoPE + + has_fused_rope = True +except ImportError: + has_fused_rope = False + print("Not using HPU fused kernel for apply_rotary_pos_emb") + +try: + from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as FusedRMSNorm +except ImportError: + print("Not using HPU fused kernel for RMSNorm") + FusedRMSNorm = None + +try: + from habana_frameworks.torch.hpex.kernels import FusedSDPA +except ImportError: + print("Not using HPU fused scaled dot-product attention kernel.") + FusedSDPA = None + + +def update(prev, cur, dim, idx, inp_seq_len): + orig_cur = cur + if prev.dtype == torch.float8_e4m3fn: + from habana_frameworks.torch.hpex.kernels.Fp8Ops import cast_to_fp8_v2 + + cur = cast_to_fp8_v2(cur, None, False, False, prev.dtype)[0] + if cur.shape[2] > 1 and cur.shape[2] <= prev.shape[2]: + # Initialize + prev[:, :, :inp_seq_len, :].copy_(cur) + return orig_cur + assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. prev:{prev.shape} cur:{cur.shape}" + if idx is not None: + prev.index_copy_(dim, idx - 1, cur) + prev_cast = prev.to(orig_cur.dtype) + return prev_cast + else: + return torch.cat((prev, cur), dim=dim) + + +def gaudi_llama_rmsnorm_forward(self, hidden_states): + """ + Copied from LlamaRMSNorm.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py + The only differences are: + - override RMSNorm with Habana fused RMSNorm + """ + if hidden_states.device.type == "hpu" and FusedRMSNorm: + # mixed dtypes are not good for FusedRMSNorm, both inputs need to have same dtype + if hidden_states.dtype != self.weight.dtype: + orig_dtype = hidden_states.dtype + hidden_states = FusedRMSNorm.apply(hidden_states.to(self.weight.dtype), self.weight, self.variance_epsilon) + return hidden_states.to(orig_dtype) + else: + hidden_states = FusedRMSNorm.apply(hidden_states, self.weight, self.variance_epsilon) + return hidden_states + else: + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +class GaudiLlamaMLP(LlamaMLP): + def pre_mlp_forward(self, x): + if self.config.pretraining_tp > 1: + slice = self.intermediate_size // self.config.pretraining_tp + gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) + up_proj_slices = self.up_proj.weight.split(slice, dim=0) + down_proj_slices = self.down_proj.weight.split(slice, dim=1) + + gate_proj = torch.cat( + [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 + ) + up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) + + intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) + down_proj = [ + F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) + ] + output = sum(down_proj) + else: + input = self.act_fn(self.gate_proj(x)) * self.up_proj(x) + output = self.down_proj(input) + return output + + def mlp_all_reduce(self, x): + if hasattr(self.down_proj, "all_reduce"): + 
self.down_proj.all_reduce(x) + + def post_mlp_forward(self, x): + if self.config.pretraining_tp > 1: + return x + if hasattr(self.down_proj, "post_all_reduce"): + return self.down_proj.post_all_reduce(x) + return x + + +def gaudi_llama_repeat_kv( + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + attention_mask: torch.Tensor, + n_rep: int, +): + """ + Copied from repeat_kv: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py + The only differences are: + - Append num_key_value_heads == 1 check as kv states can be broadcasted during matmuls so need to expand and reshape them. + - Add new args query_states, key_states, value_states and attention_mask and update the logic for expansion. + The query states go from (batch, num_heads, seqlen, head_dim) to (batch, num_key_value_heads, n_rep, seqlen, head_dim) + The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads, 1, seqlen, head_dim) + """ + batch, num_key_value_heads, kv_len, head_dim = key_states.shape + if n_rep == 1 or num_key_value_heads == 1: + return query_states, key_states, value_states, attention_mask + + new_kv_shape = (batch, num_key_value_heads, 1, kv_len, head_dim) + key_states = key_states.reshape(new_kv_shape) + value_states = value_states.reshape(new_kv_shape) + + batch, _, q_len, head_dim = query_states.shape + new_q_shape = (batch, num_key_value_heads, n_rep, q_len, head_dim) + query_states = query_states.reshape(new_q_shape) + + if attention_mask is not None: + # Add groups dim and set to 1 + attention_mask = attention_mask.unsqueeze(1) + + return query_states, key_states, value_states, attention_mask + + +class Matmul(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + return torch.matmul(x, y) + + +class KVCache(torch.nn.Module): + def __init__(self): + super(KVCache, self).__init__() + self.cache = None + self.inp_seq_len = -1 + + def allocate(self, inp_seq_len, kv_cache_fp8, dtype, device, shape): + if self.cache is None or self.cache.shape != shape: + self.inp_seq_len = inp_seq_len + if kv_cache_fp8: + dtype = torch.float8_e4m3fn + self.cache = torch.zeros(shape, dtype=dtype, device=device) + else: + assert ( + self.inp_seq_len == inp_seq_len + ), f"inp_seq_len must be the same. self.inp_seq_len:{self.inp_seq_len} inp_seq_len:{inp_seq_len}" + self.cache.fill_(0) + + def get_shape(self): + if self.cache is None: + return None + return self.cache.shape + + def forward(self, cur, dim, idx): + return update(self.cache, cur, dim, idx, self.inp_seq_len) + + +class GaudiLlamaRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + super().__init__() + + self.scaling_factor = scaling_factor + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
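+        # Pre-computing the cos/sin tables for every position up to `max_position_embeddings`
+        # means forward() only needs to slice the cached buffers; the cache is rebuilt lazily
+        # whenever a longer sequence length is requested.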
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("_cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("_sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self._cos_cached[:seq_len].to(dtype=x.dtype), + self._sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +class GaudiLlamaLinearScalingRotaryEmbedding(GaudiLlamaRotaryEmbedding): + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = t / self.scaling_factor + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("_cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("_sin_cached", emb.sin().to(dtype), persistent=False) + + +class GaudiLlamaDynamicNTKScalingRotaryEmbedding(GaudiLlamaRotaryEmbedding): + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("_cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("_sin_cached", emb.sin().to(dtype), persistent=False) + + +class GaudiLlamaAttention(LlamaAttention): + def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None): + super().__init__(config, layer_idx) + + self.matmul_qk = Matmul() + self.matmul_av = Matmul() + self.k_cache = KVCache() + self.v_cache = KVCache() + self.inp_seq_len = -1 + self.norm_factor = 1.0 / math.sqrt(self.head_dim) + + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len, kv_cache_fp8): + cache_shape = (batch_size, self.num_key_value_heads, max_seq_len, self.head_dim) + device = self.k_proj.weight.device + dtype = self.config.torch_dtype + self.k_cache.allocate(inp_seq_len, kv_cache_fp8, dtype, device, cache_shape) + self.v_cache.allocate(inp_seq_len, kv_cache_fp8, dtype, device, cache_shape) + + def update_sincos_cache(self, seq_len): + # Call rotary emb forward() to update cos/sin cache when infering more than self.max_position_embeddings + # This helps in avoiding creation of these caches during actual model forward pass and + # reduce memory 
consumption and improve performance. + if seq_len > self.max_position_embeddings: + self.max_position_embeddings = seq_len + _, _ = self.rotary_emb(self.k_proj.weight, seq_len=seq_len) + + def reorder(self, tensor, beam_idx, dim_a, dim_b): + updated = tensor.index_select(0, beam_idx) + tensor.copy_(updated) + + def reorder_kv_cache(self, beam_idx: torch.LongTensor): + if self.k_cache.cache is None: + return (None, None) + + head_dim = self.k_cache.cache.size(-1) + seq_length = self.k_cache.cache.size(-2) + self.reorder(self.k_cache.cache, beam_idx, seq_length, head_dim) + self.reorder(self.v_cache.cache, beam_idx, seq_length, head_dim) + return (self.k_cache.cache.shape, self.v_cache.cache.shape) + + def pre_attn_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + token_idx: Optional[torch.Tensor] = None, + attn_softmax_bf16: Optional[bool] = False, + reuse_cache: Optional[bool] = False, + use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, + cache_idx: int = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """ + Copied from LlamaAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py + The only differences are: + - add new args token_idx + - optimize KV cache + - add new args attn_softmax_bf16 + - add new args reuse_cache + - add new args use_flash_attention + - add new arg flash_attention_recompute + """ + bsz, q_len, _ = hidden_states.size() + + if self.config.pretraining_tp > 1: + key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp + query_slices = self.q_proj.weight.split( + (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 + ) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] + query_states = torch.cat(query_states, dim=-1) + + key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] + key_states = torch.cat(key_states, dim=-1) + + value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] + value_states = torch.cat(value_states, dim=-1) + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # TODO: update when auto mp params is enabled in DeepSpeed (cf. 
https://github.com/HabanaAI/DeepSpeed/blob/94309c7b5dfc1a69858f5c9f25737b2f81a332a5/deepspeed/module_inject/replace_module.py#L440) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if token_idx is None: + if hasattr(past_key_value, "get_usable_length"): + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + else: + kv_seq_len += past_key_value[0].shape[-2] + else: + if reuse_cache: + kv_seq_len = past_key_value[0][-2] + else: + kv_seq_len = past_key_value[0].shape[-2] + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_customized_rope(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None or reuse_cache: + # reuse k, v, self_attention + if reuse_cache: + key_states = self.k_cache(key_states, 2, token_idx) + value_states = self.v_cache(value_states, 2, token_idx) + else: + key_states = update(past_key_value[0], key_states, 2, token_idx, self.inp_seq_len) + value_states = update(past_key_value[1], value_states, 2, token_idx, self.inp_seq_len) + + if cache_idx is not None and q_len == 1: + key_states = key_states[:, :, :cache_idx, :] + value_states = value_states[:, :, :cache_idx, :] + if attention_mask is not None: + attention_mask = attention_mask[:, :, :, :cache_idx] + kv_seq_len = key_states.shape[-2] + + if use_cache: + if reuse_cache: + past_key_value = (self.k_cache.get_shape(), self.v_cache.get_shape()) + else: + past_key_value = (key_states.contiguous(), value_states.contiguous()) + else: + past_key_value = None + + if use_flash_attention and FusedSDPA: + import habana_frameworks.torch.hpu as ht + + if q_len == 1: + # next token + with ht.sdp_kernel(enable_recompute=False): + attn_output = FusedSDPA.apply( + query_states, key_states, value_states, attention_mask, 0.0, False, None + ) + else: + # first token + with ht.sdp_kernel(enable_recompute=flash_attention_recompute): + attn_output = FusedSDPA.apply( + query_states, key_states, value_states, attention_mask, 0.0, False, None + ) + + else: + query_states, key_states, value_states, attention_mask = gaudi_llama_repeat_kv( + query_states, key_states, value_states, attention_mask, self.num_key_value_groups + ) + + attn_weights = self.matmul_qk(query_states, key_states.transpose(-2, -1)) * self.norm_factor + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask + if cache_position is not None: + causal_mask = attention_mask[:, :, cache_position, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + if attn_softmax_bf16: + attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=query_states.dtype) + else: + # upcast attention to fp32 + attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to( + query_states.dtype + ) + attn_weights = torch.nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = self.matmul_av(attn_weights, value_states) + attn_output = attn_output.reshape(bsz, -1, q_len, self.head_dim) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = 
attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def attention_all_reduce(self, attn_output): + if hasattr(self.o_proj, "all_reduce"): + self.o_proj.all_reduce(attn_output) + + def post_attn_forward(self, attn_output): + if hasattr(self.o_proj, "post_all_reduce"): + self.o_proj.post_all_reduce(attn_output) + return attn_output + + +class GaudiLlamaDecoderLayer(LlamaDecoderLayer): + def __init__(self, config: LlamaConfig, layer_idx: int): + super(LlamaDecoderLayer, self).__init__() + self.hidden_size = config.hidden_size + + self.self_attn = GaudiLlamaAttention(config=config, layer_idx=layer_idx) + + self.mlp = GaudiLlamaMLP(config) + self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len, kv_cache_fp8): + self.self_attn.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len, kv_cache_fp8) + + def reorder_kv_cache(self, beam_idx: torch.LongTensor): + return self.self_attn.reorder_kv_cache(beam_idx) + + def update_sincos_cache(self, seq_len): + self.self_attn.update_sincos_cache(seq_len) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + token_idx: Optional[torch.Tensor] = None, + attn_softmax_bf16: Optional[bool] = False, + reuse_cache: Optional[bool] = False, + use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, + cache_idx: int = None, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Copied from LlamaDecoderLayer.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py + The only differences are: + - add new args token_idx + - add new args attn_softmax_bf16 + - add new args reuse_cache + - add new args use_flash_attention + - add new arg flash_attention_recompute + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + residual = hidden_states + output_pre_attn, self_attn_weights, present_key_value = self.pre_attn( + hidden_states, + attention_mask, + position_ids, + past_key_value, + output_attentions, + use_cache, + cache_position, + token_idx, + attn_softmax_bf16, + reuse_cache, + use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, + cache_idx=cache_idx, + **kwargs, + ) + + self.self_attn.attention_all_reduce(output_pre_attn) + output_post_attn_pre_mlp, residual_mlp = self.post_attn_pre_mlp(output_pre_attn, residual) + self.mlp.mlp_all_reduce(output_post_attn_pre_mlp) + output_post_mlp = self.post_mlp(output_post_attn_pre_mlp, residual_mlp) + + outputs = (output_post_mlp,) + + if output_attentions: + outputs += (self_attn_weights,) + if use_cache: + outputs += (present_key_value,) + + return outputs + + def pre_attn( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + token_idx: Optional[torch.Tensor] = None, + attn_softmax_bf16: Optional[bool] = False, + reuse_cache: Optional[bool] = False, + use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, + cache_idx: int = None, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + hidden_states = self.input_layernorm(hidden_states) + output_attn, attn_weights, present_key_value = self.self_attn.pre_attn_forward( + hidden_states, + attention_mask, + position_ids, + past_key_value, + output_attentions, + use_cache, + cache_position, + token_idx, + attn_softmax_bf16, + reuse_cache, + use_flash_attention, + flash_attention_recompute, + cache_idx=cache_idx, + ) + return output_attn, attn_weights, present_key_value + + def post_attn_pre_mlp(self, input, residual): + output_post_attn = self.self_attn.post_attn_forward(input) + + hidden_states = residual + output_post_attn + residual = hidden_states + + hidden_states = self.post_attention_layernorm(hidden_states) + + hidden_states = self.mlp.pre_mlp_forward(hidden_states) + return hidden_states, residual + + def post_mlp(self, input, residual): + output_post_mlp = self.mlp.post_mlp_forward(input) + output = output_post_mlp + residual + return output + + +class GaudiLlamaModel(LlamaModel): + """ + Copied from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L909 + """ + + def __init__(self, config: LlamaConfig): + """ + Copied from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L917 + 1. set fill_value to 1 instead of True + 2. add device=self.device + """ + super(LlamaModel, self).__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = torch.nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = torch.nn.ModuleList( + [GaudiLlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + + # Register a causal mask to separate causal and padding mask creation. Merging happens in the attention class. 
+ # NOTE: This is not friendly with TorchScript, ONNX, ExportedProgram serialization for very large `max_position_embeddings`. + causal_mask = torch.full( + (config.max_position_embeddings, config.max_position_embeddings), + fill_value=1, + dtype=torch.bool, + ) + self.register_buffer("causal_mask", torch.triu(causal_mask, diagonal=1), persistent=False) + # Initialize weights and apply final processing + self.post_init() + + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len, kv_cache_fp8): + for layer in self.layers: + layer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len, kv_cache_fp8) + + def reorder_kv_cache(self, beam_idx: torch.LongTensor): + return tuple(layer.reorder_kv_cache(beam_idx) for layer in self.layers) + + def update_sincos_cache(self, seq_len): + for layer in self.layers: + layer.update_sincos_cache(seq_len) + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + token_idx: Optional[torch.Tensor] = None, + attn_softmax_bf16: Optional[bool] = False, + reuse_cache: Optional[bool] = False, + use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, + cache_idx: int = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + """ + Copied from LlamaModel.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py + The only differences are: + - add new args token_idx + - add new args attn_softmax_bf16 + - add new args reuse_cache + - add new args use_flash_attention + - add new arg flash_attention_recompute + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." 
+ ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + ignore_cache_position = True # Ignoring cache position for HPU + use_new_cache = False # Ignoring new Cache path for HPU + past_seen_tokens = 0 + + if past_key_values is not None and use_cache: # kept for BC (cache positions) + if reuse_cache: + past_seen_tokens = past_key_values[0][0][2] + else: + if use_new_cache: + if not isinstance(past_key_values, StaticCache): + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_seen_tokens = past_key_values.get_seq_length() + else: + past_seen_tokens = past_key_values[0][0].shape[2] + + if ignore_cache_position is False: + if cache_position is None: + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None and cache_position: + position_ids = cache_position.unsqueeze(0) + + else: + if position_ids is None: + position_ids = torch.arange( + past_seen_tokens, seq_length + past_seen_tokens, dtype=torch.long, device=inputs_embeds.device + ) + position_ids = position_ids.unsqueeze(0) + cache_position = None + + # HPU specific mask generation + if ignore_cache_position: + causal_mask = _gaudi_prepare_4d_causal_attention_mask( + attention_mask, + input_ids.shape if input_ids is not None else (batch_size, seq_length), + inputs_embeds, + past_seen_tokens, + ) + else: + causal_mask = self._update_causal_mask(attention_mask, inputs_embeds) + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if not use_new_cache else None + + for layer_idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + None, + attn_softmax_bf16, + False, + use_flash_attention, + flash_attention_recompute, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=None if past_key_values is None else past_key_values[layer_idx], + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + token_idx=token_idx, + attn_softmax_bf16=attn_softmax_bf16, + reuse_cache=reuse_cache, + use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, + cache_idx=cache_idx, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = ( + next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache + ) + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + 
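+# NOTE (descriptive comment, not upstream code): in the static-shape decoding path used on
+# Gaudi, `token_idx` marks the next position to write in the pre-allocated KV cache. Inputs
+# keep a fixed padded length and each step only updates slot `token_idx - 1` (see `update()`
+# above and `prepare_inputs_for_generation` below). Conceptually, per decode step:
+#     token_idx = torch.tensor([prompt_len])        # illustrative: next write position
+#     k_cache.index_copy_(2, token_idx - 1, new_k)  # what `update()` does along the seq dim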
+class GaudiLlamaForCausalLM(LlamaForCausalLM): + """ + Inherits from LlamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py + The only differences are: + - add new args token_idx + - add token_idx into model_inputs + - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx + - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx + - add new args attn_softmax_bf16 + - add new args reuse_cache + """ + + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len, kv_cache_fp8): + self.model.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len, kv_cache_fp8) + self.kv_cache_len = max_seq_len + + def reorder_kv_cache(self, beam_idx: torch.LongTensor): + return self.model.reorder_kv_cache(beam_idx) + + def update_sincos_cache(self, seq_len): + self.model.update_sincos_cache(seq_len) + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + token_idx: Optional[torch.Tensor] = None, + trim_logits: Optional[bool] = False, + attn_softmax_bf16: Optional[bool] = False, + reuse_cache: Optional[bool] = False, + use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, + cache_idx: int = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if self.generation_config.use_fused_rope is False: + global has_fused_rope + has_fused_rope = False + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + token_idx=token_idx, + attn_softmax_bf16=attn_softmax_bf16, + reuse_cache=reuse_cache, + use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, + cache_idx=cache_idx, + ) + hidden_states = outputs[0] + _, seq_len, _ = hidden_states.shape + if seq_len > 1 and trim_logits and not self.training: + if token_idx is not None: + hidden_states = hidden_states.index_select(1, token_idx - 1) + else: + hidden_states = hidden_states[:, -1, :] + + if self.config.pretraining_tp > 1: + lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) + logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] + logits = torch.cat(logits, dim=-1) + else: + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that 
tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = torch.nn.CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, token_idx=None, **kwargs + ): + past_length = 0 + + reuse_cache = kwargs.get("reuse_cache") + if past_key_values is not None: + if token_idx is not None: + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + else: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
+ if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + elif reuse_cache and token_idx is not None: + # With reuse_cache, KV cache is pre allocated hence for the 1st token we can slice the inputs till token idx for the fwd pass + input_ids = input_ids[:, :token_idx] + attention_mask = attention_mask[:, :token_idx] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + if token_idx is not None: + position_ids = torch.index_select(position_ids, 1, token_idx - 1) + else: + position_ids = position_ids[:, -input_ids.shape[1] :] + # TODO: we are using token_idx, disable this for now + # if self.generation_config.cache_implementation == "static": + # generation with static cache + # cache_position = kwargs.get("cache_position", None) + # if cache_position is None: + # past_length = 0 + # else: + # past_length = cache_position[-1] + 1 + # input_ids = input_ids[:, past_length:] + # position_ids = position_ids[:, past_length:] + + # TODO @gante we should only keep a `cache_position` in generate, and do +=1. + # same goes for position ids. Could also help with continued generation. + # cache_position = torch.arange(past_length, past_length + position_ids.shape[-1], device=position_ids.device) + # keep cache_position implementation as None for HPU + cache_position = None + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise + # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114 + # TODO: use `next_tokens` directly instead. 
+ model_inputs = {"input_ids": input_ids.contiguous()} + + model_inputs.update( + { + "position_ids": position_ids.contiguous(), + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "token_idx": token_idx, + "trim_logits": kwargs.get("trim_logits"), + "attn_softmax_bf16": kwargs.get("attn_softmax_bf16"), + "reuse_cache": reuse_cache, + "use_flash_attention": kwargs.get("use_flash_attention"), + "flash_attention_recompute": kwargs.get("flash_attention_recompute"), + "cache_idx": kwargs.get("cache_idx"), + } + ) + return model_inputs + + +def apply_customized_rope(q, k, cos, sin, position_ids): + if q.device.type == "hpu" and has_fused_rope: + # TODO: remove `.clone()` when it is fixed in SynapseAI + return FusedRoPE.apply( + q, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids + ), FusedRoPE.apply( + k, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids + ) + else: + # keep the same implementation as Transformers v4.37.2 + return apply_rotary_pos_emb(q, k, cos[position_ids], sin[position_ids]) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/__init__.py new file mode 100644 index 00000000000..192c2677918 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/__init__.py @@ -0,0 +1,7 @@ +from .modeling_mistral import ( + GaudiMistralAttention, + GaudiMistralDecoderLayer, + GaudiMistralForCausalLM, + GaudiMistralModel, + gaudi_mistral_rmsnorm_forward, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py new file mode 100644 index 00000000000..24174ce1d34 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py @@ -0,0 +1,687 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
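+#
+# NOTE (descriptive comment, not upstream code): the Gaudi Mistral port below follows the same
+# pattern as the Llama port above: per-layer KV caches are pre-allocated as fixed-shape buffers
+# (`past_key`/`past_value`) and written in place at `token_idx - 1`, and RMSNorm falls back to
+# the reference PyTorch implementation when the Habana FusedRMSNorm kernel is unavailable.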
+"""PyTorch Mistral model.""" + +import math +from typing import List, Optional, Tuple, Union + +import habana_frameworks.torch.core as htcore +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from transformers.models.mistral.configuration_mistral import MistralConfig +from transformers.models.mistral.modeling_mistral import ( + MistralAttention, + MistralDecoderLayer, + MistralForCausalLM, + MistralModel, + apply_rotary_pos_emb, +) +from transformers.utils import logging + +from ..modeling_attn_mask_utils import ( + _gaudi_prepare_4d_causal_attention_mask, +) + + +try: + from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as FusedRMSNorm +except ImportError: + print("Not using HPU fused kernel for RMSNorm") + FusedRMSNorm = None + +logger = logging.get_logger(__name__) + + +def update(prev, cur, dim, idx): + orig_cur = cur + if prev.shape == cur.shape: + # Initialize + prev.copy_(cur) + return orig_cur + assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. prev:{prev.shape} cur:{cur.shape}" + if idx is not None: + return prev.index_copy_(dim, idx - 1, cur) + else: + return torch.cat((prev, cur), dim=dim) + + +def gaudi_mistral_rmsnorm_forward(self, hidden_states): + """ + Copied from MistralRMSNorm.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral/modeling_mistral.py + The only differences are: + - override RMSNorm with Habana fused RMSNorm + """ + if hidden_states.device.type == "hpu" and FusedRMSNorm: + # mixed dtypes are not good for FusedRMSNorm, both inputs need to have same dtype + if hidden_states.dtype != self.weight.dtype: + orig_dtype = hidden_states.dtype + hidden_states = FusedRMSNorm.apply(hidden_states.to(self.weight.dtype), self.weight, self.variance_epsilon) + return hidden_states.to(orig_dtype) + else: + hidden_states = FusedRMSNorm.apply(hidden_states, self.weight, self.variance_epsilon) + return hidden_states + else: + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +def gaudi_mistral_repeat_kv( + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + attention_mask: torch.Tensor, + n_rep: int, +): + """ + Copied from repeat_kv: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/mistral/modeling_mistral.py + The only differences are: + - Append num_key_value_heads == 1 check as kv states can be broadcasted during matmuls so need to expand and reshape them. + - Add new args query_states, key_states, value_states and attention_mask and update the logic for expansion. 
+ The query states go from (batch, num_heads, seqlen, head_dim) to (batch, num_key_value_heads, n_rep, seqlen, head_dim) + The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads, 1, seqlen, head_dim) + """ + batch, num_key_value_heads, kv_len, head_dim = key_states.shape + if n_rep == 1 or num_key_value_heads == 1: + return query_states, key_states, value_states, attention_mask + + new_kv_shape = (batch, num_key_value_heads, 1, kv_len, head_dim) + key_states = key_states.reshape(new_kv_shape) + value_states = value_states.reshape(new_kv_shape) + + batch, _, q_len, head_dim = query_states.shape + new_q_shape = (batch, num_key_value_heads, n_rep, q_len, head_dim) + query_states = query_states.reshape(new_q_shape) + + if attention_mask is not None: + # Add groups dim and set to 1 + attention_mask = attention_mask.unsqueeze(1) + + return query_states, key_states, value_states, attention_mask + + +class GaudiMistralAttention(MistralAttention): + def __init__(self, config: MistralConfig, layer_idx: Optional[int] = None): + super().__init__(config, layer_idx) + self.past_key = None + self.past_value = None + + def allocate_kv_cache(self, batch_size, seq_len): + kv_shape = (batch_size, self.num_key_value_heads, seq_len, self.head_dim) + if self.past_key is None or self.past_key.shape != kv_shape: + device = self.k_proj.weight.device + dtype = self.k_proj.weight.dtype + self.past_key = torch.empty(kv_shape, dtype=dtype, device=device) + self.past_value = torch.empty(kv_shape, dtype=dtype, device=device) + + def update_sincos_cache(self, seq_len): + # Call rotary emb forward() to update cos/sin cache when infering more than self.max_position_embeddings + # This helps in avoiding creation of these caches during actual model forward pass and + # reduce memory consumption and improve performance. 
+ if seq_len > self.max_position_embeddings: + self.max_position_embeddings = seq_len + _, _ = self.rotary_emb(self.k_proj.weight, seq_len=seq_len) + + def reorder(self, tensor, beam_idx, dim_a, dim_b): + updated = tensor.index_select(0, beam_idx) + tensor.copy_(updated) + + def reorder_kv_cache(self, beam_idx: torch.LongTensor): + if self.past_key is None: + return (None, None) + + head_dim = self.past_key.size(-1) + seq_length = self.past_key.size(-2) + self.reorder(self.past_key, beam_idx, seq_length, head_dim) + self.reorder(self.past_value, beam_idx, seq_length, head_dim) + return (self.past_key.shape, self.past_value.shape) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + token_idx: Optional[torch.Tensor] = None, + reuse_cache: Optional[bool] = False, + cache_idx: Optional[int] = None, + attn_softmax_bf16: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """ + Copied from MistralAttention.forward: https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/mistral/modeling_mistral.py + The only differences are: + - add new args token_idx + - add new args reuse_cache + - add new args cache_idx + """ + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_shape = ( + (past_key_value[0][-2] if reuse_cache else past_key_value[0].shape[-2]) + if isinstance(past_key_value, tuple) + else past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + ) + if token_idx is not None: + kv_seq_len = kv_shape + else: + kv_seq_len += kv_shape + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None or reuse_cache: + if reuse_cache: + past_key = self.past_key + past_value = self.past_value + else: + past_key = past_key_value[0] + past_value = past_key_value[1] + key_states = update(past_key, key_states, 2, token_idx) + value_states = update(past_value, value_states, 2, token_idx) + if use_cache: + if reuse_cache: + past_key_value = (key_states.contiguous().shape, value_states.contiguous().shape) + else: + past_key_value = (key_states.contiguous(), value_states.contiguous()) + else: + past_key_value = None + if cache_idx is not None and q_len == 1: + key_states = key_states[:, :, :cache_idx, :] + value_states = value_states[:, :, :cache_idx, :] + attention_mask = attention_mask[:, :, :, :cache_idx] + kv_seq_len = key_states.shape[-2] + + # repeat k/v heads if n_kv_heads < n_heads + query_states, key_states, value_states, attention_mask = gaudi_mistral_repeat_kv( + query_states, key_states, value_states, attention_mask, self.num_key_value_groups + ) + attn_weights = torch.matmul(query_states, key_states.transpose(-2, -1)) / math.sqrt(self.head_dim) + + if attn_weights.size() not in [ + (bsz, self.num_heads, q_len, kv_seq_len), + (bsz, self.num_key_value_heads, self.num_key_value_groups, q_len, kv_seq_len), + ]: + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)} or" + f" {(bsz, self.num_key_value_heads, self.num_key_value_groups, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() not in [(bsz, 1, q_len, kv_seq_len), (bsz, 1, 1, q_len, kv_seq_len)]: + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)} or {(bsz, 1, 1, q_len, kv_seq_len)}," + f" but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + if attn_softmax_bf16: + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=query_states.dtype) + else: + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.reshape(bsz, -1, q_len, self.head_dim) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class GaudiMistralDecoderLayer(MistralDecoderLayer): + def __init__(self, config: MistralConfig, layer_idx: int): + super().__init__(config, layer_idx) + self.self_attn = GaudiMistralAttention(config, layer_idx) + + def allocate_kv_cache(self, batch_size, seq_len): + 
self.self_attn.allocate_kv_cache(batch_size, seq_len) + + def reorder_kv_cache(self, beam_idx: torch.LongTensor): + return self.self_attn.reorder_kv_cache(beam_idx) + + def update_sincos_cache(self, seq_len): + self.self_attn.update_sincos_cache(seq_len) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + token_idx: Optional[torch.Tensor] = None, + reuse_cache: Optional[bool] = False, + cache_idx: Optional[int] = None, + attn_softmax_bf16: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Copied from MistralDecoderLayer.forward: https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/mistral/modeling_mistral.py + The only differences are: + - add new args token_idx + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + token_idx=token_idx, + reuse_cache=reuse_cache, + cache_idx=cache_idx, + attn_softmax_bf16=attn_softmax_bf16, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class GaudiMistralModel(MistralModel): + def allocate_kv_cache(self, batch_size, seq_len): + for layer in self.layers: + layer.allocate_kv_cache(batch_size, seq_len) + + def reorder_kv_cache(self, beam_idx: torch.LongTensor): + return tuple(layer.reorder_kv_cache(beam_idx) for layer in self.layers) + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + reuse_cache: Optional[bool] = False, + cache_idx: Optional[int] = None, + attn_softmax_bf16: Optional[bool] = False, + ) -> Union[Tuple, BaseModelOutputWithPast]: + """ + Copied from MistralModel.forward: https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/mistral/modeling_mistral.py + The only differences are: + - add new args token_idx + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None 
and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + use_legacy_cache = True + use_new_cache = False + if past_key_values is not None and use_cache and not reuse_cache: + if use_new_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _gaudi_prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if not use_new_cache else None + + for layer_idx, decoder_layer in enumerate(self.layers): + if layer_idx == len(self.layers) // 2: + htcore.mark_step() + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + None if past_key_values is None else past_key_values[layer_idx], + output_attentions, + use_cache, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=None if past_key_values is None else past_key_values[layer_idx], + output_attentions=output_attentions, + use_cache=use_cache, + token_idx=token_idx, + reuse_cache=reuse_cache, + cache_idx=cache_idx, + attn_softmax_bf16=attn_softmax_bf16, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if 
use_cache: + next_cache = ( + next_decoder_cache + if not use_new_cache + else (next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache) + ) + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class GaudiMistralForCausalLM(MistralForCausalLM): + def allocate_kv_cache(self, batch_size, seq_len, _, __): + self.model.allocate_kv_cache(batch_size, seq_len) + + def reorder_kv_cache(self, beam_idx: torch.LongTensor): + return self.model.reorder_kv_cache(beam_idx) + + def update_sincos_cache(self, seq_len): + self.model.update_sincos_cache(seq_len) + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + reuse_cache: Optional[bool] = False, + trim_logits: Optional[bool] = False, + cache_idx: Optional[int] = None, + attn_softmax_bf16: Optional[bool] = False, + ) -> Union[Tuple, CausalLMOutputWithPast]: + """ + Inherits from MistralForCausalLM: https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/mistral/modeling_mistral.py + The only differences are: + - add new args token_idx + """ + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + reuse_cache=reuse_cache, + cache_idx=cache_idx, + attn_softmax_bf16=attn_softmax_bf16, + ) + hidden_states = outputs[0] + _, seq_len, _ = hidden_states.shape + if seq_len > 1 and trim_logits and not self.training: + if token_idx is not None: + hidden_states = hidden_states.index_select(1, token_idx - 1) + else: + hidden_states = hidden_states[:, -1, :] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Ensure tensors are on the same device + shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return 
CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + """ + Inherits from MistralForCausalLM: https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/mistral/modeling_mistral.py + The only differences are: + - add new args token_idx + - add token_idx into model_inputs + - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx + - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx + """ + token_idx = kwargs.get("token_idx", None) + + # Omit tokens covered by past_key_values + if past_key_values is not None: + if token_idx is None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
+ if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + else: + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + if token_idx is not None: + position_ids = torch.index_select(position_ids, 1, token_idx - 1) + else: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "token_idx": token_idx, + "reuse_cache": kwargs.get("reuse_cache"), + "trim_logits": kwargs.get("trim_logits"), + "cache_idx": kwargs.get("cache_idx"), + "attn_softmax_bf16": kwargs.get("attn_softmax_bf16"), + } + ) + return model_inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/__init__.py new file mode 100644 index 00000000000..fd1829bbe20 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/__init__.py @@ -0,0 +1,8 @@ +from .modeling_mixtral import ( + GaudiMixtralForCausalLM, + gaudi_mixtral_attention_forward, + gaudi_mixtral_block_sparse_moe_forward, + gaudi_mixtral_decoder_layer_forward, + gaudi_mixtral_model_forward, + gaudi_mixtral_rmsnorm_forward, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py new file mode 100644 index 00000000000..c6f5f51ab7c --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py @@ -0,0 +1,718 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
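The prepare_inputs_for_generation override above relies on the same token_idx convention used throughout these Gaudi ports: instead of growing input_ids and the KV cache by one column per step, the buffers are pre-allocated and token_idx - 1 marks the slot being processed, so every decode step runs with static shapes on HPU. Below is a minimal CPU-only sketch of that convention; the helper names, toy sizes, and the assumption that the surrounding generation loop advances token_idx by one per step are illustrative only, not part of this patch.

import torch

def slice_next_input(input_ids: torch.Tensor, token_idx: torch.Tensor) -> torch.Tensor:
    # Mirrors `torch.index_select(input_ids, 1, token_idx - 1)` in the Gaudi
    # prepare_inputs_for_generation overrides: feed only the token sitting at
    # position token_idx - 1 instead of a growing suffix of input_ids.
    return torch.index_select(input_ids, 1, token_idx - 1)

def write_kv_cache(cache: torch.Tensor, cur: torch.Tensor, token_idx: torch.Tensor) -> torch.Tensor:
    # Mirrors `past_key_value.index_copy_(2, token_idx - 1, cur)` in the attention
    # forwards: the cache is pre-allocated to the maximum length and each step
    # writes its single key/value slice in place, so the tensor shape never changes.
    return cache.index_copy_(2, token_idx - 1, cur)

# Toy walk-through with hypothetical sizes.
batch, heads, head_dim, max_len = 1, 2, 4, 8
prompt_len = 3
input_ids = torch.zeros(batch, max_len, dtype=torch.long)
input_ids[:, :prompt_len] = torch.tensor([[11, 12, 13]])
k_cache = torch.zeros(batch, heads, max_len, head_dim)

token_idx = torch.tensor([prompt_len])               # 1-based position of the token handled this step
step_input = slice_next_input(input_ids, token_idx)  # shape (1, 1): the last prompt token
step_key = torch.randn(batch, heads, 1, head_dim)    # stand-in for this step's projected key states
write_kv_cache(k_cache, step_key, token_idx)         # lands in k_cache[:, :, prompt_len - 1, :]
token_idx = token_idx + 1                            # assumed to be advanced once per decode step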
+ +"""PyTorch Mixtral model.""" + +import math +import warnings +from typing import List, Optional, Tuple, Union + +import habana_frameworks.torch.core as htcore +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.cache_utils import Cache, DynamicCache +from transformers.integrations.deepspeed import is_deepspeed_available +from transformers.modeling_attn_mask_utils import ( + _prepare_4d_causal_attention_mask, + _prepare_4d_causal_attention_mask_for_sdpa, +) +from transformers.modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast +from transformers.models.mixtral.modeling_mixtral import ( + MixtralForCausalLM, + apply_rotary_pos_emb, + load_balancing_loss_func, +) +from transformers.utils import logging + + +try: + from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 as FusedRoPE +except ImportError: + print("Not using HPU fused kernel for apply_rotary_pos_emb") + FusedRoPE = None + +try: + from habana_frameworks.torch.hpex.normalization import FusedRMSNorm +except ImportError: + print("Not using HPU fused kernel for RMSNorm") + FusedRMSNorm = None + +try: + from habana_frameworks.torch.hpex.kernels import FusedSDPA +except ImportError: + print("Not using HPU fused scaled dot-product attention kernel.") + FusedSDPA = None + +logger = logging.get_logger(__name__) + + +def update(prev, cur, dim, idx, inp_seq_len): + orig_cur = cur + if prev.dtype == torch.float8_e4m3fn: + from habana_frameworks.torch.hpex.kernels.Fp8Ops import cast_to_fp8_v2 + + cur = cast_to_fp8_v2(cur, None, False, False, prev.dtype)[0] + if cur.shape[2] > 1 and cur.shape[2] <= prev.shape[2]: + # Initialize + prev[:, :, :inp_seq_len, :].copy_(cur) + return orig_cur + assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. 
prev:{prev.shape} cur:{cur.shape}" + if idx is not None: + prev.index_copy_(dim, idx - 1, cur) + prev_cast = prev.to(orig_cur.dtype) + return prev_cast + else: + return torch.cat((prev, cur), dim=dim) + + +def apply_customized_rope(q, k, cos, sin, position_ids): + if q.device.type == "hpu" and FusedRoPE: + return FusedRoPE.apply( + q, cos.unsqueeze(0).unsqueeze(0), sin.unsqueeze(0).unsqueeze(0), position_ids + ), FusedRoPE.apply(k, cos.unsqueeze(0).unsqueeze(0), sin.unsqueeze(0).unsqueeze(0), position_ids) + else: + return apply_rotary_pos_emb(q, k, cos, sin, position_ids) + + +def gaudi_mixtral_rmsnorm_forward(self, hidden_states): + """ + Copied from MixtralRMSNorm.forward: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py + The only differences are: + - override RMSNorm with Habana fused RMSNorm + """ + if hidden_states.device.type == "hpu" and FusedRMSNorm: + # mixed dtypes are not good for FusedRMSNorm, both inputs need to have same dtype + if hidden_states.dtype != self.weight.dtype: + orig_dtype = hidden_states.dtype + hidden_states = FusedRMSNorm.apply(hidden_states.to(self.weight.dtype), self.weight, self.variance_epsilon) + return hidden_states.to(orig_dtype) + else: + hidden_states = FusedRMSNorm.apply(hidden_states, self.weight, self.variance_epsilon) + return hidden_states + else: + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +def gaudi_mixtral_repeat_kv( + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + attention_mask: torch.Tensor, + n_rep: int, +): + """ + Copied from repeat_kv: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py + The only differences are: + - Append num_key_value_heads == 1 check as kv states can be broadcasted during matmuls so need to expand and reshape them. + - Add new args query_states, key_states, value_states and attention_mask and update the logic for expansion. 
+ The query states go from (batch, num_heads, seqlen, head_dim) to (batch, num_key_value_heads, n_rep, seqlen, head_dim) + The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads, 1, seqlen, head_dim) + """ + batch, num_key_value_heads, kv_len, head_dim = key_states.shape + if n_rep == 1 or num_key_value_heads == 1: + return query_states, key_states, value_states, attention_mask + + new_kv_shape = (batch, num_key_value_heads, 1, kv_len, head_dim) + key_states = key_states.reshape(new_kv_shape) + value_states = value_states.reshape(new_kv_shape) + + batch, _, q_len, head_dim = query_states.shape + new_q_shape = (batch, num_key_value_heads, n_rep, q_len, head_dim) + query_states = query_states.reshape(new_q_shape) + + if attention_mask is not None: + # Add groups dim and set to 1 + attention_mask = attention_mask.unsqueeze(1) + + return query_states, key_states, value_states, attention_mask + + +class KVCache(torch.nn.Module): + def __init__(self): + super(KVCache, self).__init__() + self.cache = None + self.inp_seq_len = -1 + + def allocate(self, inp_seq_len, kv_cache_fp8, dtype, device, shape): + if self.cache is None or self.cache.shape != shape: + self.inp_seq_len = inp_seq_len + if kv_cache_fp8: + dtype = torch.float8_e4m3fn + self.cache = torch.zeros(shape, dtype=dtype, device=device) + else: + assert ( + self.inp_seq_len == inp_seq_len + ), f"inp_seq_len must be the same. self.inp_seq_len:{self.inp_seq_len} inp_seq_len:{inp_seq_len}" + self.cache.fill_(0) + + def get_shape(self): + if self.cache is None: + return None + return self.cache.shape + + def forward(self, cur, dim, idx): + return update(self.cache, cur, dim, idx, self.inp_seq_len) + + +def gaudi_mixtral_attention_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + token_idx: Optional[torch.Tensor] = None, + **kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """ + Copied from MixtralAttention.forward: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py + The only differences are: + - add new args token_idx + - optimize KV cache + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + if token_idx is not None: + if 0 <= self.layer_idx < len(past_key_value.key_cache): + kv_seq_len = past_key_value.key_cache[self.layer_idx].shape[-2] + else: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_customized_rope(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + if token_idx is not None: + if 0 <= self.layer_idx < len(past_key_value.key_cache): + past_key_value.key_cache[self.layer_idx].index_copy_(2, token_idx - 1, key_states) + past_key_value.value_cache[self.layer_idx].index_copy_(2, token_idx - 1, value_states) + key_states = past_key_value.key_cache[self.layer_idx] + value_states = past_key_value.value_cache[self.layer_idx] + else: + past_key_value.key_cache.append(key_states) + past_key_value.value_cache.append(value_states) + else: + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + if FusedSDPA: + import habana_frameworks.torch.hpu as ht + + if q_len == 1: + # next token + with ht.sdp_kernel(enable_recompute=False): + attn_output = FusedSDPA.apply(query_states, key_states, value_states, attention_mask, 0.0, False, None) + else: + # first token + with ht.sdp_kernel(enable_recompute=False): # inference: flash_attention_recompute = False + attn_output = FusedSDPA.apply(query_states, key_states, value_states, attention_mask, 0.0, False, None) + else: + query_states, key_states, value_states, attention_mask = gaudi_mixtral_repeat_kv( + query_states, key_states, value_states, attention_mask, self.num_key_value_groups + ) + + attn_weights = torch.matmul(query_states, key_states.transpose(-2, -1)) / math.sqrt(self.head_dim) + + if attention_mask is not None: + attention_mask = attention_mask.unsqueeze(2) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.reshape(bsz, self.num_heads, q_len, self.head_dim).contiguous() + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +def gaudi_mixtral_block_sparse_moe_forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Copied from MixtralSparseMoeBlock.forward: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py + The only differences are: + - optimize expert forward, remove dynamic control and dynamic shape + """ + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + # router_logits: (batch * sequence_length, n_experts) + router_logits = self.gate(hidden_states) + + if is_deepspeed_available(): + from deepspeed import comm as dist + + if dist.is_initialized(): + output_tensors = [router_logits.clone() for _ in range(dist.get_world_size())] + dist.all_gather(output_tensors, router_logits) + router_logits = torch.cat(output_tensors, dim=1) + + routing_weights = 
F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + # we cast back to the input dtype + routing_weights = routing_weights.to(hidden_states.dtype) + + final_hidden_states = torch.zeros( + (batch_size, sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device + ) + + padded_weights = torch.zeros( + (batch_size * sequence_length, self.num_experts), dtype=hidden_states.dtype, device=hidden_states.device + ) + padded_weights.scatter_(-1, selected_experts, routing_weights) + padded_weights = padded_weights.reshape(-1, sequence_length, self.num_experts) + padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) + + # Loop over all available experts in the model and perform the computation on each expert + for expert_idx in range(self.num_experts): + expert_layer = self.experts[expert_idx] + padded_weight = padded_weights[expert_idx] + current_state_static = hidden_states.reshape(-1, hidden_dim) + current_hidden_states_static = ( + expert_layer(current_state_static).reshape(-1, sequence_length, hidden_dim) * padded_weight + ) + final_hidden_states += current_hidden_states_static + + return final_hidden_states, router_logits + + +def gaudi_mixtral_decoder_layer_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + output_router_logits: Optional[bool] = False, + use_cache: Optional[bool] = False, + token_idx: Optional[torch.Tensor] = None, + **kwargs, +) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Copied from MixtralDecoderLayer.forward: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py + The only differences are: + - add new args token_idx + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + htcore.mark_step() + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + token_idx=token_idx, + ) + hidden_states = residual + hidden_states + htcore.mark_step() + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states, router_logits = self.block_sparse_moe(hidden_states) + hidden_states = residual + hidden_states + htcore.mark_step() + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + if output_router_logits: + outputs += (router_logits,) + + return outputs + + +def gaudi_mixtral_model_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, +) -> Union[Tuple, MoeModelOutputWithPast]: + """ + Copied from MixtralModel.forward: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py#L1069 + The only differences are: + - add new args token_idx + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + past_key_values_length = 0 + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self.config._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mixtral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self.config._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self.config._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_router_logits = () if output_router_logits else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + output_router_logits, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + output_router_logits=output_router_logits, + use_cache=use_cache, + token_idx=token_idx, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if output_router_logits: + all_router_logits += (layer_outputs[-1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple( + v + for v in 
[hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits] + if v is not None + ) + return MoeModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + router_logits=all_router_logits, + ) + + +class GaudiMixtralForCausalLM(MixtralForCausalLM): + """ + Inherits from MixtralForCausalLM: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py#L1231 + The only differences are: + - add new args token_idx + - add token_idx into model_inputs + - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx + - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx + """ + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + ) -> Union[Tuple, MoeCausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + output_router_logits=output_router_logits, + return_dict=return_dict, + token_idx=token_idx, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + aux_loss = None + if output_router_logits: + aux_loss = load_balancing_loss_func( + outputs.router_logits if return_dict else outputs[-1], + self.num_experts, + self.num_experts_per_tok, + attention_mask, + ) + if labels is not None: + loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device + + if not return_dict: + output = (logits,) + outputs[1:] + if output_router_logits: + output = (aux_loss,) + output + return (loss,) + output if loss is not None else output + + return MoeCausalLMOutputWithPast( + loss=loss, + aux_loss=aux_loss, + logits=logits, + 
past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + router_logits=outputs.router_logits, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + token_idx = kwargs.get("token_idx", None) + + # Omit tokens covered by past_key_values + if past_key_values is not None: + if token_idx is None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + else: + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + if token_idx is not None: + position_ids = torch.index_select(position_ids, 1, token_idx - 1) + else: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "token_idx": token_idx, + } + ) + return model_inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_all_models.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_all_models.py new file mode 100644 index 00000000000..c95284cafd5 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_all_models.py @@ -0,0 +1,171 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import Tuple + +import torch +from transformers.modeling_utils import ModuleUtilsMixin, PretrainedConfig +from transformers.utils.import_utils import is_torch_sdpa_available + + +def gaudi_invert_attention_mask(self, encoder_attention_mask: torch.Tensor) -> torch.Tensor: + """ + Same as https://github.com/huggingface/transformers/blob/a9eee2ffecc874df7dd635b2c6abb246fdb318cc/src/transformers/modeling_utils.py#L640 + except that mixed precision is disabled for computing: + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min + """ + if encoder_attention_mask.dim() == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.dim() == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow + # /transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = (encoder_extended_attention_mask == + # encoder_extended_attention_mask.transpose(-1, -2)) + # torch.finfo must take the dtype of encoder_extended_attention_mask + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # bf16 compatibility + encoder_extended_attention_mask = 1.0 - encoder_extended_attention_mask + # Fixes issue where the model is not in bf16 and mul is casting it to values out of range resulting in nan + with torch.autocast(enabled=False, device_type="hpu"): + encoder_extended_attention_mask = ( + encoder_extended_attention_mask * torch.finfo(encoder_extended_attention_mask.dtype).min + ) + + return encoder_extended_attention_mask + + +def gaudi_get_extended_attention_mask( + self, attention_mask: torch.Tensor, input_shape: Tuple[int], device: torch.device = None, dtype: torch.float = None +) -> torch.Tensor: + """ + Same as https://github.com/huggingface/transformers/blob/a9eee2ffecc874df7dd635b2c6abb246fdb318cc/src/transformers/modeling_utils.py#L692 + except that mixed precision is disabled for computing: + extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min + """ + if dtype is None: + dtype = self.dtype + + if not (attention_mask.dim() == 2 and self.config.is_decoder): + # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder` + if device is not None: + warnings.warn( + "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning + ) + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder: + extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( + input_shape, attention_mask, device + ) + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + # torch.finfo must take the dtype of encoder_extended_attention_mask + extended_attention_mask = extended_attention_mask.to(dtype=dtype) # bf16 compatibility + extended_attention_mask = 1.0 - extended_attention_mask + with torch.autocast(enabled=False, device_type="hpu"): + extended_attention_mask = extended_attention_mask * torch.finfo(extended_attention_mask.dtype).min + + return extended_attention_mask + + +def gaudi_conv1d_forward(self, x): + """ + Same as https://github.com/huggingface/transformers/blob/3335724376319a0c453049d0cd883504f530ff52/src/transformers/pytorch_utils.py#L100 + but moves reshape before view for tpc auto fusion. + """ + size_out = x.size()[:-1] + (self.nf,) + x = torch.mm(x.view(-1, x.size(-1)), self.weight) + x = x.view(size_out) + bias_shape = [1 for _ in x.shape] + bias_shape[-1] = self.nf + bias = self.bias.view(bias_shape) + x = x + bias + return x + + +# Adapted from transformers.modeling_utils.PreTrainedModel._check_and_enable_sdpa +@classmethod +def gaudi_check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> PretrainedConfig: + # This model doesn't support SDPA in Gaudi yet, fallback to original code. + MODELS_ATTN_IMPLEMENTATION_EAGER = ["bart", "gpt_bigcode", "mistral", "mixtral"] + + if config.model_type in MODELS_ATTN_IMPLEMENTATION_EAGER: + config._attn_implementation = "eager" + return config + + # Otherwise, fallback to original implementation + # https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/modeling_utils.py#L1542 + if hard_check_only: + if not cls._supports_sdpa: + raise ValueError( + f"{cls.__name__} does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention yet." + " Please request the support for this architecture: https://github.com/huggingface/transformers/issues/28005. If you believe" + ' this error is a bug, please open an issue in Transformers GitHub repository and load your model with the argument `attn_implementation="eager"` meanwhile. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`' + ) + if not is_torch_sdpa_available(): + raise ImportError("PyTorch SDPA requirements in Transformers are not met. 
Please install torch>=2.1.1.") + + if not is_torch_sdpa_available() or not cls._supports_sdpa: + return config + + _is_bettertransformer = getattr(cls, "use_bettertransformer", False) + if _is_bettertransformer: + return config + + if not hard_check_only: + config._attn_implementation = "sdpa" + + return config + + +# Splitting DeepSpeed LinearAllReduce to three parts to avoid redundant memory consumption +class ScopedLinearAllReduce(torch.nn.Module): + def __init__(self, mod, *args, **kwargs): + self.__dict__.update(mod.__dict__) + + def forward(self, input): + # pre_all_reduce + + output = torch.matmul(input, self.weight.transpose(-1, -2)) + return output + + def all_reduce(self, input): + if self.mp_group is not None: + from deepspeed import comm as dist + + dist.inference_all_reduce(input, group=self.mp_group) + + def post_all_reduce(self, input): + output = input + self.bias if (self.bias is not None) else input + return output diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_attn_mask_utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_attn_mask_utils.py new file mode 100755 index 00000000000..4fe62170997 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_attn_mask_utils.py @@ -0,0 +1,106 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +from transformers.modeling_attn_mask_utils import AttentionMaskConverter + + +@dataclass +class GaudiAttentionMaskConverter(AttentionMaskConverter): + """ + Adapted from: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/modeling_attn_mask_utils.py#L21 + + Differences: + - replace `triu` with similar logic here: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/modeling_attn_mask_utils.py#L169 + """ + + @staticmethod + def _make_causal_mask( + input_ids_shape: torch.Size, + dtype: torch.dtype, + device: torch.device, + past_key_values_length: int = 0, + sliding_window: Optional[int] = None, + ): + """ + Make causal mask used for bi-directional self-attention. 
+ """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + + # add lower triangular sliding window mask if necessary + if sliding_window is not None: + diagonal = past_key_values_length - sliding_window + 1 + + # Replace triu with below + row_indices = torch.arange(mask.size(0), device=mask.device).view(-1, 1) # Reshape to column vector + col_indices = torch.arange(mask.size(1), device=mask.device) + context_mask = 1 - (col_indices >= row_indices + diagonal).int().expand_as( + mask + ) # Expand to match mask shape + + mask.masked_fill_(context_mask.bool(), torch.finfo(dtype).min) + + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +def _gaudi_prepare_4d_causal_attention_mask( + attention_mask: Optional[torch.Tensor], + input_shape: Union[torch.Size, Tuple, List], + inputs_embeds: torch.Tensor, + past_key_values_length: int, + sliding_window: Optional[int] = None, +): + """ + Adapted from: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/modeling_attn_mask_utils.py#L278 + + Differences: + - replace `AttentionMaskConverter` by `GaudiAttentionMaskConverter` + """ + attn_mask_converter = GaudiAttentionMaskConverter(is_causal=True, sliding_window=sliding_window) + + key_value_length = input_shape[-1] + past_key_values_length + + # 4d mask is passed through the layers + if attention_mask is not None and len(attention_mask.shape) == 2: + attention_mask = attn_mask_converter.to_4d( + attention_mask, input_shape[-1], key_value_length=key_value_length, dtype=inputs_embeds.dtype + ) + elif attention_mask is not None and len(attention_mask.shape) == 4: + expected_shape = (input_shape[0], 1, input_shape[1], key_value_length) + if tuple(attention_mask.shape) != expected_shape: + raise ValueError( + f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}." 
+ ) + else: + # if the 4D mask has correct shape - invert it and fill with negative infinity + inverted_mask = 1.0 - attention_mask + attention_mask = inverted_mask.masked_fill( + inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min + ) + else: + attention_mask = attn_mask_converter.to_causal_4d( + input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device + ) + + return attention_mask diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/__init__.py new file mode 100644 index 00000000000..1ab41c1a805 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/__init__.py @@ -0,0 +1,6 @@ +from .modeling_mpt import ( + GaudiMptForCausalLM, + GaudiMptModel, + gaudi_mpt_attention_forward, + gaudi_mpt_block_forward, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/modeling_mpt.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/modeling_mpt.py new file mode 100644 index 00000000000..294371700b8 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/modeling_mpt.py @@ -0,0 +1,384 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +# Copyright (C) 2022-2023 Habana Labs, Ltd. 
an Intel Company +############################################################################### +from typing import Optional, Tuple, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions +from transformers.models.mpt.modeling_mpt import MptForCausalLM, MptModel +from transformers.utils import logging + +from ..modeling_attn_mask_utils import _gaudi_prepare_4d_causal_attention_mask + + +logger = logging.get_logger(__name__) + + +def gaudi_mpt_attention_forward( + self, + hidden_states: torch.Tensor, + position_bias: torch.Tensor, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + token_idx: Optional[torch.Tensor] = None, +): + """ + Copied from MptAttention.forward: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py + The only differences are: + - add new args token_idx + - optimize KV cache + """ + + batch_size, seq_length = hidden_states.shape[:2] + + mixed_qkv = self.Wqkv(hidden_states) + bs, seq_len, three_times_hidden_size = mixed_qkv.shape + mixed_qkv = mixed_qkv.view(bs, seq_len, self.n_heads * 3, self.head_dim) + mixed_qkv = mixed_qkv.transpose(1, 2) + query_states, key_states, value_states = ( + mixed_qkv[:, : self.n_heads, ...], + mixed_qkv[:, self.n_heads : 2 * self.n_heads, ...], + mixed_qkv[:, 2 * self.n_heads :, ...], + ) + + if past_key_value is not None: + if len(past_key_value) != 0: + if token_idx is not None: + past_key_value[0].index_copy_(2, token_idx - 1, key_states) + past_key_value[1].index_copy_(2, token_idx - 1, value_states) + key_states = past_key_value[0] + value_states = past_key_value[1] + else: + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + past_key_value = (key_states, value_states) + else: + past_key_value = (key_states, value_states) + + attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) * self.softmax_scale + + query_length = seq_length if past_key_value is None else seq_length + past_key_value[0].shape[2] + + if position_bias is not None: + if len(position_bias.shape) != 3: + raise ValueError(f"Expecting position_bias shape to be 3 dimensions, got {len(position_bias.shape)}") + key_length = key_states.shape[-2] + + position_bias_query_index = max(0, position_bias.size(1) - query_length) + position_bias_key_index = max(0, position_bias.size(2) - key_length) + + position_bias = position_bias[:, position_bias_query_index:, position_bias_key_index:] + + attention_scores = attention_scores + position_bias + + if attention_mask is not None: + attention_scores = attention_scores.masked_fill(attention_mask, torch.finfo(query_states.dtype).min) + + # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.softmax(attention_scores.float(), dim=-1).to(value_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attn_dropout_p, training=self.training) + + context_states = torch.matmul(attn_weights, value_states) + context_states = context_states.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1) + attn_output = self.out_proj(context_states) + + return attn_output, attn_weights, past_key_value + + +def gaudi_mpt_block_forward( + self, + hidden_states: torch.Tensor, + position_bias: torch.Tensor, + attention_mask: torch.Tensor, + layer_past: 
Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + use_cache: bool = False, + output_attentions: bool = False, + token_idx: Optional[torch.Tensor] = None, +): + """ + Copied from MptBlock.forward: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py + The only differences are: + - add new args token_idx + """ + # hidden_states: [batch_size, seq_length, hidden_size] + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.norm_1(hidden_states) + + residual = hidden_states + + # Self attention. + attn_outputs, attn_weights, past_key_value = self.attn( + layernorm_output, + position_bias=position_bias, + attention_mask=attention_mask, + past_key_value=layer_past, + token_idx=token_idx, + ) + + hidden_states = self.resid_attn_dropout(attn_outputs) + residual + + layernorm_output = self.norm_2(hidden_states) + + # Get residual + residual = hidden_states + + # MLP. + output = self.ffn(layernorm_output, residual) + outputs = (output,) + + if use_cache: + outputs += (past_key_value,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs # hidden_states, present, attentions + + +class GaudiMptModel(MptModel): + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: + """ + Copied from MptModel.forward: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py + The only differences are: + - add new args token_idx + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if past_key_values is None: + past_key_values = tuple([None] * len(self.blocks)) + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + + hidden_states = inputs_embeds + + presents = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + # Compute alibi tensor: check build_alibi_tensor documentation + seq_length_with_past = seq_length + past_key_values_length = 0 + if past_key_values[0] is not None and token_idx is None: # because RW-cache, not standard format + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + if attention_mask is None: + attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device) + else: + attention_mask = attention_mask.to(hidden_states.device) + + alibi = self.build_mpt_alibi_tensor(self.num_heads, self.config.max_seq_len, device=hidden_states.device) + + causal_mask = _gaudi_prepare_4d_causal_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + causal_mask = causal_mask.bool() + + for block, layer_past in zip(self.blocks, past_key_values): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + outputs = self._gradient_checkpointing_func( + block.__call__, + hidden_states, + alibi, + causal_mask, + layer_past, + use_cache, + output_attentions, + None, + ) + else: + outputs = block( + hidden_states, + layer_past=layer_past, + attention_mask=causal_mask, + use_cache=use_cache, + output_attentions=output_attentions, + position_bias=alibi, + token_idx=token_idx, + ) + + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + + # Add last hidden state + hidden_states = self.norm_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class GaudiMptForCausalLM(MptForCausalLM): + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + **kwargs, + ) -> dict: + """ + Inherits from MptForCausalLM: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py + The only differences are: + - add new args token_idx + - add token_idx into model_inputs + - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx + """ + # only last tokens for input_ids if past is not None + if past_key_values is not None: + if token_idx is None: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + else: + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": 
inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "past_key_values": past_key_values, # NITS should it be layer_past? + "use_cache": use_cache, + "attention_mask": attention_mask, + "token_idx": token_idx, + } + ) + return model_inputs + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + """ + Inherits from MptForCausalLM: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py + The only differences are: + - add new args token_idx + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + batch_size, seq_length, vocab_size = shift_logits.shape + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length) + ) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/__init__.py new file mode 100644 index 00000000000..9ea5a435eed --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/__init__.py @@ -0,0 +1,8 @@ +from .modeling_opt import ( + GaudiOPTForCausalLM, + GaudiOPTLearnedPositionalEmbedding, + gaudi_opt_attention_forward, + gaudi_opt_decoder_forward, + gaudi_opt_decoder_layer_forward, + gaudi_opt_model_forward, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/modeling_opt.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/modeling_opt.py new file mode 100644 index 00000000000..6743594e355 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/modeling_opt.py @@ -0,0 +1,537 @@ +from typing import List, Optional, Tuple, Union + +import torch +from torch.nn import CrossEntropyLoss +from 
transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from transformers.models.opt.modeling_opt import OPTForCausalLM, OPTLearnedPositionalEmbedding, logger + +from ..modeling_attn_mask_utils import _gaudi_prepare_4d_causal_attention_mask + + +class GaudiOPTLearnedPositionalEmbedding(OPTLearnedPositionalEmbedding): + """ + Inherits from OPTLearnedPositionalEmbedding: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py + The only differences are: + - add new args token_idx + - compute embedding using token_idx if past_key_values_length not 0 + """ + + def forward( + self, + attention_mask: torch.LongTensor, + past_key_values_length: int = 0, + token_idx: Optional[torch.Tensor] = None, + ): + attention_mask = attention_mask.long() + + if past_key_values_length == 0: + # first step or kv cache disabled + positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1 + positions = positions[:, past_key_values_length:] + return torch.nn.Embedding.forward(self, positions + self.offset) + else: + # if not 0, kv cache is enabled and from step = 2, past_key_values_length is equal to the final length of outputs + return torch.nn.Embedding.forward(self, token_idx + self.offset) + + +def gaudi_opt_attention_forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + token_idx: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """ + Copied from OPTAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py + The only differences are: + - add new args token_idx + - optimize KV cache + """ + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + if token_idx is not None: + past_key_value[0].index_copy_(2, token_idx - 1, key_states) + past_key_value[1].index_copy_(2, token_idx - 1, value_states) + key_states = past_key_value[0] + value_states = past_key_value[1] + else: + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. 
+ # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = torch.max( + attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device) + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = torch.nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. 
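+ # attn_output is (bsz, tgt_len, num_heads, head_dim) after the transpose above; the reshape
+ # below flattens the heads back into embed_dim before the output projection.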
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +def gaudi_opt_decoder_layer_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + token_idx: Optional[torch.Tensor] = None, +) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Copied from OPTDecoderLayer.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py + The only differences are: + - add new args token_idx + """ + residual = hidden_states + + # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention + if self.do_layer_norm_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + token_idx=token_idx, + ) + hidden_states = torch.nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # 350m applies layer norm AFTER attention + if not self.do_layer_norm_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Fully Connected + hidden_states_shape = hidden_states.shape + hidden_states = hidden_states.reshape(-1, hidden_states.size(-1)) + residual = hidden_states + + # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention + if self.do_layer_norm_before: + hidden_states = self.final_layer_norm(hidden_states) + + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + + hidden_states = self.fc2(hidden_states) + hidden_states = torch.nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = (residual + hidden_states).view(hidden_states_shape) + + # 350m applies layer norm AFTER attention + if not self.do_layer_norm_before: + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +def gaudi_opt_decoder_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, +) -> Union[Tuple, BaseModelOutputWithPast]: + """ + Copied from OPTDecoder.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py + The only differences are: + - add new args token_idx + - update calculation of mask_seq_length + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache 
if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + mask_seq_length = past_key_values_length + else: + past_key_values_length = 0 + mask_seq_length = seq_length + + # embed positions + # 4d mask is passed through the layers + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + elif attention_mask.shape[1] != mask_seq_length: + raise ValueError( + f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be " + f"{mask_seq_length} (sum of the lengths of current and past inputs)" + ) + causal_attention_mask = _gaudi_prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + pos_embeds = self.embed_positions(attention_mask, past_key_values_length, token_idx) + + if self.project_in is not None: + inputs_embeds = self.project_in(inputs_embeds) + + hidden_states = inputs_embeds + pos_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + # check if head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask], ["head_mask"]): + if attn_mask is not None: + if attn_mask.size()[0] != (len(self.layers)): + raise ValueError( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." 
+ ) + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_attention_mask, + head_mask[idx] if head_mask is not None else None, + None, + output_attentions, + use_cache, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + token_idx=token_idx, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if self.final_layer_norm is not None: + hidden_states = self.final_layer_norm(hidden_states) + + if self.project_out is not None: + hidden_states = self.project_out(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +def gaudi_opt_model_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, +) -> Union[Tuple, BaseModelOutputWithPast]: + """ + Copied from OPTModel.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py + The only differences are: + - add new args token_idx + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) + + if not return_dict: + return decoder_outputs + + return BaseModelOutputWithPast( + last_hidden_state=decoder_outputs.last_hidden_state, + 
past_key_values=decoder_outputs.past_key_values, + hidden_states=decoder_outputs.hidden_states, + attentions=decoder_outputs.attentions, + ) + + +class GaudiOPTForCausalLM(OPTForCausalLM): + """ + Inherits from OPTForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py + The only differences are: + - add new args token_idx + - add token_idx into model_inputs + - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx + """ + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) + + logits = self.lm_head(outputs[0]).contiguous() + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, token_idx=None, inputs_embeds=None, **kwargs + ): + if past_key_values is not None: + if token_idx is not None: + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + else: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "past_key_values": past_key_values, + 
"use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "token_idx": token_idx, + } + ) + return model_inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/__init__.py new file mode 100644 index 00000000000..1a98f45f513 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/__init__.py @@ -0,0 +1,6 @@ +from .modeling_phi import ( + GaudiPhiForCausalLM, + gaudi_phi_attention_forward, + gaudi_phi_decoder_layer_forward, + gaudi_phi_model_forward, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/modeling_phi.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/modeling_phi.py new file mode 100644 index 00000000000..a59aadc0505 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/modeling_phi.py @@ -0,0 +1,475 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch Phi model.""" + +import math +from typing import List, Optional, Tuple, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from transformers.models.phi.modeling_phi import PhiForCausalLM, apply_rotary_pos_emb, repeat_kv +from transformers.utils import logging + +from ..modeling_attn_mask_utils import ( + _gaudi_prepare_4d_causal_attention_mask, +) + + +logger = logging.get_logger(__name__) + + +def gaudi_phi_attention_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + token_idx: Optional[torch.Tensor] = None, + **kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """ + Copied from PhiAttention.forward: https://github.com/huggingface/transformers/blob/v4.37.1/src/transformers/models/phi/modeling_phi.py + The only differences are: + - add new args token_idx + """ + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + if self.qk_layernorm: + query_states = self.q_layernorm(query_states) + key_states = self.k_layernorm(key_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + if token_idx is not None: + if 0 <= self.layer_idx < len(past_key_value.key_cache): + kv_seq_len = past_key_value.key_cache[self.layer_idx].shape[-2] + else: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + # Partial rotary embedding + query_rot, query_pass = ( + query_states[..., : self.rotary_emb.dim], + query_states[..., self.rotary_emb.dim :], + ) + key_rot, key_pass = ( + key_states[..., : self.rotary_emb.dim], + key_states[..., self.rotary_emb.dim :], + ) + # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor] + query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids) + + # [batch_size, seq_length, num_heads, head_dim] + query_states = torch.cat((query_rot, query_pass), dim=-1) + key_states = torch.cat((key_rot, key_pass), dim=-1) + + if past_key_value is not None: + if token_idx is not None: + if 0 <= self.layer_idx < len(past_key_value.key_cache): + past_key_value.key_cache[self.layer_idx].index_copy_(2, token_idx - 1, key_states) + past_key_value.value_cache[self.layer_idx].index_copy_(2, token_idx - 1, value_states) + key_states = past_key_value.key_cache[self.layer_idx] + value_states = past_key_value.value_cache[self.layer_idx] + else: + past_key_value.key_cache.append(key_states) + past_key_value.value_cache.append(value_states) + else: + cache_kwargs = {"sin": sin, "cos": cos, "partial_rotation_size": self.rotary_emb.dim} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + # Queries and keys upcast to fp32 is required by Phi-2 to avoid overflow + attn_weights = torch.matmul( + query_states.to(torch.float32), key_states.to(torch.float32).transpose(2, 3) + ) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.dense(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +def gaudi_phi_decoder_layer_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + past_key_value: Optional[Tuple[torch.Tensor]] = 
None, + token_idx: Optional[torch.Tensor] = None, + **kwargs, +) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Copied from PhiDecoderLayer.forward: https://github.com/huggingface/transformers/blob/v4.37.1/src/transformers/models/phi/modeling_phi.py + The only differences are: + - add new args token_idx + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + attn_outputs, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + token_idx=token_idx, + ) + attn_outputs = self.resid_dropout(attn_outputs) + + feed_forward_hidden_states = self.resid_dropout(self.mlp(hidden_states)) + hidden_states = attn_outputs + feed_forward_hidden_states + residual + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +def gaudi_phi_model_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, +) -> Union[Tuple, BaseModelOutputWithPast]: + """ + Copied from PhiModel.forward: https://github.com/huggingface/transformers/blob/v4.37.1/src/transformers/models/phi/modeling_phi.py + The only differences are: + - add new args token_idx + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + past_key_values_length = 0 + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + inputs_embeds = self.embed_dropout(inputs_embeds) + + # 4d mask is passed through the layers + attention_mask = _gaudi_prepare_4d_causal_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + token_idx=token_idx, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.final_layernorm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class GaudiPhiForCausalLM(PhiForCausalLM): + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + """ + Inherits from PhiForCausalLM: https://github.com/huggingface/transformers/blob/v4.37.1/src/transformers/models/phi/modeling_phi.py + The only differences are: + - add new args token_idx + """ + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if 
return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + """ + Inherits from PhiForCausalLM: https://github.com/huggingface/transformers/blob/v4.37.1/src/transformers/models/phi/modeling_phi.py + The only differences are: + - add new args token_idx + - add token_idx into model_inputs + - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx + - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx + """ + token_idx = kwargs.get("token_idx", None) + + # Omit tokens covered by past_key_values + if past_key_values is not None: + if token_idx is None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
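+ # (Illustrative numbers) e.g. with max_cache_length=2048, cache_length=2048 and 8 new
+ # tokens, 2048 + 8 > 2048, so only the last 2048 attention-mask columns are kept.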
+ if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + else: + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + if token_idx is not None: + position_ids = torch.index_select(position_ids, 1, token_idx - 1) + else: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "token_idx": token_idx, + } + ) + return model_inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/__init__.py new file mode 100644 index 00000000000..b328c0e854e --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/__init__.py @@ -0,0 +1,7 @@ +from .modeling_speecht5 import ( + gaudi_generate_speech, + gaudi_SpeechT5Attention_forward, + gaudi_SpeechT5Decoder_forward, + gaudi_SpeechT5DecoderLayer_forward, + gaudi_SpeechT5SpeechDecoderPrenet_forward, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py new file mode 100644 index 00000000000..222622e0190 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py @@ -0,0 +1,552 @@ +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask +from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions +from transformers.models.speecht5.modeling_speecht5 import SpeechT5EncoderWithSpeechPrenet, SpeechT5PreTrainedModel +from transformers.utils import logging + +from ..modeling_attn_mask_utils import ( + _gaudi_prepare_4d_causal_attention_mask, +) + + +logger = logging.get_logger(__name__) + + +def gaudi_SpeechT5SpeechDecoderPrenet_forward( + self, + input_values: torch.Tensor, + speaker_embeddings: Optional[torch.Tensor] = None, +): + """ + Copied from SpeechT5SpeechDecoderPrenet.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/speecht5/modeling_speecht5.py + The only differences are: + - disable dropout in inference, or else hpu graph could not be used + """ + + inputs_embeds = input_values + for layer in self.layers: + inputs_embeds = nn.functional.relu(layer(inputs_embeds)) + if self.training: + inputs_embeds = self._consistent_dropout(inputs_embeds, self.config.speech_decoder_prenet_dropout) + + 
inputs_embeds = self.final_layer(inputs_embeds) + inputs_embeds = self.encode_positions(inputs_embeds) + + if speaker_embeddings is not None: + speaker_embeddings = nn.functional.normalize(speaker_embeddings) + speaker_embeddings = speaker_embeddings.unsqueeze(1).expand(-1, inputs_embeds.size(1), -1) + inputs_embeds = torch.cat([inputs_embeds, speaker_embeddings], dim=-1) + inputs_embeds = nn.functional.relu(self.speaker_embeds_layer(inputs_embeds)) + + return inputs_embeds + + +def gaudi_SpeechT5Attention_forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + position_bias: Optional[torch.Tensor] = None, + output_attentions: bool = False, + token_idx: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """ + Copied from SpeechT5Attention.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/speecht5/modeling_speecht5.py + The only differences are: + - add new args token_idx + """ + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + if token_idx is not None: + past_key_value[0].index_copy_(2, token_idx - 1, key_states) + past_key_value[1].index_copy_(2, token_idx - 1, value_states) + key_states = past_key_value[0] + value_states = past_key_value[1] + else: + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + # relative attention bias + if position_bias is not None: + reshape_q = query_states.contiguous().view(bsz * self.num_heads, -1, self.head_dim).transpose(0, 1) + rel_pos_bias = torch.matmul(reshape_q, position_bias.transpose(-2, -1)) + rel_pos_bias = rel_pos_bias.transpose(0, 1).view( + bsz * self.num_heads, position_bias.size(0), position_bias.size(1) + ) + attn_weights += rel_pos_bias + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. 
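+ # Heads were split into (bsz, num_heads, tgt_len, head_dim); merge them back before out_proj.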
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +def gaudi_SpeechT5DecoderLayer_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + token_idx: Optional[torch.Tensor] = None, +): + """ + Copied from SpeechT5DecoderLayer.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/speecht5/modeling_speecht5.py + The only differences are: + - add token_idx in self-attention + """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + token_idx=token_idx, + ) + hidden_states = self.dropout(hidden_states) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = self.dropout(hidden_states) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + hidden_states = hidden_states + self.feed_forward(hidden_states) + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +def gaudi_SpeechT5Decoder_forward( + self, + hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = 
None, + token_idx: Optional[torch.Tensor] = None, +) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + """ + Copied from SpeechT5Decoder.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/speecht5/modeling_speecht5.py + The only differences are: + - add token_idx args + - use _gaudi_prepare_4d_causal_attention_mask + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + input_shape = hidden_states.size()[:-1] + + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + attention_mask = _gaudi_prepare_4d_causal_attention_mask( + attention_mask, input_shape, hidden_states, past_key_values_length + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, hidden_states.dtype, tgt_len=input_shape[-1] + ) + + deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + next_decoder_cache = () if use_cache else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + if attn_mask.size()[0] != (len(self.layers)): + raise ValueError( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." 
+ ) + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + skip_the_layer = False + if self.training: + dropout_probability = torch.rand([]) + skip_the_layer = dropout_probability < self.layerdrop + if skip_the_layer and not deepspeed_zero3_is_enabled: + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + head_mask[idx] if head_mask is not None else None, + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + None, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + token_idx=token_idx, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attentions, all_cross_attentions] + if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +def gaudi_generate_speech( + model: SpeechT5PreTrainedModel, + input_values: torch.FloatTensor, + speaker_embeddings: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + threshold: float = 0.5, + minlenratio: float = 0.0, + maxlenratio: float = 20.0, + vocoder: Optional[nn.Module] = None, + output_cross_attentions: bool = False, + return_output_lengths: bool = False, +) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, torch.FloatTensor]]: + """ + Copied from _generate_speech: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/speecht5/modeling_speecht5.py + The only differences are: + - add hpu graph wrap + - add static shape support in kv-cache in _generate_speech + - disable speech_decoder_prenet_dropout to avoid variable output length + """ + if speaker_embeddings is None: + raise ValueError( + """`speaker_embeddings` must be specified. 
For example, you can use a speaker embeddings by following + the code snippet provided in this link: + https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors + """ + ) + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + + if not hasattr(model.speecht5.encoder, "clear_cache"): + model.speecht5.encoder = wrap_in_hpu_graph(model.speecht5.encoder) + if not hasattr(model.speecht5.decoder.wrapped_decoder, "clear_cache"): + model.speecht5.decoder.wrapped_decoder = wrap_in_hpu_graph(model.speecht5.decoder.wrapped_decoder) + if not hasattr(model.speecht5.decoder.prenet, "clear_cache"): + model.speecht5.decoder.prenet = wrap_in_hpu_graph(model.speecht5.decoder.prenet) + + if attention_mask is None: + encoder_attention_mask = 1 - (input_values == model.config.pad_token_id).int() + else: + encoder_attention_mask = attention_mask + + bsz = input_values.size(0) + encoder_out = model.speecht5.encoder( + input_values=input_values, + attention_mask=encoder_attention_mask, + return_dict=True, + ) + + encoder_last_hidden_state = encoder_out.last_hidden_state + + # downsample encoder attention mask + if isinstance(model.speecht5.encoder, SpeechT5EncoderWithSpeechPrenet): + encoder_attention_mask = model.speecht5.encoder.prenet._get_feature_vector_attention_mask( + encoder_out[0].shape[1], encoder_attention_mask + ) + + maxlen = int(encoder_last_hidden_state.size(1) * maxlenratio / model.config.reduction_factor) + minlen = int(encoder_last_hidden_state.size(1) * minlenratio / model.config.reduction_factor) + + # Start the output sequence with a mel spectrum that is all zeros. + output_sequence = encoder_last_hidden_state.new_zeros(bsz, 1, model.config.num_mel_bins) + output_sequence = torch.nn.functional.pad(output_sequence, (0, 0, 0, maxlen - 1), value=model.config.pad_token_id) + spectrogram = [] + cross_attentions = [] + past_key_values = None + idx = 0 + result_spectrogram = {} + token_idx = torch.tensor(1, device=output_sequence.device) + attention_mask = torch.zeros((bsz, maxlen), dtype=torch.long, device=output_sequence.device) + while True: + idx += 1 + attention_mask.index_fill_(1, token_idx - 1, 1) + # Run the decoder prenet on the entire output sequence. + decoder_hidden_states = model.speecht5.decoder.prenet(output_sequence, speaker_embeddings) + # Run the decoder layers on the last element of the prenet output. + decoder_out = model.speecht5.decoder.wrapped_decoder( + hidden_states=decoder_hidden_states + if past_key_values is None + else torch.index_select(decoder_hidden_states, 1, token_idx - 1), + attention_mask=attention_mask, + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=True, + output_attentions=output_cross_attentions, + return_dict=True, + token_idx=token_idx, + ) + + if output_cross_attentions: + cross_attentions.append(torch.cat(decoder_out.cross_attentions, dim=0)) + + last_decoder_output = decoder_out.last_hidden_state[:, 0:1, :].squeeze(1) + past_key_values = decoder_out.past_key_values + # Predict the new mel spectrum for this step in the sequence. + spectrum = model.speech_decoder_postnet.feat_out(last_decoder_output) + spectrum = spectrum.view(bsz, model.config.reduction_factor, model.config.num_mel_bins) + spectrogram.append(spectrum) + output_sequence.index_copy_(1, token_idx, spectrum[:, -1, :].view(bsz, 1, model.config.num_mel_bins)) + # Predict the probability that this is the stop token. 
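The loop above keeps every buffer at a fixed maximum length and advances a single `token_idx` instead of growing the sequence, which is what allows the wrapped HPU graphs to be replayed step after step. A toy CPU sketch of that in-place update pattern (invented sizes, a 1-D one-element index tensor for portability, no SpeechT5 modules involved):

# --- illustrative sketch, not part of the patch ---
import torch

bsz, maxlen, num_mel_bins = 2, 6, 4
output_sequence = torch.zeros(bsz, maxlen, num_mel_bins)
attention_mask = torch.zeros(bsz, maxlen, dtype=torch.long)
token_idx = torch.tensor([1])  # current generation step, 1-based

for _ in range(maxlen - 1):
    attention_mask.index_fill_(1, token_idx - 1, 1)                      # unmask the current position
    step_input = torch.index_select(output_sequence, 1, token_idx - 1)   # always a (bsz, 1, mel) slice
    new_frame = torch.randn(bsz, 1, num_mel_bins)                        # stands in for the decoder/postnet output
    output_sequence.index_copy_(1, token_idx, new_frame)                 # write the next input in place
    token_idx.add_(1)

# shapes never change during the loop, so a captured graph can be reused at every step
assert output_sequence.shape == (bsz, maxlen, num_mel_bins)
# --- end of sketch ---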
+ prob = torch.sigmoid(model.speech_decoder_postnet.prob_out(last_decoder_output)) + token_idx.add_(1) + # Finished when stop token or maximum length is reached. + if idx < minlen: + continue + else: + # If the generation loop is less than maximum length time, check the ones in the batch that have met + # the prob threshold. Otherwise, assume all have met thresholds and fill other spectrograms for the batch. + if idx < maxlen: + meet_thresholds = torch.sum(prob, dim=-1) >= threshold + meet_indexes = torch.where(meet_thresholds)[0].tolist() + else: + meet_indexes = range(len(prob)) + meet_indexes = [i for i in meet_indexes if i not in result_spectrogram] + if len(meet_indexes) > 0: + spectrograms = torch.stack(spectrogram) + spectrograms = spectrograms.transpose(0, 1).flatten(1, 2) + spectrograms = model.speech_decoder_postnet.postnet(spectrograms) + for meet_index in meet_indexes: + result_spectrogram[meet_index] = spectrograms[meet_index] + if len(result_spectrogram) >= bsz: + break + + spectrograms = [result_spectrogram[i] for i in range(len(result_spectrogram))] + if not return_output_lengths: + spectrogram = spectrograms[0] if bsz == 1 else torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True) + if vocoder is not None: + outputs = vocoder(spectrogram) + else: + outputs = spectrogram + if output_cross_attentions: + cross_attentions = torch.cat(cross_attentions, dim=2) + if bsz > 1: + cross_attentions = cross_attentions.view( + bsz, int(cross_attentions.size(0) / bsz), *cross_attentions.size()[-3:] + ) + outputs = (outputs, cross_attentions) + else: + # batched return values should also include the spectrogram/waveform lengths + spectrogram_lengths = [] + for i in range(bsz): + spectrogram_lengths.append(spectrograms[i].size(0)) + if vocoder is None: + spectrograms = torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True) + outputs = (spectrograms, spectrogram_lengths) + else: + waveforms = [] + spectrograms = torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True) + waveforms = vocoder(spectrograms) + waveform_lengths = [int(waveforms.size(1) / max(spectrogram_lengths)) * i for i in spectrogram_lengths] + outputs = (waveforms, waveform_lengths) + if output_cross_attentions: + cross_attentions = torch.cat(cross_attentions, dim=2) + cross_attentions = cross_attentions.view( + bsz, int(cross_attentions.size(0) / bsz), *cross_attentions.size()[-3:] + ) + outputs = (*outputs, cross_attentions) + return outputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/__init__.py new file mode 100644 index 00000000000..59dbee4d5d8 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/__init__.py @@ -0,0 +1 @@ +from .modeling_swin import gaudi_swin_get_attn_mask diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/modeling_swin.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/modeling_swin.py new file mode 100644 index 00000000000..9ea3b9d28cb --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/modeling_swin.py @@ -0,0 +1,52 @@ +# coding=utf-8 +# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Swin Transformer model.""" + +import torch +from transformers.models.swin.modeling_swin import window_partition + + +def gaudi_swin_get_attn_mask(self, height, width, dtype): + """ + Copied from SwinLayer.get_attn_mask : https://github.com/huggingface/transformers/blob/main/src/transformers/models/swin/modeling_swin.py + The only difference is moving img_mask to hpu for performance + """ + if self.shift_size > 0: + # calculate attention mask for SW-MSA + img_mask = torch.zeros((1, height, width, 1), dtype=dtype, device="hpu") + height_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + width_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + count = 0 + for height_slice in height_slices: + for width_slice in width_slices: + img_mask[:, height_slice, width_slice, :] = count + count += 1 + + mask_windows = window_partition(img_mask, self.window_size) + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + else: + attn_mask = None + + return attn_mask diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/__init__.py new file mode 100644 index 00000000000..e92116128d2 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/__init__.py @@ -0,0 +1,9 @@ +from .modeling_t5 import ( + gaudi_t5_layernorm_forward, + gaudi_T5Attention_forward, + gaudi_T5Block_forward, + gaudi_T5ForConditionalGeneration_forward, + gaudi_T5ForConditionalGeneration_prepare_inputs_for_generation, + gaudi_T5LayerSelfAttention_forward, + gaudi_T5Stack_forward, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py new file mode 100644 index 00000000000..17b0e49a97a --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py @@ -0,0 +1,636 @@ +import warnings +from typing import Optional, Tuple, Union + +import habana_frameworks.torch.core as htcore +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss +from transformers.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, +) +from transformers.models.t5.modeling_t5 import __HEAD_MASK_WARNING_MSG +from transformers.utils import ( + logging, +) + + +logger = logging.get_logger(__name__) + +try: + from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as FusedRMSNorm +except ImportError: + print("Not using HPU 
fused kernel for RMSNorm") + FusedRMSNorm = None + + +def gaudi_t5_layernorm_forward(self, hidden_states): + """ + Copied from T5LayerNorm.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py + The only differences are: + - override RMSNorm with Habana fused RMSNorm + """ + if hidden_states.device.type == "hpu" and FusedRMSNorm: + orig_dtype = hidden_states.dtype + hidden_states = FusedRMSNorm.apply(hidden_states.float(), self.weight.float(), self.variance_epsilon) + return hidden_states.to(orig_dtype) + else: + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states + + +def gaudi_T5Attention_forward( + self, + hidden_states, + mask=None, + key_value_states=None, + position_bias=None, + past_key_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, + output_attentions=False, + token_idx=None, +): + # Input is (batch_size, seq_length, dim) + # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) + # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + batch_size, seq_length = hidden_states.shape[:2] + + real_seq_length = seq_length + + if past_key_value is not None: + if len(past_key_value) != 2: + raise ValueError( + f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" + ) + if token_idx is None: + real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length + else: + real_seq_length = past_key_value[0].shape[2] + + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] + + def shape(states): + """projection""" + return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + def unshape(states): + """reshape""" + return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + + def project(hidden_states, proj_layer, key_value_states, past_key_value): + """projects hidden states correctly to key/query states""" + if key_value_states is None: + # self-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(hidden_states)) + elif past_key_value is None: + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) + if token_idx is None: + hidden_states = torch.cat([past_key_value, hidden_states], dim=2) + else: + hidden_states = past_key_value.index_copy_(-2, token_idx - 1, hidden_states) + elif past_key_value.shape[2] != key_value_states.shape[1]: + # checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(key_value_states)) + else: + # cross-attn + hidden_states = past_key_value + return hidden_states + + # get query states + query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) + + # get key/value states + key_states = project( + hidden_states, self.k, key_value_states, 
past_key_value[0] if past_key_value is not None else None + ) + value_states = project( + hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None + ) + + # compute scores + scores = torch.matmul( + query_states, key_states.transpose(3, 2) + ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype + ) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: + position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) + + # if key and values are already calculated + # we want only the last query position bias + if past_key_value is not None: + if token_idx is None: + position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + else: + position_bias = position_bias.index_select(-2, token_idx - 1) + + if mask is not None: + position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + + if self.pruned_heads: + mask = torch.ones(position_bias.shape[1]) + mask[list(self.pruned_heads)] = 0 + position_bias_masked = position_bias[:, mask.bool()] + else: + position_bias_masked = position_bias + + scores += position_bias_masked + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( + scores + ) # (batch_size, n_heads, seq_length, key_length) + if self.training: + htcore.mark_step() + attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) # (batch_size, n_heads, seq_length, key_length) + if self.training: + htcore.mark_step() + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = attn_weights * layer_head_mask + + attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) + + present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) + + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + +def gaudi_T5LayerSelfAttention_forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + token_idx=None, +): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.SelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them + return outputs + + +def gaudi_T5Block_forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + token_idx=None, +): + if past_key_value is not None: + if not self.is_decoder: + logger.warning("`past_key_values` is passed to the encoder. 
Please make sure this is intended.") + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + + if len(past_key_value) != expected_num_past_key_values: + raise ValueError( + f"There should be {expected_num_past_key_values} past states. " + f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" + f"Got {len(past_key_value)} past key / value states" + ) + + self_attn_past_key_value = past_key_value[:2] + cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=self_attn_past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] + attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights + + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: + # the actual query length is unknown for cross attention + # if using past key value states. Need to inject it here + if present_key_value_state is not None: + query_length = present_key_value_state[0].shape[2] + else: + query_length = None + + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # Combine self attn and cross attn key value states + if present_key_value_state is not None: + present_key_value_state = present_key_value_state + cross_attention_outputs[1] + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + outputs = (hidden_states,) + + if use_cache: + outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + + return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + + +def gaudi_T5Stack_forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + token_idx=None, +): + use_cache = use_cache if use_cache is not None else self.config.use_cache + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError( + f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" + 
) + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") + + if inputs_embeds is None: + if self.embed_tokens is None: + raise ValueError("You have to initialize the model with valid token embeddings") + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape + + # required mask seq length can be calculated via length of past + if token_idx is not None: + mask_seq_length = past_key_values[0][0].shape[2] if past_key_values is not None else seq_length + else: + mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length + + if use_cache is True: + if not self.is_decoder: + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") + + # initialize past_key_values with `None` if past does not exist + if past_key_values is None: + past_key_values = [None] * len(self.block) + + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) + present_key_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None + position_bias = None + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) + + for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.forward, + hidden_states, + extended_attention_mask, + position_bias, + encoder_hidden_states, + encoder_extended_attention_mask, + encoder_decoder_position_bias, + layer_head_mask, + cross_attn_layer_head_mask, + None, # past_key_value is always None with gradient checkpointing + use_cache, + output_attentions, + True, + None, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + ) + + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: + layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] + + hidden_states, present_key_value_state = layer_outputs[:2] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) + position_bias = layer_outputs[2] + if self.is_decoder and encoder_hidden_states is not None: + encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] + # append next layer key value states + if use_cache: + present_key_value_states = present_key_value_states + (present_key_value_state,) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) + if self.is_decoder: + all_cross_attentions = all_cross_attentions + (layer_outputs[5],) + + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + present_key_value_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=present_key_value_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + +def 
gaudi_T5ForConditionalGeneration_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.LongTensor] = None, +) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask + if head_mask is not None and decoder_head_mask is None: + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask + + # Encode if needed (training, first prediction pass) + if encoder_outputs is None: + # Convert encoder inputs in embeddings if needed + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + hidden_states = encoder_outputs[0] + + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) + + sequence_output = decoder_outputs[0] + + if self.config.tie_word_embeddings: + # Rescale output before projecting on vocab + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + sequence_output = sequence_output * (self.model_dim**-0.5) + + lm_logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-100) + # move labels to correct device to enable PP + labels = labels.to(lm_logits.device) + loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) + # TODO(thom): Add z_loss 
https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 + + if not return_dict: + output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs + return ((loss,) + output) if loss is not None else output + + return Seq2SeqLMOutput( + loss=loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +def gaudi_T5ForConditionalGeneration_prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + decoder_attention_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + token_idx=None, + **kwargs, +): + # cut decoder_input_ids if past_key_values is used + if past_key_values is not None: + if token_idx is not None: + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + else: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + return { + "decoder_input_ids": input_ids, + "past_key_values": past_key_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, + "token_idx": token_idx, + } diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/vit/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/vit/__init__.py new file mode 100644 index 00000000000..0d3835de543 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/vit/__init__.py @@ -0,0 +1 @@ +from .modeling_vit import gaudi_vit_self_attention_forward diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/vit/modeling_vit.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/vit/modeling_vit.py new file mode 100644 index 00000000000..4fd5990e14e --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/vit/modeling_vit.py @@ -0,0 +1,61 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
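Before the ViT changes: the T5 `prepare_inputs_for_generation` override above is the piece that keeps decoding shape-static on Gaudi, because with `token_idx` it always selects exactly one column rather than slicing off a variable-length prefix. A toy comparison with invented ids and lengths:

# --- illustrative sketch, not part of the patch ---
import torch

decoder_input_ids = torch.tensor([[5, 17, 23, 0, 0]])  # padded to a static max length
token_idx = torch.tensor([3])                          # currently generating position 3 (1-based)

# Gaudi path: always a (batch, 1) tensor, whatever the step -> reusable HPU graph
static_step = torch.index_select(decoder_input_ids, 1, token_idx - 1)
assert static_step.shape == (1, 1) and static_step.item() == 23

# default path: drop the prefix already covered by the KV cache -> shape varies per step
past_length = 2
dynamic_step = decoder_input_ids[:, past_length:]
assert dynamic_step.shape[1] == decoder_input_ids.shape[1] - past_length
# --- end of sketch ---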
+ +import math +from typing import Optional, Tuple, Union + +import torch + + +def gaudi_vit_self_attention_forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False +) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + """ + Same method as transformers.models.vit.modeling_vit.ViTSelfAttention.forward with a small tweak: + the division is performed before the matmul for computing attention scores. + This gives better performance on HPU. + """ + + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + # The div has been put inside the matmul because it achieves better performance on HPU. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2) / math.sqrt(self.attention_head_size)) + + # Normalize the attention scores to probabilities. + attention_probs = torch.nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/__init__.py new file mode 100644 index 00000000000..3a5bae22b8a --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/__init__.py @@ -0,0 +1,8 @@ +from .modeling_wav2vec2 import ( + _gaudi_wav2vec2_compute_mask_indices, + _gaudi_wav2vec2_mask_hidden_states, + _gaudi_wav2vec2_sample_negative_indices, + gaudi_wav2vec2_encoder_forward, + gaudi_wav2vec2_forward, + gaudi_wav2vec2_tdnnlayer_forward, +) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py new file mode 100644 index 00000000000..983c5b5375b --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py @@ -0,0 +1,376 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
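The HPU-friendly tweak in the ViT self-attention above is purely algebraic: scaling the keys before the matmul is equivalent to scaling the scores afterwards. A quick numerical check with throwaway tensors (not taken from the patch):

# --- illustrative sketch, not part of the patch ---
import math

import torch

head_dim = 64
query_layer = torch.randn(2, 12, 10, head_dim)  # (batch, heads, tokens, head_dim)
key_layer = torch.randn(2, 12, 10, head_dim)

scores_post = torch.matmul(query_layer, key_layer.transpose(-1, -2)) / math.sqrt(head_dim)
scores_pre = torch.matmul(query_layer, key_layer.transpose(-1, -2) / math.sqrt(head_dim))

assert torch.allclose(scores_post, scores_pre, atol=1e-5)
# --- end of sketch ---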
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple, Union + +import torch +from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +from transformers.modeling_outputs import ( + BaseModelOutput, + Wav2Vec2BaseModelOutput, +) + + +def _gaudi_wav2vec2_compute_mask_indices( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + attention_mask: Optional[torch.LongTensor] = None, + min_masks: int = 0, +) -> torch.Tensor: + """ + Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L135 + The only difference is that the processing is performed with PyTorch on HPUs (Numpy is used in Transformers). + """ + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" + ) + + # epsilon is used for probabilistic rounding + epsilon = torch.rand([], device="hpu") + + def compute_num_masked_span(input_length): + """Given input length, compute how many spans should be masked""" + num_masked_span = int(mask_prob * input_length / mask_length + epsilon) + num_masked_span = max(num_masked_span, min_masks) + + # make sure num masked span <= sequence_length + if num_masked_span * mask_length > sequence_length: + num_masked_span = sequence_length // mask_length + + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + + return num_masked_span + + # compute number of masked spans in batch + input_lengths = ( + attention_mask.sum(-1).detach().tolist() + if attention_mask is not None + else [sequence_length for _ in range(batch_size)] + ) + + # SpecAugment mask to fill + spec_aug_mask = torch.zeros((batch_size, sequence_length), dtype=torch.bool, device="hpu") + spec_aug_mask_idxs = [] + + max_num_masked_span = compute_num_masked_span(sequence_length) + + if max_num_masked_span == 0: + return spec_aug_mask + + for input_length in input_lengths: + # compute num of masked spans for this input + num_masked_span = compute_num_masked_span(input_length) + + # get random indices to mask + spec_aug_mask_idx = torch.randperm(input_length - (mask_length - 1), device="hpu")[:num_masked_span] + + # pick first sampled index that will serve as a dummy index to pad vector + # to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. 
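For orientation, the span-masking arithmetic that follows (broadcasting start indices against an `arange` of offsets, then scattering into a boolean mask) condenses to a few lines; the numbers below are invented and only illustrate the pattern:

# --- illustrative sketch, not part of the patch ---
import torch

mask_length = 3
start_idxs = torch.tensor([[1, 6], [0, 4]])  # (batch, spans) span start indices
offsets = torch.arange(mask_length)          # (mask_length,)
span_idxs = (start_idxs[:, :, None] + offsets).reshape(start_idxs.size(0), -1)

spec_aug_mask = torch.zeros(2, 10, dtype=torch.bool)
spec_aug_mask.scatter_(-1, span_idxs, 1)     # each start index becomes a full masked span
print(spec_aug_mask.int())
# --- end of sketch ---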
+ if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] + + spec_aug_mask_idx = torch.cat( + [ + spec_aug_mask_idx, + torch.ones(max_num_masked_span - num_masked_span, dtype=torch.int32, device="hpu") * dummy_mask_idx, + ] + ) + spec_aug_mask_idxs.append(spec_aug_mask_idx.to(dtype=torch.long)) + + spec_aug_mask_idxs = torch.vstack(spec_aug_mask_idxs) + + # expand masked indices to masked spans + spec_aug_mask_idxs = torch.broadcast_to( + spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length) + ) + spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) + + # add offset to the starting indexes so that indexes now create a span + offsets = torch.arange(mask_length, device="hpu")[None, None, :] + offsets = torch.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( + batch_size, max_num_masked_span * mask_length + ) + spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + + # scatter indices to mask + spec_aug_mask.scatter_(-1, spec_aug_mask_idxs, 1) + + return spec_aug_mask + + +def _gaudi_wav2vec2_sample_negative_indices( + features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[torch.Tensor] = None +): + """ + Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L254 + The only difference is that the processing is performed with PyTorch on HPUs (Numpy is used in Transformers). 
+ """ + batch_size, sequence_length = features_shape + + # generate indices of the positive vectors themselves, repeat them `num_negatives` times + sequence_length_range = torch.arange(sequence_length, device="hpu") + + # get `num_negatives` random vector indices from the same utterance + sampled_negative_indices = torch.zeros( + shape=(batch_size, sequence_length, num_negatives), dtype=torch.int32, device="hpu" + ) + + mask_time_indices = ( + mask_time_indices.bool() + if mask_time_indices is not None + else torch.ones(features_shape, dtype=torch.bool, device="hpu") + ) + + for batch_idx in range(batch_size): + high = mask_time_indices[batch_idx].sum() - 1 + mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]] + + feature_indices = torch.broadcast_to(torch.arange(high + 1, device="hpu")[:, None], (high + 1, num_negatives)) + sampled_indices = torch.randint(0, high, size=(high + 1, num_negatives), dtype=torch.int16, device="hpu") + # avoid sampling the same positive vector, but keep the distribution uniform + sampled_indices[sampled_indices >= feature_indices] += 1 + + # remap to actual indices + sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices] + + # correct for batch size + sampled_negative_indices[batch_idx] += batch_idx * sequence_length + + return sampled_negative_indices + + +def _gaudi_wav2vec2_mask_hidden_states( + self, + hidden_states: torch.FloatTensor, + mask_time_indices: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, +): + """ + Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L1227 + Differences are that (1) `mask_time_indices` is not moved to the current device and converted into boolean because this is already done in _compute_mask_indices. + (2) index_put operation on hidden_states is replaced by combination of simpler ops (more suitable for HPU graphs) + """ + + # `config.apply_spec_augment` can set masking to False + if not getattr(self.config, "apply_spec_augment", True): + return hidden_states + + # generate indices & apply SpecAugment along time axis + batch_size, sequence_length, hidden_size = hidden_states.size() + + if mask_time_indices is not None: + # apply SpecAugment along time axis with given mask_time_indices + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + elif self.config.mask_time_prob > 0 and self.training: + mask_time_indices = _gaudi_wav2vec2_compute_mask_indices( + (batch_size, sequence_length), + mask_prob=self.config.mask_time_prob, + mask_length=self.config.mask_time_length, + attention_mask=attention_mask, + min_masks=self.config.mask_time_min_masks, + ) + # replacement of index_put with combination of simpler ops. Assumption made about sizes of hidden_states (3d), + # mask_time_indices (2d), self.masked_spec_embed (1d), for any other combination better to go back to original code using index_put. 
+ # hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + inverse_mask_time_indices = torch.bitwise_not(mask_time_indices) + hidden_states = hidden_states * inverse_mask_time_indices.unsqueeze(2) + self.masked_spec_embed.to( + hidden_states.dtype + ).expand(hidden_states.size()) * mask_time_indices.unsqueeze(2) + + if self.config.mask_feature_prob > 0 and self.training: + # generate indices & apply SpecAugment along feature axis + mask_feature_indices = _gaudi_wav2vec2_compute_mask_indices( + (batch_size, hidden_size), + mask_prob=self.config.mask_feature_prob, + mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, + ) + mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) + mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) + hidden_states[mask_feature_indices] = 0 + + return hidden_states + + +def gaudi_wav2vec2_encoder_forward( + self, + hidden_states: torch.tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, +): + """ + Copied from Transformers: https://github.com/huggingface/transformers/blob/7790943c91411f4234d11dfbf4c2f21ce7caf088/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L755 + The only difference is that torch.rand device is set to 'hpu' (required to capture operation as part of HPU graph) + """ + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens output 0 + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask] = 0 + + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = torch.rand([], device="hpu") + + skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False + if not skip_the_layer or deepspeed_zero3_is_enabled: + # under deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = layer( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is 
not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +def gaudi_wav2vec2_forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + mask_time_indices: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple, Wav2Vec2BaseModelOutput]: + """ + Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L1282 + The only difference is that a clone of `hidden_states` is given to _mask_hidden_states to avoid an error. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + extract_features = self.feature_extractor(input_values) + extract_features = extract_features.transpose(1, 2) + + if attention_mask is not None: + # compute reduced attention_mask corresponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + hidden_states, extract_features = self.feature_projection(extract_features) + hidden_states = self._mask_hidden_states( + hidden_states.clone(), mask_time_indices=mask_time_indices, attention_mask=attention_mask + ) + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + + if self.adapter is not None: + hidden_states = self.adapter(hidden_states) + + if not return_dict: + return (hidden_states, extract_features) + encoder_outputs[1:] + + return Wav2Vec2BaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +def gaudi_wav2vec2_tdnnlayer_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Copied from Transformers: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L2290 + v4.38.2 implementation caused accuracy issue to run pytest Wav2Vec2RobustModelTest. 
+ """ + hidden_states = hidden_states.unsqueeze(1) + hidden_states = torch.nn.functional.unfold( + hidden_states, + (self.kernel_size, self.in_conv_dim), + stride=(1, self.in_conv_dim), + dilation=(self.dilation, 1), + ) + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.kernel(hidden_states) + + hidden_states = self.activation(hidden_states) + return hidden_states From e0613ad8a276cbb85b421b7c59523a0a7c1b9669 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 29 Mar 2024 06:12:13 +0000 Subject: [PATCH 02/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../modeling/modeling_gaudi/__init__.py | 14 ++++++++++ .../modeling_gaudi/generation/__init__.py | 14 ++++++++++ .../generation/configuration_utils.py | 14 ++++++++++ .../modeling_gaudi/generation/utils.py | 26 +++++++++---------- .../modeling_gaudi/models/__init__.py | 14 ++++++++++ .../modeling_gaudi/models/albert/__init__.py | 14 ++++++++++ .../modeling_gaudi/models/bart/__init__.py | 14 ++++++++++ .../modeling_gaudi/models/blip/__init__.py | 14 ++++++++++ .../models/blip/modeling_blip.py | 16 +++++++++++- .../models/blip/modeling_blip_text.py | 14 ++++++++++ .../modeling_gaudi/models/bloom/__init__.py | 14 ++++++++++ .../models/bloom/modeling_bloom.py | 2 +- .../modeling_gaudi/models/codegen/__init__.py | 14 ++++++++++ .../models/codegen/modeling_codegen.py | 14 ++++++++++ .../modeling_gaudi/models/esm/__init__.py | 14 ++++++++++ .../modeling_gaudi/models/falcon/__init__.py | 14 ++++++++++ .../models/falcon/modeling_falcon.py | 14 ++++++++++ .../modeling_gaudi/models/gpt2/__init__.py | 14 ++++++++++ .../models/gpt2/modeling_gpt2.py | 14 ++++++++++ .../models/gpt_bigcode/__init__.py | 14 ++++++++++ .../gpt_bigcode/modeling_gpt_bigcode.py | 14 ++++++++++ .../models/gpt_neox/__init__.py | 14 ++++++++++ .../models/gpt_neox/modeling_gpt_neox.py | 14 ++++++++++ .../modeling_gaudi/models/gptj/__init__.py | 14 ++++++++++ .../models/gptj/modeling_gptj.py | 14 ++++++++++ .../modeling_gaudi/models/llama/__init__.py | 14 ++++++++++ .../models/llama/modeling_llama.py | 16 +++++++++++- .../modeling_gaudi/models/mistral/__init__.py | 14 ++++++++++ .../models/mistral/modeling_mistral.py | 2 +- .../modeling_gaudi/models/mixtral/__init__.py | 14 ++++++++++ .../modeling_gaudi/models/mpt/__init__.py | 14 ++++++++++ .../modeling_gaudi/models/opt/__init__.py | 14 ++++++++++ .../modeling_gaudi/models/opt/modeling_opt.py | 14 ++++++++++ .../modeling_gaudi/models/phi/__init__.py | 14 ++++++++++ .../models/speecht5/__init__.py | 14 ++++++++++ .../models/speecht5/modeling_speecht5.py | 14 ++++++++++ .../modeling_gaudi/models/swin/__init__.py | 14 ++++++++++ .../modeling_gaudi/models/t5/__init__.py | 14 ++++++++++ .../modeling_gaudi/models/t5/modeling_t5.py | 14 ++++++++++ .../modeling_gaudi/models/vit/__init__.py | 14 ++++++++++ .../models/wav2vec2/__init__.py | 14 ++++++++++ 41 files changed, 549 insertions(+), 17 deletions(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/__init__.py index d6539dee1f5..bd23ecbaf15 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/__init__.py @@ -1 +1,15 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 
2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_utils import adapt_transformers_to_gaudi diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/__init__.py index 15f567b0be4..aa3bc167f37 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .configuration_utils import GaudiGenerationConfig from .stopping_criteria import ( gaudi_MaxLengthCriteria_call, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py index e75e48a7c7f..0fdf6d3701e 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from transformers.generation import GenerationConfig diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py index faec4696cd7..4daaa48a7c1 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py @@ -204,7 +204,7 @@ def _prepare_decoder_input_ids_for_generation( if isinstance(decoder_start_token_id, list): if len(decoder_start_token_id) != batch_size: raise ValueError( - f"`decoder_start_token_id` expcted to have length {batch_size} but got {len(decoder_start_token_id)}" + f"`decoder_start_token_id` expected to have length {batch_size} but got {len(decoder_start_token_id)}" ) decoder_input_ids_start = torch.tensor(decoder_start_token_id, dtype=torch.long, device=device) decoder_input_ids_start = decoder_input_ids_start.view(-1, 1) @@ -321,7 +321,7 @@ def update_model_kwargs_for_bucketing( model_kwargs["attention_mask"], (0, pad_amount), value=0 ) else: - assert False, "Not tested for cases where attn_mask isnt passed" + assert False, "Not tested for cases where attn_mask isn't passed" if reduce_recompile and params["passnum"] == 0: position_ids_cpu = model_kwargs["attention_mask"].long().cumsum(-1) - 1 position_ids_cpu.masked_fill_(model_kwargs["attention_mask"] == 0, 1) @@ -435,7 +435,7 @@ def generate( generating before other GPUs. Otherwise it'll be set to `False`. assistant_model (`PreTrainedModel`, *optional*): An assistant model that can be used to accelerate generation. The assistant model must have the exact - same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistent model + same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model is much faster than running generation with the model you're calling generate from. As such, the assistant model should be much smaller. streamer (`BaseStreamer`, *optional*): @@ -451,7 +451,7 @@ def generate( hpu_graphs (`bool`, *optional*, defaults to `False`): Whether to use HPU graphs for inference. profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. kwargs (`Dict[str, Any]`, *optional*): @@ -1181,7 +1181,7 @@ def contrastive_search( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -1294,7 +1294,7 @@ def greedy_search( ignore_eos (`bool`, *optional*, defaults to `False`): Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. 
model_kwargs: @@ -1643,7 +1643,7 @@ def sample( ignore_eos (`bool`, *optional*, defaults to `False`): Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -1976,7 +1976,7 @@ def beam_search( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -2564,7 +2564,7 @@ def beam_sample( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -2710,7 +2710,7 @@ def group_beam_search( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -2857,7 +2857,7 @@ def constrained_beam_search( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -3209,7 +3209,7 @@ def assisted_decoding( more information, the documentation of [`CandidateGenerator`] should be read. Only one of `assistant_model` or `candidate_generator` should be passed as input to this function. assistant_model (`PreTrainedModel`, *optional*): An assistant model that can be used to accelerate generation. The assistant model must have the exact - same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistent model + same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model is much faster than running generation with the model you're calling generate from. As such, the assistant model should be much smaller. do_sample (`bool`, *optional*, defaults to `False`): @@ -3246,7 +3246,7 @@ def assisted_decoding( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. 
streamer (`BaseStreamer`, *optional*): diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/__init__.py index d0eb8b2dcd6..4e03d0c28bd 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .albert import gaudi_albert_forward from .bart import ( gaudi_BartAttention_forward, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/albert/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/albert/__init__.py index fccea02a857..888aeaeb668 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/albert/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/albert/__init__.py @@ -1 +1,15 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_albert import gaudi_albert_forward diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/__init__.py index c8148194d83..872e7608a1d 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from .modeling_bart import ( gaudi_BartAttention_forward, gaudi_BartDecoder_forward, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/__init__.py index 6f105b11a2f..a6bb70a0ebc 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_blip import gaudi_BlipForConditionalGeneration_generate, gaudi_BlipForQuestionAnswering_generate from .modeling_blip_text import ( gaudi_BlipTextAttention_forward, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip.py index 6545a0662d2..2a31547669e 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Optional import torch @@ -72,7 +86,7 @@ def gaudi_BlipForQuestionAnswering_generate( Copied from BlipForQuestionAnswering.generate: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip.py#L1236 The only differences are: - wrap hpu graph for each part - - torch.full add dtype=torch.int64, or else the default type is torch.float32. lead to coredump in embeding layer + - torch.full add dtype=torch.int64, or else the default type is torch.float32. 
lead to coredump in embedding layer """ if generate_kwargs.get("hpu_graphs", True): from habana_frameworks.torch.hpu import wrap_in_hpu_graph diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip_text.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip_text.py index 23d4ee3f3c5..386d50a3d37 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip_text.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip_text.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math from typing import List, Optional, Tuple, Union diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/__init__.py index 8aa34e41459..216a62d37e6 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_bloom import ( GaudiBloomForCausalLM, GaudiBloomMLP, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py index a0a6d8c2d7e..bfc0ba2408d 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py @@ -491,7 +491,7 @@ def prepare_inputs_for_generation( else: input_ids = torch.index_select(input_ids, 1, token_idx - 1) - # the cache may be in the stardard format (e.g. in contrastive search), convert to bloom's format if needed + # the cache may be in the standard format (e.g. 
in contrastive search), convert to bloom's format if needed if past_key_values[0][0].shape[0] == input_ids.shape[0]: past_key_values = self._convert_to_bloom_cache(past_key_values) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/__init__.py index d433e24c8db..9604d68ae28 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_codegen import ( GaudiCodeGenAttention, GaudiCodeGenForCausalLM, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/modeling_codegen.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/modeling_codegen.py index b5680859710..0bf12e0676b 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/modeling_codegen.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/modeling_codegen.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Optional, Tuple, Union import torch diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/__init__.py index ca83d982924..c7814bdf7d0 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from .modeling_esmfold import ( gaudi_esm_for_protein_folding_forward, gaudi_esmfolding_trunk_forward, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/__init__.py index 44ac5451f6f..b8858e01cd1 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_falcon import ( GaudiFalconForCausalLM, GaudiFalconModel, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py index a10874bbc7b..d88fc879685 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import contextlib import math import warnings diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/__init__.py index 7a23f947267..1d70940d71e 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/__init__.py @@ -1 +1,15 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from .modeling_gpt2 import GaudiGPT2Attention, GaudiGPT2LMHeadModel, gaudi_gpt2_block_forward, gaudi_gpt2_forward diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/modeling_gpt2.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/modeling_gpt2.py index c48c71199b1..793b79fc0fb 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/modeling_gpt2.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/modeling_gpt2.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Optional, Tuple, Union import torch diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/__init__.py index 556f61f8c71..87506e5ec93 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_gpt_bigcode import ( GaudiGPTBigCodeForCausalLM, gaudi_gpt_bigcode_attention_forward, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py index 8059e338062..82957d1c324 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + from typing import List, Optional, Tuple, Union import torch diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/__init__.py index cceb114b826..067b3569a76 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_gpt_neox import ( GaudiGPTNeoXForCausalLM, gaudi_gpt_neox_attention_forward, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/modeling_gpt_neox.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/modeling_gpt_neox.py index 08f34333777..161d0ac2e20 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/modeling_gpt_neox.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/modeling_gpt_neox.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Optional, Tuple, Union import torch diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/__init__.py index 9b3b6a64340..7cb8093352f 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from .modeling_gptj import ( GaudiGPTJAttention, GaudiGPTJForCausalLM, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py index cc08d4d2c87..fcae80d4eaf 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Optional, Tuple, Union import torch diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/__init__.py index 20703ffd095..95b91020e49 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_llama import ( GaudiLlamaAttention, GaudiLlamaDecoderLayer, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py index fdf6d6c8639..4238710d92f 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import math import warnings from typing import List, Optional, Tuple, Union @@ -279,7 +293,7 @@ def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len, kv_cache_fp8): self.v_cache.allocate(inp_seq_len, kv_cache_fp8, dtype, device, cache_shape) def update_sincos_cache(self, seq_len): - # Call rotary emb forward() to update cos/sin cache when infering more than self.max_position_embeddings + # Call rotary emb forward() to update cos/sin cache when inferring more than self.max_position_embeddings # This helps in avoiding creation of these caches during actual model forward pass and # reduce memory consumption and improve performance. if seq_len > self.max_position_embeddings: diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/__init__.py index 192c2677918..faebb4dfd4e 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_mistral import ( GaudiMistralAttention, GaudiMistralDecoderLayer, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py index 24174ce1d34..f65ab019534 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py @@ -138,7 +138,7 @@ def allocate_kv_cache(self, batch_size, seq_len): self.past_value = torch.empty(kv_shape, dtype=dtype, device=device) def update_sincos_cache(self, seq_len): - # Call rotary emb forward() to update cos/sin cache when infering more than self.max_position_embeddings + # Call rotary emb forward() to update cos/sin cache when inferring more than self.max_position_embeddings # This helps in avoiding creation of these caches during actual model forward pass and # reduce memory consumption and improve performance. if seq_len > self.max_position_embeddings: diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/__init__.py index fd1829bbe20..b9175162ca8 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_mixtral import ( GaudiMixtralForCausalLM, gaudi_mixtral_attention_forward, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/__init__.py index 1ab41c1a805..316f1c184c8 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_mpt import ( GaudiMptForCausalLM, GaudiMptModel, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/__init__.py index 9ea5a435eed..24e90be065c 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_opt import ( GaudiOPTForCausalLM, GaudiOPTLearnedPositionalEmbedding, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/modeling_opt.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/modeling_opt.py index 6743594e355..a670eebdaa6 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/modeling_opt.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/modeling_opt.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import List, Optional, Tuple, Union import torch diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/__init__.py index 1a98f45f513..a38f4a0b5bf 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_phi import ( GaudiPhiForCausalLM, gaudi_phi_attention_forward, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/__init__.py index b328c0e854e..b268332087c 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_speecht5 import ( gaudi_generate_speech, gaudi_SpeechT5Attention_forward, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py index 222622e0190..138d7d234ac 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import List, Optional, Tuple, Union import torch diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/__init__.py index 59dbee4d5d8..73f1829791d 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/__init__.py @@ -1 +1,15 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_swin import gaudi_swin_get_attn_mask diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/__init__.py index e92116128d2..eabce5dfa47 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_t5 import ( gaudi_t5_layernorm_forward, gaudi_T5Attention_forward, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py index 17b0e49a97a..189317f80d2 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import warnings from typing import Optional, Tuple, Union diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/vit/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/vit/__init__.py index 0d3835de543..b73cc7feb47 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/vit/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/vit/__init__.py @@ -1 +1,15 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .modeling_vit import gaudi_vit_self_attention_forward diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/__init__.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/__init__.py index 3a5bae22b8a..872aff62fea 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from .modeling_wav2vec2 import ( _gaudi_wav2vec2_compute_mask_indices, _gaudi_wav2vec2_mask_hidden_states, From 69e81b3f5d55b1ec4a497d9ab6040f64c7591108 Mon Sep 17 00:00:00 2001 From: Chen Xi Date: Tue, 9 Apr 2024 02:45:27 +0000 Subject: [PATCH 03/25] Add test example to itrex Signed-off-by: Chen Xi --- examples/habana/run_generation.py | 235 +++++++++++++++++++ examples/habana/run_llama.sh | 17 ++ examples/habana/utils.py | 378 ++++++++++++++++++++++++++++++ 3 files changed, 630 insertions(+) create mode 100644 examples/habana/run_generation.py create mode 100644 examples/habana/run_llama.sh create mode 100644 examples/habana/utils.py diff --git a/examples/habana/run_generation.py b/examples/habana/run_generation.py new file mode 100644 index 00000000000..46207481acd --- /dev/null +++ b/examples/habana/run_generation.py @@ -0,0 +1,235 @@ +""" +Conditional text generation on Habana Gaudi/Gaudi2. +""" + +import argparse +import json +import logging +import math +import os +import time +from itertools import cycle +from pathlib import Path + +import torch +from utils import adjust_batch, count_hpu_graphs, initialize_model + +from optimum.habana.utils import get_hpu_memory_stats + + +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) +logger = logging.getLogger(__name__) + + +def setup_parser(parser): + # Arguments management + parser.add_argument("--device", "-d", type=str, choices=["hpu"], help="Device to run", default="hpu") + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model (on the HF Hub or locally).", + ) + parser.add_argument( + "--bf16", + action="store_true", + help="Whether to perform generation in bf16 precision.", + ) + parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.") + parser.add_argument("--size", type=int, default=19, help="Enlarge the input prompt") + parser.add_argument( + "--max_input_tokens", + type=int, + default=0, + help="If > 0 then pad and truncate the input sequences to this specified length of tokens. \ + if == 0, then truncate to 16 (original default) \ + if < 0, then do not truncate, use full input prompt", + ) + parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") + parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") + parser.add_argument("--local_rank", type=int, default=0, metavar="N", help="Local process rank.") + parser.add_argument( + "--use_kv_cache", + action="store_true", + help="Whether to use the key/value cache for decoding. It should speed up generation.", + ) + parser.add_argument( + "--use_hpu_graphs", + action="store_true", + help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", + ) + parser.add_argument( + "--num_beams", + default=1, + type=int, + help="Number of beams used for beam search generation. 
1 means greedy search will be performed.", + ) + parser.add_argument( + "--trim_logits", + action="store_true", + help="Calculate logits only for the last token to save memory in the first step.", + ) + parser.add_argument( + "--profiling_warmup_steps", + default=0, + type=int, + help="Number of steps to ignore for profiling.", + ) + parser.add_argument( + "--profiling_steps", + default=0, + type=int, + help="Number of steps to capture for profiling.", + ) + parser.add_argument( + "--prompt", + default=None, + type=str, + nargs="*", + help='Optional argument to give a prompt of your choice as input. Can be a single string (eg: --prompt "Hello world"), or a list of space-separated strings (eg: --prompt "Hello world" "How are you?")', + ) + parser.add_argument( + "--bucket_size", + default=-1, + type=int, + help="Bucket size to maintain static shapes. If this number is negative (default is -1) \ + then we use `shape = prompt_length + max_new_tokens`. If a positive number is passed \ + we increase the bucket in steps of `bucket_size` instead of allocating to max (`prompt_length + max_new_tokens`).", + ) + parser.add_argument( + "--bucket_internal", + action="store_true", + help="Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large.", + ) + parser.add_argument( + "--limit_hpu_graphs", + action="store_true", + help="Skip HPU Graph usage for first token to save memory", + ) + parser.add_argument("--fp8", action="store_true", help="Enable Quantization to fp8") + parser.add_argument( + "--torch_compile", + action="store_true", + help="Whether to use torch compiled model or not.", + ) + args = parser.parse_args() + #(TODO) we will use kv-cache in cpu side so we do not use hpu graphs + if args.torch_compile: + args.use_hpu_graphs = False + + if not args.use_hpu_graphs: + args.limit_hpu_graphs = False + + args.quant_config = os.getenv("QUANT_CONFIG", "") + return args + + +def main(): + parser = argparse.ArgumentParser() + args = setup_parser(parser) + model, tokenizer, generation_config = initialize_model(args, logger) + use_lazy_mode = True + if args.torch_compile and model.config.model_type == "llama": + use_lazy_mode = False + + import habana_frameworks.torch.hpu as torch_hpu + + # Benchmark over the prompts below + input_sentences = [p * args.size for p in args.prompt] + # (TODO) if we want to test multi-batch use this code + # input_sentences = [ + # "DeepSpeed is a machine learning framework", + # "He is working on", + # "He has a", + # "He got all", + # "Everyone is happy and I can", + # "The new movie that got Oscar this year", + # "In the far far distance from our galaxy,", + # "Peace is the only way", + # ] + + if args.batch_size > len(input_sentences): + # Dynamically extends to support larger batch sizes + num_sentences_to_add = args.batch_size - len(input_sentences) + for i in range(num_sentences_to_add): + input_sentences.append(input_sentences[i % len(input_sentences)]) + elif args.batch_size < len(input_sentences): + input_sentences = input_sentences[: args.batch_size] + + def generate(inputs, size=None): + """Generates sequences from the input sentences and returns them.""" + + # Tokenization + if args.max_input_tokens > 0: + input_tokens = tokenizer.batch_encode_plus( + inputs, + return_tensors="pt", + padding="max_length", + max_length=args.max_input_tokens, + truncation=True, + ) + else: + input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) + + if size is not None: + input_tokens = 
adjust_batch(input_tokens, size) + # Move inputs to target device(s) + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to(args.device) + + outputs = model.generate( + **input_tokens, + generation_config=generation_config, + lazy_mode=use_lazy_mode, + hpu_graphs=args.use_hpu_graphs, + profiling_steps=args.profiling_steps, + profiling_warmup_steps=args.profiling_warmup_steps, + ).cpu() + return tokenizer.batch_decode(outputs, skip_special_tokens=True) + + from optimum.habana.utils import HabanaProfile + + # compilation stage disable profiling + HabanaProfile.disable() + # Compilation + logger.info("Graph compilation...") + t0 = time.perf_counter() + # The first three iterations take longer because of graph compilation + for _ in range(args.warmup): + generate(input_sentences, None) + torch_hpu.synchronize() + compilation_duration = time.perf_counter() - t0 + + HabanaProfile.enable() + total_new_tokens_generated = 0 + logger.info("Running generate...") + t0 = time.perf_counter() + print(f"Graph compilation duration = {compilation_duration} seconds") + generated = generate(input_sentences, None) + duration = time.perf_counter() - t0 + total_new_tokens_generated = args.batch_size * args.max_new_tokens + throughput = total_new_tokens_generated / duration + + # (TODO) only open this when to check the accuracy of the output + # for i, input_sentence in enumerate(zip(input_sentences)): + # print(f"input {i+1}: {input_sentence}\noutput {i+1}: {generated[i]}") + + stats = f"Throughput (including tokenization) = {throughput} tokens/second" + stats = stats + f"\nNumber of HPU graphs = {count_hpu_graphs()}" + separator = "-" * 90 + print(separator) + print("The input token size is {}K ".format(args.size)) + print(stats) + mem = get_hpu_memory_stats() + for k, v in mem.items(): + print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v)) + print(separator) + +if __name__ == "__main__": + main() diff --git a/examples/habana/run_llama.sh b/examples/habana/run_llama.sh new file mode 100644 index 00000000000..d6aef3a9f83 --- /dev/null +++ b/examples/habana/run_llama.sh @@ -0,0 +1,17 @@ +for i in {1..1..2} +do + python run_generation.py \ + --use_hpu_graphs \ + --use_kv_cache \ + --max_new_tokens 32 \ + --size $i \ + --max_input_tokens -1 \ + --model_name_or_path /chenxi/models--01-ai--Yi-34B/snapshots/533e00ce927b9e5711445a991284671ac61c6834 \ + --fp8 \ + --batch_size 1 \ + --prompt "how are you ?" \ + # --prompt "It is done, and submitted. You can play 'Survival of the Tastiest' on the Android, and on the web. Playing on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. There is a lot I'd like to talk about. I will go through every topic, insted of making the typical what went right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of enemies/actors, simple graphics, maybe set in the space, controlled from a top-down view. I was confident that I could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game? 
In a game, you need to control something to reach an objective. That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that rubbed me the right way. Hence, my biggest dillema when deciding what to create was not with what I wanted to create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call it evolution. This is a problem, of course, every other contestant also had to face it. And judging by the entries submitted, not many managed to work around it. I'd say the only real solution was through the use of artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. Alas, this is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed myself to pick whatever I thought would work out. My initial idea was to create something where humanity tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. I kind of had this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey (who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, each with their own plate. Your plate can spawn little pieces of pasta. You do so by 'ordering' them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', which are debited from your credits (you start with a number of credits). Once spawned, your pastas start flying around. Their instinct is to fly to other plates, in order to conquer them (the objective of the game is having your pasta conquer all the plates on the table). But they are really autonomous, so after being spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other pastas your own pasta kill." \ + # --model_name_or_path /chenxi/models--meta-llama--Llama-2-7b-hf/snapshots/8a0442e81540efaeb1a0fe3e95477b5e0edfd423 \ + sleep 1 +done +echo "Test Done...." 
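Note for readers skimming this patch: run_generation.py above imports an `adjust_batch` helper (defined in examples/habana/utils.py, added below) that pads or truncates an already-tokenized batch to a fixed token length when a target size is passed to `generate()`, so that tensor shapes stay static on the HPU. The following minimal sketch is illustrative only and is not part of the patch; the function name `adjust_batch_sketch` and the sample tensors are made up for the example.

import torch

def adjust_batch_sketch(batch, size):
    # Prompts that are already long enough are simply truncated to `size` tokens.
    curr = batch["input_ids"].shape[1]
    if curr >= size:
        return {k: v[:, :size] for k, v in batch.items()}
    # Shorter prompts are right-padded by tiling the last column of each tensor.
    out = {}
    for k, v in batch.items():
        expanded = v[:, -1].tile((size - curr, 1)).T
        out[k] = torch.cat([v, expanded], dim=1)
    return out

batch = {"input_ids": torch.tensor([[1, 2, 3]]), "attention_mask": torch.tensor([[1, 1, 1]])}
print(adjust_batch_sketch(batch, 5)["input_ids"])  # tensor([[1, 2, 3, 3, 3]])

This mirrors the same pad-or-truncate logic the utils.py hunk below adds, only reduced to a standalone snippet.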
diff --git a/examples/habana/utils.py b/examples/habana/utils.py new file mode 100644 index 00000000000..ea473a85439 --- /dev/null +++ b/examples/habana/utils.py @@ -0,0 +1,378 @@ +import copy +import glob +import os +import shutil +import tempfile +import time +from pathlib import Path + +import torch +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from transformers.utils import check_min_version + +from optimum.habana.checkpoint_utils import ( + get_ds_injection_policy, + get_repo_root, + model_is_optimized, + model_on_meta, + write_checkpoints_json, +) +from optimum.habana.utils import check_habana_frameworks_version +from optimum.habana.utils import check_optimum_habana_min_version +from optimum.habana.utils import set_seed + +def adjust_batch(batch, size): + curr_size = batch["input_ids"].shape[1] + if curr_size >= size: + adjusted_batch = { + "input_ids": batch["input_ids"][:, :size], + "attention_mask": batch["attention_mask"][:, :size], + } + else: + adjusted_batch = {} + for k in batch.keys(): + last_colm = batch[k][:, -1] + expanded = last_colm.tile((size - curr_size, 1)).T + adjusted_batch[k] = torch.concat([batch[k], expanded], 1) + assert adjusted_batch["input_ids"].shape[1] == size + assert adjusted_batch["attention_mask"].shape[1] == size + return adjusted_batch + + +def override_print(enable): + import builtins as __builtin__ + + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop("force", False) + if force or enable: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def override_logger(logger, enable): + logger_info = logger.info + + def info(*args, **kwargs): + force = kwargs.pop("force", False) + if force or enable: + logger_info(*args, **kwargs) + + logger.info = info + + +def count_hpu_graphs(): + return len(glob.glob(".graph_dumps/*PreGraph*")) + + +def override_prints(enable, logger): + override_print(enable) + override_logger(logger, enable) + + +def setup_distributed(args): + args.local_rank = int(os.getenv("LOCAL_RANK", "0")) + args.world_size = int(os.getenv("WORLD_SIZE", "0")) + args.global_rank = int(os.getenv("RANK", "0")) + + +def setup_const_serialization(const_serialization_path): + import uuid + + const_serialization_path = os.path.join(const_serialization_path + uuid.uuid4().hex) + os.makedirs(const_serialization_path) + from habana_frameworks.torch.hpu import enable_const_section_serialization + + print("Serializing const params to {}".format(const_serialization_path)) + enable_const_section_serialization(const_serialization_path, False, True) + + +def setup_env(args): + # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
+ check_min_version("4.34.0") + # check_optimum_habana_min_version("1.9.0.dev0") + # TODO: SW-167588 - WA for memory issue in hqt prep_model + os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE") + # TODO let's set the lazy mode on + # os.environ.setdefault("PT_HPU_LAZY_MODE", "1") + # os.environ.setdefault("PT_HPU_MAX_COMPOUND_OP_SIZE", "1") + + if args.global_rank == 0 and not args.torch_compile: + os.environ.setdefault("GRAPH_VISUALIZATION", "true") + shutil.rmtree(".graph_dumps", ignore_errors=True) + + if args.world_size > 0: + os.environ.setdefault("PT_HPU_LAZY_ACC_PAR_MODE", "0") + os.environ.setdefault("PT_HPU_ENABLE_LAZY_COLLECTIVES", "true") + + # Tweak generation so that it runs faster on Gaudi + from intel_extension_for_transformers.transformers.modeling.modeling_gaudi import adapt_transformers_to_gaudi + + adapt_transformers_to_gaudi() + + +def setup_device(args): + if args.device == "hpu": + import habana_frameworks.torch.core as htcore + + if args.fp8: + htcore.hpu_set_env() + return torch.device(args.device) + +# patching LinearAllreduce to use ScopedLinearAllReduce +def patch_scoped_linear_all_reduce(model): + from deepspeed.module_inject.layers import LinearAllreduce + + from optimum.habana.transformers.models.modeling_all_models import ScopedLinearAllReduce + + for name, module in model.named_children(): + if type(module) is LinearAllreduce: + SL = ScopedLinearAllReduce(mod=module) + setattr(model, name, SL) + patch_scoped_linear_all_reduce(module) + + +def get_torch_compiled_model(model): + # model.model = torch.compile(model.model, backend="aot_hpu_inference_backend") + model.model = torch.compile(model.model) + return model + + +def setup_model(args, model_dtype, model_kwargs, logger): + logger.info("Single-device run.") + + model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) + if args.quant_config: + import habana_quantization_toolkit + + habana_quantization_toolkit.prep_model(model) + model = model.eval().to("hpu") + + if args.use_hpu_graphs: + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + + if check_habana_frameworks_version("1.13.0") and model.config.model_type == "falcon": + model = wrap_in_hpu_graph(model, hash_with_views=False) + else: + model = wrap_in_hpu_graph(model) + + if args.torch_compile and model.config.model_type == "llama": + model = get_torch_compiled_model(model) + + return model + + +def setup_distributed_model(args, model_dtype, model_kwargs, logger): + import deepspeed + + logger.info("DeepSpeed is enabled.") + deepspeed.init_distributed(dist_backend="hccl") + config = AutoConfig.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) + load_to_meta = model_on_meta(config) + + if load_to_meta: + # Construct model with fake meta tensors, later will be replaced on devices during ds-inference ckpt load + with deepspeed.OnDevice(dtype=model_dtype, device="meta"): + model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype) + + # Model loaded to meta is managed differently + checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w") + + # For PEFT models, write the merged model on disk to be able to load it on the meta device + if args.peft_model is not None: + merged_model_dir = "/tmp/text_generation_merged_peft_model" + if args.local_rank == 0: + if Path(merged_model_dir).is_dir(): + shutil.rmtree(merged_model_dir) + peft_model(args, model_dtype, logger, **model_kwargs).save_pretrained(merged_model_dir) + 
torch.distributed.barrier() + + write_checkpoints_json( + merged_model_dir if args.peft_model is not None else args.model_name_or_path, + args.local_rank, + checkpoints_json, + token=args.token, + ) + else: + # TODO: revisit placement on CPU when auto-injection is possible + with deepspeed.OnDevice(dtype=model_dtype, device="cpu"): + if args.peft_model is not None: + model = peft_model(args, model_dtype, logger, **model_kwargs) + else: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs + ) + model.eval() + + # Initialize the model + ds_inference_kwargs = {"dtype": model_dtype} + ds_inference_kwargs["tensor_parallel"] = {"tp_size": args.world_size} + ds_inference_kwargs["enable_cuda_graph"] = args.use_hpu_graphs + ds_inference_kwargs["injection_policy"] = get_ds_injection_policy(config) + if load_to_meta: + ds_inference_kwargs["checkpoint"] = checkpoints_json.name + + model = deepspeed.init_inference(model, **ds_inference_kwargs) + model = model.module + if model.config.model_type in ["llama", "falcon"]: + patch_scoped_linear_all_reduce(model) + + if args.quant_config: + import habana_quantization_toolkit + + habana_quantization_toolkit.prep_model(model) + + if args.torch_compile and model.config.model_type == "llama": + model = get_torch_compiled_model(model) + + return model + + +def peft_model(args, model_dtype, logger, **model_kwargs): + import importlib.util + + if importlib.util.find_spec("peft") is None: + raise ImportError("The `peft` package is not installed, please run: `pip install peft`.") + from peft import AutoPeftModelForCausalLM + from peft.config import PeftConfigMixin + + base_model_name = PeftConfigMixin.from_pretrained( + args.peft_model, + token=model_kwargs["token"] if "token" in model_kwargs else None, + ).base_model_name_or_path + + base_model_is_local = Path(base_model_name).is_dir() + if not base_model_is_local: + # Check if the base model path to a remote repository on the HF Hub exists + from huggingface_hub import list_repo_files + + try: + list_repo_files(base_model_name) + base_model_is_remote = True + except Exception: + base_model_is_remote = False + + if base_model_is_local or base_model_is_remote: + model = AutoPeftModelForCausalLM.from_pretrained(args.peft_model, torch_dtype=model_dtype, **model_kwargs) + else: + # Since the base model doesn't exist locally nor remotely, use `args.model_name_or_path` as the base model + logger.warning( + f"The base model `{base_model_name}` of the LoRA configuration associated" + f" to `{args.peft_model}` does not exist locally or remotely. Using " + f"`--model_name_or_path {args.model_name_or_path}` as a fall back for the base model." 
+ ) + from peft import PeftModel + + model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) + model = PeftModel.from_pretrained(model, args.peft_model, torch_dtype=model_dtype, **model_kwargs) + + return model.merge_and_unload() + + +def setup_tokenizer(args, model): + tokenizer_kwargs = { + "revision": "main", + "token": None, + } + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, **tokenizer_kwargs) + if not model.config.is_encoder_decoder: + tokenizer.padding_side = "left" + # Some models like GPT2 do not have a PAD token so we have to set it if necessary + if model.config.model_type == "llama": + # unwind broken decapoda-research config + model.generation_config.pad_token_id = 0 + model.generation_config.bos_token_id = 1 + model.generation_config.eos_token_id = 2 + tokenizer.bos_token_id = model.generation_config.bos_token_id + tokenizer.eos_token_id = model.generation_config.eos_token_id + tokenizer.pad_token_id = model.generation_config.pad_token_id + tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id) + tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id) + tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id) + + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + model.generation_config.pad_token_id = model.generation_config.eos_token_id + return tokenizer, model + + +def setup_generation_config(args, model, tokenizer): + bad_words_ids = None + force_words_ids = None + + is_optimized = model_is_optimized(model.config) + # Generation configuration + generation_config = copy.deepcopy(model.generation_config) + generation_config.max_new_tokens = args.max_new_tokens + generation_config.use_cache = args.use_kv_cache + generation_config.static_shapes = True + generation_config.bucket_size = args.bucket_size if is_optimized else -1 + # generation_config.bucket_size = args.bucket_size + # (TODO) bucket internal will increase the shape length + generation_config.bucket_internal = False + # generation_config.do_sample = args.do_sample + generation_config.num_beams = args.num_beams + generation_config.bad_words_ids = bad_words_ids + generation_config.force_words_ids = force_words_ids + # generation_config.num_return_sequences = args.num_return_sequences + generation_config.trim_logits = args.trim_logits + # TODO notice here why can't use softmax_bf16 + generation_config.attn_softmax_bf16 = False + generation_config.limit_hpu_graphs = args.limit_hpu_graphs + # TODO why reuse cache and reduce recompile false + generation_config.reuse_cache = False + generation_config.reduce_recompile = False + if generation_config.reduce_recompile: + assert generation_config.bucket_size > 0 + # TODO this will also influence + generation_config.use_flash_attention = False + return generation_config + + +def initialize_model(args, logger): + init_start = time.perf_counter() + setup_distributed(args) + override_prints(args.global_rank == 0 or args.verbose_workers, logger) + setup_env(args) + setup_device(args) + set_seed(27) + get_repo_root(args.model_name_or_path, local_rank=args.local_rank, token=None) + use_deepspeed = args.world_size > 0 + if use_deepspeed or args.bf16 or args.fp8: + model_dtype = torch.bfloat16 + else: + model_dtype = torch.float + args.attn_softmax_bf16 = False + + model_kwargs = { + "revision": "main", + "token":None, + } + + model_kwargs["device_map"] = "auto" + model_kwargs["offload_folder"] = "/tmp/offload_folder/" + + model = ( + setup_model(args, 
model_dtype, model_kwargs, logger) + if not use_deepspeed + else setup_distributed_model(args, model_dtype, model_kwargs, logger) + ) + tokenizer, model = setup_tokenizer(args, model) + generation_config = setup_generation_config(args, model, tokenizer) + + # if args.const_serialization_path: + # setup_const_serialization(args.const_serialization_path) + if args.fp8: + import habana_frameworks.torch.core as htcore + + print("Initializing inference mode") + const_marking = os.getenv("ENABLE_CONST_MARKING", "True") + if const_marking == "True": + htcore.hpu_initialize(model) + init_end = time.perf_counter() + # logger.info(f"Args: {args}") + logger.info(f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}") + logger.info(f"Model initialization took {(init_end - init_start):.3f}s") + return model, tokenizer, generation_config From 6454315737381687c5b0e89432d611e5bc376cef Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 9 Apr 2024 03:23:32 +0000 Subject: [PATCH 04/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/habana/run_llama.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/habana/run_llama.sh b/examples/habana/run_llama.sh index d6aef3a9f83..90a29747915 100644 --- a/examples/habana/run_llama.sh +++ b/examples/habana/run_llama.sh @@ -10,7 +10,7 @@ do --fp8 \ --batch_size 1 \ --prompt "how are you ?" \ - # --prompt "It is done, and submitted. You can play 'Survival of the Tastiest' on the Android, and on the web. Playing on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. There is a lot I'd like to talk about. I will go through every topic, insted of making the typical what went right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of enemies/actors, simple graphics, maybe set in the space, controlled from a top-down view. I was confident that I could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that rubbed me the right way. Hence, my biggest dillema when deciding what to create was not with what I wanted to create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call it evolution. This is a problem, of course, every other contestant also had to face it. And judging by the entries submitted, not many managed to work around it. I'd say the only real solution was through the use of artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. 
Alas, this is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed myself to pick whatever I thought would work out. My initial idea was to create something where humanity tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. I kind of had this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey (who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, each with their own plate. Your plate can spawn little pieces of pasta. You do so by 'ordering' them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', which are debited from your credits (you start with a number of credits). Once spawned, your pastas start flying around. Their instinct is to fly to other plates, in order to conquer them (the objective of the game is having your pasta conquer all the plates on the table). But they are really autonomous, so after being spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other pastas your own pasta kill." \ + # --prompt "It is done, and submitted. You can play 'Survival of the Tastiest' on the Android, and on the web. Playing on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. There is a lot I'd like to talk about. I will go through every topic, instead of making the typical what went right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of enemies/actors, simple graphics, maybe set in the space, controlled from a top-down view. I was confident that I could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. That control goes against what evolution is supposed to be like. 
If you allow the user to pick how to evolve something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that rubbed me the right way. Hence, my biggest dilemma when deciding what to create was not with what I wanted to create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call it evolution. This is a problem, of course, every other contestant also had to face it. And judging by the entries submitted, not many managed to work around it. I'd say the only real solution was through the use of artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. Alas, this is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed myself to pick whatever I thought would work out. My initial idea was to create something where humanity tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. I kind of had this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey (who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, each with their own plate. Your plate can spawn little pieces of pasta. You do so by 'ordering' them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', which are debited from your credits (you start with a number of credits). Once spawned, your pastas start flying around. Their instinct is to fly to other plates, in order to conquer them (the objective of the game is having your pasta conquer all the plates on the table). But they are really autonomous, so after being spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other pastas your own pasta kill." 
\ # --model_name_or_path /chenxi/models--meta-llama--Llama-2-7b-hf/snapshots/8a0442e81540efaeb1a0fe3e95477b5e0edfd423 \ sleep 1 done From 72a491042175e9a579bdb53065ae0b493b6ee328 Mon Sep 17 00:00:00 2001 From: Chen Xi Date: Mon, 15 Apr 2024 09:10:34 +0000 Subject: [PATCH 05/25] add fp8 support and fix bugs Signed-off-by: Chen Xi --- ...xabs_hw_weights_pcs_maxabs_pow2_quant.json | 10 + .../quantization_config/maxabs_measure.json | 9 + .../maxabs_measure_include_outputs.json | 10 + .../quantization_config/maxabs_quant.json | 10 + .../maxabs_quant_mixtral.json | 13 ++ .../quantization_config/unit_scale_quant.json | 10 + examples/habana/run_generation.py | 14 +- examples/habana/utils.py | 30 ++- .../generation/configuration_utils.py | 20 +- .../modeling_gaudi/generation/utils.py | 80 +++++--- .../models/llama/modeling_llama.py | 188 ++++++++++-------- .../models/mistral/modeling_mistral.py | 4 +- 12 files changed, 253 insertions(+), 145 deletions(-) create mode 100644 examples/habana/quantization_config/act_maxabs_hw_weights_pcs_maxabs_pow2_quant.json create mode 100644 examples/habana/quantization_config/maxabs_measure.json create mode 100644 examples/habana/quantization_config/maxabs_measure_include_outputs.json create mode 100644 examples/habana/quantization_config/maxabs_quant.json create mode 100644 examples/habana/quantization_config/maxabs_quant_mixtral.json create mode 100644 examples/habana/quantization_config/unit_scale_quant.json diff --git a/examples/habana/quantization_config/act_maxabs_hw_weights_pcs_maxabs_pow2_quant.json b/examples/habana/quantization_config/act_maxabs_hw_weights_pcs_maxabs_pow2_quant.json new file mode 100644 index 00000000000..602a147baab --- /dev/null +++ b/examples/habana/quantization_config/act_maxabs_hw_weights_pcs_maxabs_pow2_quant.json @@ -0,0 +1,10 @@ +{ + "method": "HOOKS", + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "ACT_MAXABS_POW2_WEIGHTS_PCS_OPT_POW2", + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, + "dump_stats_path": "./hqt_output/measure", + "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx" +} diff --git a/examples/habana/quantization_config/maxabs_measure.json b/examples/habana/quantization_config/maxabs_measure.json new file mode 100644 index 00000000000..3645fe743a2 --- /dev/null +++ b/examples/habana/quantization_config/maxabs_measure.json @@ -0,0 +1,9 @@ +{ + "method": "HOOKS", + "mode": "MEASURE", + "observer": "maxabs", + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, + "dump_stats_path": "./hqt_output/measure", + "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx" +} \ No newline at end of file diff --git a/examples/habana/quantization_config/maxabs_measure_include_outputs.json b/examples/habana/quantization_config/maxabs_measure_include_outputs.json new file mode 100644 index 00000000000..6de845a54d8 --- /dev/null +++ b/examples/habana/quantization_config/maxabs_measure_include_outputs.json @@ -0,0 +1,10 @@ +{ + "method": "HOOKS", + "mode": "MEASURE", + "observer": "maxabs", + "measure_exclude": "NONE", + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, + "dump_stats_path": "./hqt_output/measure", + "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx" +} \ No newline at end of file diff --git a/examples/habana/quantization_config/maxabs_quant.json b/examples/habana/quantization_config/maxabs_quant.json new file mode 100644 index 00000000000..02314a728e6 --- /dev/null +++ 
b/examples/habana/quantization_config/maxabs_quant.json @@ -0,0 +1,10 @@ +{ + "method": "HOOKS", + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "maxabs_hw", + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, + "dump_stats_path": "./hqt_output/measure", + "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx" +} \ No newline at end of file diff --git a/examples/habana/quantization_config/maxabs_quant_mixtral.json b/examples/habana/quantization_config/maxabs_quant_mixtral.json new file mode 100644 index 00000000000..737edcc4130 --- /dev/null +++ b/examples/habana/quantization_config/maxabs_quant_mixtral.json @@ -0,0 +1,13 @@ +{ + "method": "HOOKS", + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "maxabs_hw", + "whitelist": {"types": [], "names": ["gate","w1","w3","w2"]}, + "blacklist": {"types": [], "names": [ + "model.layers.1.block_sparse_moe.experts.(3|4).w2", + "model.layers.[29-31].block_sparse_moe.experts.[0-7].w2" + ]}, + "dump_stats_path": "./hqt_output/measure", + "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx" +} \ No newline at end of file diff --git a/examples/habana/quantization_config/unit_scale_quant.json b/examples/habana/quantization_config/unit_scale_quant.json new file mode 100644 index 00000000000..caad4bb2a4f --- /dev/null +++ b/examples/habana/quantization_config/unit_scale_quant.json @@ -0,0 +1,10 @@ +{ + "method": "HOOKS", + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "unit_scale", + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, + "dump_stats_path": "./hqt_output/measure", + "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx" +} diff --git a/examples/habana/run_generation.py b/examples/habana/run_generation.py index 46207481acd..5f4f3143e2e 100644 --- a/examples/habana/run_generation.py +++ b/examples/habana/run_generation.py @@ -13,9 +13,7 @@ import torch from utils import adjust_batch, count_hpu_graphs, initialize_model - -from optimum.habana.utils import get_hpu_memory_stats - +from utils import print_memory_stats logging.basicConfig( format="%(asctime)s - %(levelname)s - %(message)s", @@ -128,11 +126,11 @@ def setup_parser(parser): args.quant_config = os.getenv("QUANT_CONFIG", "") return args - def main(): parser = argparse.ArgumentParser() args = setup_parser(parser) model, tokenizer, generation_config = initialize_model(args, logger) + # print_memory_stats() use_lazy_mode = True if args.torch_compile and model.config.model_type == "llama": use_lazy_mode = False @@ -204,6 +202,7 @@ def generate(inputs, size=None): for _ in range(args.warmup): generate(input_sentences, None) torch_hpu.synchronize() + # print_memory_stats() compilation_duration = time.perf_counter() - t0 HabanaProfile.enable() @@ -226,10 +225,11 @@ def generate(inputs, size=None): print(separator) print("The input token size is {}K ".format(args.size)) print(stats) - mem = get_hpu_memory_stats() - for k, v in mem.items(): - print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v)) + print_memory_stats() print(separator) + if args.quant_config: + import habana_quantization_toolkit + habana_quantization_toolkit.finish_measurements(model) if __name__ == "__main__": main() diff --git a/examples/habana/utils.py b/examples/habana/utils.py index ea473a85439..9b1482d7346 100644 --- a/examples/habana/utils.py +++ b/examples/habana/utils.py @@ -21,6 +21,16 @@ from optimum.habana.utils import check_optimum_habana_min_version from 
optimum.habana.utils import set_seed +def print_memory_stats(p_info=""): + from optimum.habana.utils import get_hpu_memory_stats + separator = "-" * 90 + print(separator) + print("{}".format(p_info)) + mem = get_hpu_memory_stats() + for k, v in mem.items(): + print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v)) + print(separator) + def adjust_batch(batch, size): curr_size = batch["input_ids"].shape[1] if curr_size >= size: @@ -142,13 +152,15 @@ def get_torch_compiled_model(model): def setup_model(args, model_dtype, model_kwargs, logger): logger.info("Single-device run.") - model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) if args.quant_config: import habana_quantization_toolkit habana_quantization_toolkit.prep_model(model) - model = model.eval().to("hpu") + + model = model.eval() + # import pdb; pdb.set_trace() + model = model.to("hpu") if args.use_hpu_graphs: from habana_frameworks.torch.hpu import wrap_in_hpu_graph @@ -319,10 +331,10 @@ def setup_generation_config(args, model, tokenizer): # generation_config.num_return_sequences = args.num_return_sequences generation_config.trim_logits = args.trim_logits # TODO notice here why can't use softmax_bf16 - generation_config.attn_softmax_bf16 = False + generation_config.attn_softmax_bf16 = True generation_config.limit_hpu_graphs = args.limit_hpu_graphs # TODO why reuse cache and reduce recompile false - generation_config.reuse_cache = False + generation_config.reuse_cache = True generation_config.reduce_recompile = False if generation_config.reduce_recompile: assert generation_config.bucket_size > 0 @@ -339,7 +351,8 @@ def initialize_model(args, logger): setup_device(args) set_seed(27) get_repo_root(args.model_name_or_path, local_rank=args.local_rank, token=None) - use_deepspeed = args.world_size > 0 + use_deepspeed = args.world_size > 1 + # import pdb; pdb.set_trace() if use_deepspeed or args.bf16 or args.fp8: model_dtype = torch.bfloat16 else: @@ -368,9 +381,10 @@ def initialize_model(args, logger): import habana_frameworks.torch.core as htcore print("Initializing inference mode") - const_marking = os.getenv("ENABLE_CONST_MARKING", "True") - if const_marking == "True": - htcore.hpu_initialize(model) + # const_marking = os.getenv("ENABLE_CONST_MARKING", "True") + # if const_marking == "True": + # TODO always initialize model + htcore.hpu_initialize(model) init_end = time.perf_counter() # logger.info(f"Args: {args}") logger.info(f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}") diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py index 0fdf6d3701e..61585b559f9 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py @@ -1,17 +1,3 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - from transformers.generation import GenerationConfig @@ -43,12 +29,12 @@ class GaudiGenerationConfig(GenerationConfig): Only active if `static_shapes` is used. Can't be used with `reuse_cache`. bucket_internal (`bool`, *optional*): Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large. - kv_cache_fp8 (`bool`, *optional*): - Store kv-cache in float8 when kv-cache is used use_flash_attention (`bool`, *optional*): Whether to use flash attention optimization. flash_attention_recompute (`bool`, *optional*): Whether to enable recompute if use Habana flash attention. + flash_attention_causal_mask (`bool`, *optional*): + Whether to enable causal_mask if use Habana flash attention. """ def __init__(self, **kwargs): @@ -62,7 +48,7 @@ def __init__(self, **kwargs): self.bucket_size = kwargs.get("bucket_size", -1) self.bucket_internal = kwargs.get("bucket_internal", None) self.reduce_recompile = kwargs.get("reduce_recompile", None) - self.kv_cache_fp8 = kwargs.get("kv_cache_fp8", None) self.use_flash_attention = kwargs.get("use_flash_attention", None) self.flash_attention_recompute = kwargs.get("flash_attention_recompute", None) + self.flash_attention_causal_mask = kwargs.get("flash_attention_causal_mask", None) self.use_fused_rope = kwargs.get("use_fused_rope", None) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py index 4daaa48a7c1..a4b2f1c6a30 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py @@ -204,7 +204,7 @@ def _prepare_decoder_input_ids_for_generation( if isinstance(decoder_start_token_id, list): if len(decoder_start_token_id) != batch_size: raise ValueError( - f"`decoder_start_token_id` expected to have length {batch_size} but got {len(decoder_start_token_id)}" + f"`decoder_start_token_id` expcted to have length {batch_size} but got {len(decoder_start_token_id)}" ) decoder_input_ids_start = torch.tensor(decoder_start_token_id, dtype=torch.long, device=device) decoder_input_ids_start = decoder_input_ids_start.view(-1, 1) @@ -321,7 +321,7 @@ def update_model_kwargs_for_bucketing( model_kwargs["attention_mask"], (0, pad_amount), value=0 ) else: - assert False, "Not tested for cases where attn_mask isn't passed" + assert False, "Not tested for cases where attn_mask isnt passed" if reduce_recompile and params["passnum"] == 0: position_ids_cpu = model_kwargs["attention_mask"].long().cumsum(-1) - 1 position_ids_cpu.masked_fill_(model_kwargs["attention_mask"] == 0, 1) @@ -345,20 +345,32 @@ def create_pad_arg(pad_amount, i, j): assert False, "Unknown case, please handle, or dont use bucketing" new_kv = [None for i in range(len(model_kwargs["past_key_values"]))] - for i in range(len(model_kwargs["past_key_values"])): - tmp_lst = [None for j in range(len(model_kwargs["past_key_values"][i]))] - for j in range(len(model_kwargs["past_key_values"][i])): - 
pad_tuple = create_pad_arg(pad_amount, i, j) - # Different models might have different shapes of kv-cache - # create_pad_arg handles them on a per-model basis - # This is a necessary (but not sufficient) condition: what ever dimension we are padding, should be a multiple of bucket_size - # This check is added in case we get a new model with a new kv-cache structure, and we attempt to pad some wrong dimension - assert model_kwargs["past_key_values"][i][j].shape[-(len(pad_tuple) // 2)] % bucket_size == 0 - tmp_lst[j] = torch.nn.functional.pad( - model_kwargs["past_key_values"][i][j], pad_tuple, value=pad_token_id + if self.config.model_type == "gpt_bigcode" and model_kwargs["past_key_values"][0][0].dim() == 2: + # GPT_BIGCODE's kv cache is list of tensors. + new_kv = [None for i in range(len(model_kwargs["past_key_values"]))] + for i in range(len(model_kwargs["past_key_values"])): + pad = (0, 0, 0, pad_amount) + new_kv[i] = torch.nn.functional.pad( + model_kwargs["past_key_values"][i], pad, value=pad_token_id ) - new_kv[i] = tuple(tmp_lst) - model_kwargs["past_key_values"] = tuple(new_kv) + model_kwargs["past_key_values"] = list(new_kv) + else: + for i in range(len(model_kwargs["past_key_values"])): + tmp_lst = [None for j in range(len(model_kwargs["past_key_values"][i]))] + for j in range(len(model_kwargs["past_key_values"][i])): + pad_tuple = create_pad_arg(pad_amount, i, j) + # Different models might have different shapes of kv-cache + # create_pad_arg handles them on a per-model basis + # This is a necessary (but not sufficient) condition: what ever dimension we are padding, should be a multiple of bucket_size + # This check is added in case we get a new model with a new kv-cache structure, and we attempt to pad some wrong dimension + assert ( + model_kwargs["past_key_values"][i][j].shape[-(len(pad_tuple) // 2)] % bucket_size == 0 + ) + tmp_lst[j] = torch.nn.functional.pad( + model_kwargs["past_key_values"][i][j], pad_tuple, value=pad_token_id + ) + new_kv[i] = tuple(tmp_lst) + model_kwargs["past_key_values"] = tuple(new_kv) if "token_idx" not in model_kwargs: model_kwargs["token_idx"] = torch.tensor(params["token_idx"], device=self.device) @@ -435,7 +447,7 @@ def generate( generating before other GPUs. Otherwise it'll be set to `False`. assistant_model (`PreTrainedModel`, *optional*): An assistant model that can be used to accelerate generation. The assistant model must have the exact - same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model + same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistent model is much faster than running generation with the model you're calling generate from. As such, the assistant model should be much smaller. streamer (`BaseStreamer`, *optional*): @@ -451,7 +463,7 @@ def generate( hpu_graphs (`bool`, *optional*, defaults to `False`): Whether to use HPU graphs for inference. profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profiling. + Number of steps to ignore for profling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. 
kwargs (`Dict[str, Any]`, *optional*): @@ -584,7 +596,8 @@ def generate( assert self.config.model_type in [ "llama", "mistral", - ], "reuse_cache only supported by llama and mistral at the moment" + "falcon", + ], "reuse_cache only supported by llama, mistral and falcon at the moment" if not generation_config.bucket_internal: assert ( generation_config.bucket_size <= 0 @@ -725,6 +738,8 @@ def generate( # determine whether flash attention needs to be used model_kwargs["use_flash_attention"] = generation_config.use_flash_attention model_kwargs["flash_attention_recompute"] = True if generation_config.flash_attention_recompute else False + model_kwargs["flash_attention_causal_mask"] = True if generation_config.flash_attention_causal_mask else False + if not self.config.is_encoder_decoder: calculated_max_length = input_ids.shape[-1] if not generation_config.static_shapes and generation_config.max_new_tokens is not None: @@ -733,14 +748,11 @@ def generate( bs, _ = input_ids.shape if not is_greedy_or_beam_and_bucket: unwrap_deepspeed_model(self).allocate_kv_cache( - bs * generation_config.num_beams, - calculated_max_length, - token_idx, - generation_config.kv_cache_fp8, + bs * generation_config.num_beams, calculated_max_length, token_idx ) model_kwargs["kv_cache_len"] = calculated_max_length - if self.config.model_type in ["llama"]: + if self.config.model_type in ["llama", "falcon"]: if self.config.max_position_embeddings < calculated_max_length: unwrap_deepspeed_model(self).update_sincos_cache(seq_len=calculated_max_length) @@ -1181,7 +1193,7 @@ def contrastive_search( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profiling. + Number of steps to ignore for profling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -1294,7 +1306,7 @@ def greedy_search( ignore_eos (`bool`, *optional*, defaults to `False`): Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profiling. + Number of steps to ignore for profling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -1433,6 +1445,7 @@ def greedy_search( ) # prepare model inputs + model_kwargs["lazy_mode"] = lazy_mode model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) @@ -1643,7 +1656,7 @@ def sample( ignore_eos (`bool`, *optional*, defaults to `False`): Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profiling. + Number of steps to ignore for profling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -1784,6 +1797,7 @@ def sample( break # prepare model inputs + model_kwargs["lazy_mode"] = lazy_mode model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) @@ -1976,7 +1990,7 @@ def beam_search( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). 
profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profiling. + Number of steps to ignore for profling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -2230,6 +2244,7 @@ def expand_if_needed(tensor, new_size, value, dim=-1): params, input_ids, model_kwargs, pad_token_id, bucket_size, reduce_recompile ) + model_kwargs["lazy_mode"] = lazy_mode model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) # if sequential is True, split the input to batches of batch_size and run sequentially @@ -2564,7 +2579,7 @@ def beam_sample( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profiling. + Number of steps to ignore for profling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -2710,7 +2725,7 @@ def group_beam_search( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profiling. + Number of steps to ignore for profling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -2857,7 +2872,7 @@ def constrained_beam_search( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profiling. + Number of steps to ignore for profling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -3010,6 +3025,7 @@ def constrained_beam_search( if this_peer_finished_flag.item() == 0.0: break + model_kwargs["lazy_mode"] = lazy_mode model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) @@ -3209,7 +3225,7 @@ def assisted_decoding( more information, the documentation of [`CandidateGenerator`] should be read. Only one of `assistant_model` or `candidate_generator` should be passed as input to this function. assistant_model (`PreTrainedModel`, *optional*): An assistant model that can be used to accelerate generation. The assistant model must have the exact - same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model + same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistent model is much faster than running generation with the model you're calling generate from. As such, the assistant model should be much smaller. do_sample (`bool`, *optional*, defaults to `False`): @@ -3246,7 +3262,7 @@ def assisted_decoding( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profiling. + Number of steps to ignore for profling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. 
streamer (`BaseStreamer`, *optional*): diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py index 4238710d92f..59f9f9930c0 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py @@ -1,17 +1,3 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - import math import warnings from typing import List, Optional, Tuple, Union @@ -47,9 +33,11 @@ try: from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as FusedRMSNorm + + has_fused_rms_norm = True except ImportError: + has_fused_rms_norm = False print("Not using HPU fused kernel for RMSNorm") - FusedRMSNorm = None try: from habana_frameworks.torch.hpex.kernels import FusedSDPA @@ -57,24 +45,7 @@ print("Not using HPU fused scaled dot-product attention kernel.") FusedSDPA = None - -def update(prev, cur, dim, idx, inp_seq_len): - orig_cur = cur - if prev.dtype == torch.float8_e4m3fn: - from habana_frameworks.torch.hpex.kernels.Fp8Ops import cast_to_fp8_v2 - - cur = cast_to_fp8_v2(cur, None, False, False, prev.dtype)[0] - if cur.shape[2] > 1 and cur.shape[2] <= prev.shape[2]: - # Initialize - prev[:, :, :inp_seq_len, :].copy_(cur) - return orig_cur - assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. prev:{prev.shape} cur:{cur.shape}" - if idx is not None: - prev.index_copy_(dim, idx - 1, cur) - prev_cast = prev.to(orig_cur.dtype) - return prev_cast - else: - return torch.cat((prev, cur), dim=dim) +import habana_frameworks.torch.core as htcore def gaudi_llama_rmsnorm_forward(self, hidden_states): @@ -83,7 +54,7 @@ def gaudi_llama_rmsnorm_forward(self, hidden_states): The only differences are: - override RMSNorm with Habana fused RMSNorm """ - if hidden_states.device.type == "hpu" and FusedRMSNorm: + if hidden_states.device.type == "hpu" and has_fused_rms_norm: # mixed dtypes are not good for FusedRMSNorm, both inputs need to have same dtype if hidden_states.dtype != self.weight.dtype: orig_dtype = hidden_states.dtype @@ -183,11 +154,9 @@ def __init__(self): self.cache = None self.inp_seq_len = -1 - def allocate(self, inp_seq_len, kv_cache_fp8, dtype, device, shape): + def allocate(self, inp_seq_len, dtype, device, shape): if self.cache is None or self.cache.shape != shape: self.inp_seq_len = inp_seq_len - if kv_cache_fp8: - dtype = torch.float8_e4m3fn self.cache = torch.zeros(shape, dtype=dtype, device=device) else: assert ( @@ -195,13 +164,29 @@ def allocate(self, inp_seq_len, kv_cache_fp8, dtype, device, shape): ), f"inp_seq_len must be the same. 
self.inp_seq_len:{self.inp_seq_len} inp_seq_len:{inp_seq_len}" self.cache.fill_(0) + def update(self, prev, cur, dim, idx, inp_seq_len): + orig_cur = cur + if prev.shape == cur.shape: + prev.copy_(cur) + return orig_cur + if cur.shape[2] > 1 and cur.shape[2] <= prev.shape[2]: + # Initialize + prev[:, :, :inp_seq_len, :].copy_(cur) + return orig_cur + assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. prev:{prev.shape} cur:{cur.shape}" + if idx is not None: + prev.index_copy_(dim, idx - 1, cur) + return prev + else: + return torch.cat((prev, cur), dim=dim) + def get_shape(self): if self.cache is None: return None return self.cache.shape def forward(self, cur, dim, idx): - return update(self.cache, cur, dim, idx, self.inp_seq_len) + return self.update(self.cache, cur, dim, idx, self.inp_seq_len) class GaudiLlamaRotaryEmbedding(torch.nn.Module): @@ -285,15 +270,15 @@ def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None): self.inp_seq_len = -1 self.norm_factor = 1.0 / math.sqrt(self.head_dim) - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len, kv_cache_fp8): + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): cache_shape = (batch_size, self.num_key_value_heads, max_seq_len, self.head_dim) device = self.k_proj.weight.device dtype = self.config.torch_dtype - self.k_cache.allocate(inp_seq_len, kv_cache_fp8, dtype, device, cache_shape) - self.v_cache.allocate(inp_seq_len, kv_cache_fp8, dtype, device, cache_shape) + self.k_cache.allocate(inp_seq_len, dtype, device, cache_shape) + self.v_cache.allocate(inp_seq_len, dtype, device, cache_shape) def update_sincos_cache(self, seq_len): - # Call rotary emb forward() to update cos/sin cache when inferring more than self.max_position_embeddings + # Call rotary emb forward() to update cos/sin cache when infering more than self.max_position_embeddings # This helps in avoiding creation of these caches during actual model forward pass and # reduce memory consumption and improve performance. 
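[Editorial sketch, not part of this patch] A tiny self-contained illustration of what the KVCache.update() method added above does, with toy shapes only: during prefill the projected keys/values are copied into the front of the pre-allocated cache, and during decode the single new row is written in place at token_idx - 1, so the cache keeps a static shape that is friendly to HPU graphs.

import torch

# toy cache: (batch=1, kv_heads=1, max_seq_len=8, head_dim=4), pre-allocated like KVCache.allocate()
cache = torch.zeros(1, 1, 8, 4)

# prefill: cur.shape[2] > 1 -> copy into the first inp_seq_len slots, return the fresh states
prompt_kv = torch.randn(1, 1, 5, 4)
cache[:, :, :5, :].copy_(prompt_kv)

# decode step at position 6 (1-based token_idx): cur.shape[2] == 1 -> in-place write at idx - 1
token_idx = torch.tensor([6])
new_kv = torch.randn(1, 1, 1, 4)
cache.index_copy_(2, token_idx - 1, new_kv)   # mirrors prev.index_copy_(dim, idx - 1, cur)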
if seq_len > self.max_position_embeddings: @@ -328,6 +313,7 @@ def pre_attn_forward( reuse_cache: Optional[bool] = False, use_flash_attention: Optional[bool] = False, flash_attention_recompute: Optional[bool] = False, + flash_attention_causal_mask: Optional[bool] = False, cache_idx: int = None, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: @@ -340,6 +326,7 @@ def pre_attn_forward( - add new args reuse_cache - add new args use_flash_attention - add new arg flash_attention_recompute + - add new arg flash_attention_causal_mask """ bsz, q_len, _ = hidden_states.size() @@ -385,14 +372,23 @@ def pre_attn_forward( cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_customized_rope(query_states, key_states, cos, sin, position_ids) - if past_key_value is not None or reuse_cache: + if use_cache: # reuse k, v, self_attention if reuse_cache: key_states = self.k_cache(key_states, 2, token_idx) value_states = self.v_cache(value_states, 2, token_idx) + past_key_value = (self.k_cache.get_shape(), self.v_cache.get_shape()) else: - key_states = update(past_key_value[0], key_states, 2, token_idx, self.inp_seq_len) - value_states = update(past_key_value[1], value_states, 2, token_idx, self.inp_seq_len) + if past_key_value is None: + past_key = torch.zeros(key_states.shape, dtype=self.k_proj.weight.dtype, device=key_states.device) + past_value = torch.zeros( + key_states.shape, dtype=self.k_proj.weight.dtype, device=key_states.device + ) + past_key_value = (past_key, past_value) + key_states = self.k_cache.update(past_key_value[0], key_states, 2, token_idx, self.inp_seq_len) + value_states = self.v_cache.update(past_key_value[1], value_states, 2, token_idx, self.inp_seq_len) + if token_idx is None: + past_key_value = (key_states, value_states) if cache_idx is not None and q_len == 1: key_states = key_states[:, :, :cache_idx, :] @@ -400,12 +396,6 @@ def pre_attn_forward( if attention_mask is not None: attention_mask = attention_mask[:, :, :, :cache_idx] kv_seq_len = key_states.shape[-2] - - if use_cache: - if reuse_cache: - past_key_value = (self.k_cache.get_shape(), self.v_cache.get_shape()) - else: - past_key_value = (key_states.contiguous(), value_states.contiguous()) else: past_key_value = None @@ -420,10 +410,15 @@ def pre_attn_forward( ) else: # first token - with ht.sdp_kernel(enable_recompute=flash_attention_recompute): - attn_output = FusedSDPA.apply( - query_states, key_states, value_states, attention_mask, 0.0, False, None - ) + if flash_attention_causal_mask: + # causal masking on first token requires inputs to be of the same length + with ht.sdp_kernel(enable_recompute=flash_attention_recompute): + attn_output = FusedSDPA.apply(query_states, key_states, value_states, None, 0.0, True, None) + else: + with ht.sdp_kernel(enable_recompute=flash_attention_recompute): + attn_output = FusedSDPA.apply( + query_states, key_states, value_states, attention_mask, 0.0, False, None + ) else: query_states, key_states, value_states, attention_mask = gaudi_llama_repeat_kv( @@ -487,8 +482,8 @@ def __init__(self, config: LlamaConfig, layer_idx: int): self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len, kv_cache_fp8): - self.self_attn.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len, kv_cache_fp8) + def allocate_kv_cache(self, batch_size, 
max_seq_len, inp_seq_len): + self.self_attn.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) def reorder_kv_cache(self, beam_idx: torch.LongTensor): return self.self_attn.reorder_kv_cache(beam_idx) @@ -510,6 +505,7 @@ def forward( reuse_cache: Optional[bool] = False, use_flash_attention: Optional[bool] = False, flash_attention_recompute: Optional[bool] = False, + flash_attention_causal_mask: Optional[bool] = False, cache_idx: int = None, **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: @@ -521,6 +517,7 @@ def forward( - add new args reuse_cache - add new args use_flash_attention - add new arg flash_attention_recompute + - add new arg flash_attention_causal_mask """ if "padding_mask" in kwargs: warnings.warn( @@ -528,7 +525,7 @@ def forward( ) residual = hidden_states - output_pre_attn, self_attn_weights, present_key_value = self.pre_attn( + hidden_states, self_attn_weights, present_key_value = self.pre_attn( hidden_states, attention_mask, position_ids, @@ -541,16 +538,16 @@ def forward( reuse_cache, use_flash_attention=use_flash_attention, flash_attention_recompute=flash_attention_recompute, + flash_attention_causal_mask=flash_attention_causal_mask, cache_idx=cache_idx, **kwargs, ) + self.self_attn.attention_all_reduce(hidden_states) + hidden_states, residual = self.post_attn_pre_mlp(hidden_states, residual) + self.mlp.mlp_all_reduce(hidden_states) + hidden_states = self.post_mlp(hidden_states, residual) - self.self_attn.attention_all_reduce(output_pre_attn) - output_post_attn_pre_mlp, residual_mlp = self.post_attn_pre_mlp(output_pre_attn, residual) - self.mlp.mlp_all_reduce(output_post_attn_pre_mlp) - output_post_mlp = self.post_mlp(output_post_attn_pre_mlp, residual_mlp) - - outputs = (output_post_mlp,) + outputs = (hidden_states,) if output_attentions: outputs += (self_attn_weights,) @@ -573,10 +570,11 @@ def pre_attn( reuse_cache: Optional[bool] = False, use_flash_attention: Optional[bool] = False, flash_attention_recompute: Optional[bool] = False, + flash_attention_causal_mask: Optional[bool] = False, cache_idx: int = None, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: hidden_states = self.input_layernorm(hidden_states) - output_attn, attn_weights, present_key_value = self.self_attn.pre_attn_forward( + hidden_states, attn_weights, present_key_value = self.self_attn.pre_attn_forward( hidden_states, attention_mask, position_ids, @@ -589,25 +587,36 @@ def pre_attn( reuse_cache, use_flash_attention, flash_attention_recompute, + flash_attention_causal_mask, cache_idx=cache_idx, ) - return output_attn, attn_weights, present_key_value + return hidden_states, attn_weights, present_key_value - def post_attn_pre_mlp(self, input, residual): - output_post_attn = self.self_attn.post_attn_forward(input) + def post_attn_pre_mlp(self, hidden_states, residual): + hidden_states = self.self_attn.post_attn_forward(hidden_states) - hidden_states = residual + output_post_attn - residual = hidden_states + if self.training: + hidden_states = hidden_states + residual + residual = hidden_states + else: + residual.add_(hidden_states) + hidden_states = residual hidden_states = self.post_attention_layernorm(hidden_states) hidden_states = self.mlp.pre_mlp_forward(hidden_states) return hidden_states, residual - def post_mlp(self, input, residual): - output_post_mlp = self.mlp.post_mlp_forward(input) - output = output_post_mlp + residual - return output + def post_mlp(self, hidden_states, residual): + hidden_states = 
self.mlp.post_mlp_forward(hidden_states) + + if self.training: + hidden_states = hidden_states + residual + else: + residual.add_(hidden_states) + hidden_states = residual + + return hidden_states class GaudiLlamaModel(LlamaModel): @@ -643,9 +652,9 @@ def __init__(self, config: LlamaConfig): # Initialize weights and apply final processing self.post_init() - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len, kv_cache_fp8): + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): for layer in self.layers: - layer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len, kv_cache_fp8) + layer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) def reorder_kv_cache(self, beam_idx: torch.LongTensor): return tuple(layer.reorder_kv_cache(beam_idx) for layer in self.layers) @@ -671,7 +680,9 @@ def forward( reuse_cache: Optional[bool] = False, use_flash_attention: Optional[bool] = False, flash_attention_recompute: Optional[bool] = False, + flash_attention_causal_mask: Optional[bool] = False, cache_idx: int = None, + lazy_mode: Optional[bool] = True, ) -> Union[Tuple, BaseModelOutputWithPast]: """ Copied from LlamaModel.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py @@ -681,6 +692,8 @@ def forward( - add new args reuse_cache - add new args use_flash_attention - add new arg flash_attention_recompute + - add new arg flash_attention_causal_mask + - add new arg lazy_mode """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -757,7 +770,17 @@ def forward( all_self_attns = () if output_attentions else None next_decoder_cache = () if not use_new_cache else None + if lazy_mode: + htcore.mark_step() + for layer_idx, decoder_layer in enumerate(self.layers): + if ( + lazy_mode + and not self.training + and (torch.distributed.is_initialized() is False or torch.distributed.get_world_size() == 1) + ): + htcore.mark_step() + if output_hidden_states: all_hidden_states += (hidden_states,) @@ -776,6 +799,7 @@ def forward( False, use_flash_attention, flash_attention_recompute, + flash_attention_causal_mask, ) else: layer_outputs = decoder_layer( @@ -791,6 +815,7 @@ def forward( reuse_cache=reuse_cache, use_flash_attention=use_flash_attention, flash_attention_recompute=flash_attention_recompute, + flash_attention_causal_mask=flash_attention_causal_mask, cache_idx=cache_idx, ) hidden_states = layer_outputs[0] @@ -834,9 +859,8 @@ class GaudiLlamaForCausalLM(LlamaForCausalLM): - add new args reuse_cache """ - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len, kv_cache_fp8): - self.model.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len, kv_cache_fp8) - self.kv_cache_len = max_seq_len + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): + self.model.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) def reorder_kv_cache(self, beam_idx: torch.LongTensor): return self.model.reorder_kv_cache(beam_idx) @@ -863,7 +887,9 @@ def forward( reuse_cache: Optional[bool] = False, use_flash_attention: Optional[bool] = False, flash_attention_recompute: Optional[bool] = False, + flash_attention_causal_mask: Optional[bool] = False, cache_idx: int = None, + lazy_mode: Optional[bool] = True, ) -> Union[Tuple, CausalLMOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -891,7 +917,9 @@ def forward( reuse_cache=reuse_cache, 
use_flash_attention=use_flash_attention, flash_attention_recompute=flash_attention_recompute, + flash_attention_causal_mask=flash_attention_causal_mask, cache_idx=cache_idx, + lazy_mode=lazy_mode, ) hidden_states = outputs[0] _, seq_len, _ = hidden_states.shape @@ -1024,7 +1052,9 @@ def prepare_inputs_for_generation( "reuse_cache": reuse_cache, "use_flash_attention": kwargs.get("use_flash_attention"), "flash_attention_recompute": kwargs.get("flash_attention_recompute"), + "flash_attention_causal_mask": kwargs.get("flash_attention_causal_mask"), "cache_idx": kwargs.get("cache_idx"), + "lazy_mode": kwargs.get("lazy_mode"), } ) return model_inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py index f65ab019534..29fac0d171f 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py @@ -138,7 +138,7 @@ def allocate_kv_cache(self, batch_size, seq_len): self.past_value = torch.empty(kv_shape, dtype=dtype, device=device) def update_sincos_cache(self, seq_len): - # Call rotary emb forward() to update cos/sin cache when inferring more than self.max_position_embeddings + # Call rotary emb forward() to update cos/sin cache when infering more than self.max_position_embeddings # This helps in avoiding creation of these caches during actual model forward pass and # reduce memory consumption and improve performance. if seq_len > self.max_position_embeddings: @@ -518,7 +518,7 @@ def forward( class GaudiMistralForCausalLM(MistralForCausalLM): - def allocate_kv_cache(self, batch_size, seq_len, _, __): + def allocate_kv_cache(self, batch_size, seq_len, _): self.model.allocate_kv_cache(batch_size, seq_len) def reorder_kv_cache(self, beam_idx: torch.LongTensor): From 4ad3b047c78b6e00d2fe6816e1249454f5e50420 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Apr 2024 09:14:56 +0000 Subject: [PATCH 06/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../generation/configuration_utils.py | 14 +++ .../modeling_gaudi/generation/utils.py | 86 +++++++++---------- .../modeling/modeling_gaudi/modeling_utils.py | 6 +- .../models/bart/modeling_bart.py | 4 +- .../models/bloom/modeling_bloom.py | 23 ++--- .../models/codegen/modeling_codegen.py | 10 +-- .../models/esm/modeling_esmfold.py | 6 +- .../models/falcon/modeling_falcon.py | 10 +-- .../gpt_bigcode/modeling_gpt_bigcode.py | 10 +-- .../models/gptj/modeling_gptj.py | 10 +-- .../models/llama/modeling_llama.py | 16 +++- .../models/mistral/modeling_mistral.py | 2 +- .../models/mixtral/modeling_mixtral.py | 1 - .../models/modeling_attn_mask_utils.py | 4 +- .../modeling_gaudi/models/t5/modeling_t5.py | 6 +- .../modeling_gaudi/models/vit/modeling_vit.py | 4 +- .../models/wav2vec2/modeling_wav2vec2.py | 2 +- 17 files changed, 113 insertions(+), 101 deletions(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py index 61585b559f9..8130f375fc1 100644 --- 
a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from transformers.generation import GenerationConfig diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py index a4b2f1c6a30..62fed0fbb14 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py @@ -109,8 +109,8 @@ def incrementor(bucket_size, prompt_len): class GaudiGenerationMixin(GenerationMixin): - """ - This class enables to perform fast generation in lazy mode and with HPU graphs. + """This class enables to perform fast generation in lazy mode and with HPU graphs. + The only difference with GenerationMixin is that the various generation methods will generate sequences whose size is max_length. Having constant sizes allows to make the most of lazy mode and HPU graphs. @@ -123,8 +123,7 @@ def _expand_inputs_for_generation( input_ids: Optional[torch.LongTensor] = None, **model_kwargs, ) -> Tuple[torch.LongTensor, Dict[str, Any]]: - """ - Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]. + """Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]. Copied from Transformers: https://github.com/huggingface/transformers/blob/527ab894e59b6582578008e3b47648a65063f73d/src/transformers/generation/utils.py#L704 The tensor `token_idx` is not expanded. @@ -183,7 +182,7 @@ def _prepare_decoder_input_ids_for_generation( device: torch.device = None, max_new_tokens: int = None, ) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]: - """Prepares `decoder_input_ids` for generation with encoder-decoder models""" + """Prepares `decoder_input_ids` for generation with encoder-decoder models.""" # 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming, # we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input. 
@@ -204,7 +203,7 @@ def _prepare_decoder_input_ids_for_generation( if isinstance(decoder_start_token_id, list): if len(decoder_start_token_id) != batch_size: raise ValueError( - f"`decoder_start_token_id` expcted to have length {batch_size} but got {len(decoder_start_token_id)}" + f"`decoder_start_token_id` expected to have length {batch_size} but got {len(decoder_start_token_id)}" ) decoder_input_ids_start = torch.tensor(decoder_start_token_id, dtype=torch.long, device=device) decoder_input_ids_start = decoder_input_ids_start.view(-1, 1) @@ -321,7 +320,7 @@ def update_model_kwargs_for_bucketing( model_kwargs["attention_mask"], (0, pad_amount), value=0 ) else: - assert False, "Not tested for cases where attn_mask isnt passed" + assert False, "Not tested for cases where attn_mask isn't passed" if reduce_recompile and params["passnum"] == 0: position_ids_cpu = model_kwargs["attention_mask"].long().cumsum(-1) - 1 position_ids_cpu.masked_fill_(model_kwargs["attention_mask"] == 0, 1) @@ -395,8 +394,7 @@ def generate( profiling_steps: Optional[int] = 0, **kwargs, ) -> Union[GenerateOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head. + r"""Generates sequences of token ids for models with a language modeling head. @@ -447,7 +445,7 @@ def generate( generating before other GPUs. Otherwise it'll be set to `False`. assistant_model (`PreTrainedModel`, *optional*): An assistant model that can be used to accelerate generation. The assistant model must have the exact - same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistent model + same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model is much faster than running generation with the model you're calling generate from. As such, the assistant model should be much smaller. streamer (`BaseStreamer`, *optional*): @@ -463,7 +461,7 @@ def generate( hpu_graphs (`bool`, *optional*, defaults to `False`): Whether to use HPU graphs for inference. profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. kwargs (`Dict[str, Any]`, *optional*): @@ -1139,8 +1137,7 @@ def contrastive_search( profiling_steps: Optional[int] = 0, **model_kwargs, ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **contrastive search** and can + r"""Generates sequences of token ids for models with a language modeling head using **contrastive search** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -1193,7 +1190,7 @@ def contrastive_search( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -1229,7 +1226,8 @@ def contrastive_search( ... ) >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) ['DeepMind Company is a company that focuses on the development and commercialization of artificial intelligence (AI). 
DeepMind’s mission is to help people understand and solve problems that are difficult to solve in the world today.\n\nIn this post, we talk about the benefits of deep learning in business and how it'] - ```""" + ``` + """ raise NotImplementedError("Contrastive search is not supported by optimum-habana yet.") @@ -1254,8 +1252,7 @@ def greedy_search( profiling_steps: Optional[int] = 0, **model_kwargs, ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be + r"""Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -1306,7 +1303,7 @@ def greedy_search( ignore_eos (`bool`, *optional*, defaults to `False`): Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -1355,7 +1352,8 @@ def greedy_search( >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) ["It might be possible to get a better understanding of the nature of the problem, but it's not"] - ```""" + ``` + """ # init values logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() @@ -1601,8 +1599,7 @@ def sample( profiling_steps: Optional[int] = 0, **model_kwargs, ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and + r"""Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -1656,7 +1653,7 @@ def sample( ignore_eos (`bool`, *optional*, defaults to `False`): Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -1721,7 +1718,8 @@ def sample( >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) ['Today is a beautiful day, and we must do everything possible to make it a day of celebration.'] - ```""" + ``` + """ # init values logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() @@ -1937,8 +1935,7 @@ def beam_search( profiling_steps: Optional[int] = 0, **model_kwargs, ) -> Union[GenerateBeamOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **beam search decoding** and + r"""Generates sequences of token ids for models with a language modeling head using **beam search decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -1990,7 +1987,7 @@ def beam_search( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). 
profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -2053,7 +2050,8 @@ def beam_search( >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) ['Wie alt bist du?'] - ```""" + ``` + """ # init values logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() @@ -2526,8 +2524,7 @@ def beam_sample( profiling_steps: Optional[int] = 0, **model_kwargs, ) -> Union[GenerateBeamOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **beam search multinomial + r"""Generates sequences of token ids for models with a language modeling head using **beam search multinomial sampling** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -2579,7 +2576,7 @@ def beam_sample( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -2652,7 +2649,8 @@ def beam_sample( >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) ['Wie alt bist du?'] - ```""" + ``` + """ raise NotImplementedError("Beam search sampling is not supported by optimum-habana yet.") @@ -2676,8 +2674,7 @@ def group_beam_search( profiling_steps: Optional[int] = 0, **model_kwargs, ): - r""" - Generates sequences of token ids for models with a language modeling head using **diverse beam search + r"""Generates sequences of token ids for models with a language modeling head using **diverse beam search decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -2725,7 +2722,7 @@ def group_beam_search( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -2794,7 +2791,8 @@ def group_beam_search( >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) ['Wie alt bist du?'] - ```""" + ``` + """ raise NotImplementedError("Group beam search is not supported by optimum-habana yet.") @@ -2818,8 +2816,7 @@ def constrained_beam_search( profiling_steps: Optional[int] = 0, **model_kwargs, ) -> Union[GenerateBeamOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **constrained beam search + r"""Generates sequences of token ids for models with a language modeling head using **constrained beam search decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -2872,7 +2869,7 @@ def constrained_beam_search( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). 
profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. model_kwargs: @@ -2940,7 +2937,8 @@ def constrained_beam_search( >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) ['Wie alt sind Sie?'] - ```""" + ``` + """ # init values logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() @@ -3203,8 +3201,7 @@ def assisted_decoding( streamer: Optional["BaseStreamer"] = None, **model_kwargs, ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **greedy decoding** or + r"""Generates sequences of token ids for models with a language modeling head using **greedy decoding** or **sample** (depending on `do_sample`), assisted by candidate sequences. Assisted generation is an example of a candidate decoding strategy. Can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -3225,7 +3222,7 @@ def assisted_decoding( more information, the documentation of [`CandidateGenerator`] should be read. Only one of `assistant_model` or `candidate_generator` should be passed as input to this function. assistant_model (`PreTrainedModel`, *optional*): An assistant model that can be used to accelerate generation. The assistant model must have the exact - same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistent model + same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model is much faster than running generation with the model you're calling generate from. As such, the assistant model should be much smaller. do_sample (`bool`, *optional*, defaults to `False`): @@ -3262,7 +3259,7 @@ def assisted_decoding( lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. + Number of steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. streamer (`BaseStreamer`, *optional*): @@ -3313,5 +3310,6 @@ def assisted_decoding( ... ) >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) ["It might be possible to get a better understanding of the nature of the problem, but it's not"] - ```""" + ``` + """ raise NotImplementedError("Assisted decoding is not supported by optimum-habana yet.") diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/modeling_utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/modeling_utils.py index 9d4e473aab3..0561a428334 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/modeling_utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/modeling_utils.py @@ -139,10 +139,8 @@ def adapt_transformers_to_gaudi(): - """ - Replaces some Transformers' methods for equivalent methods optimized - for Gaudi. 
- """ + """Replaces some Transformers' methods for equivalent methods optimized + for Gaudi.""" # optimize Conv1D transformers.pytorch_utils.Conv1D.forward = gaudi_conv1d_forward diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/modeling_bart.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/modeling_bart.py index 4c66c2cc954..cab69760e15 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/modeling_bart.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/modeling_bart.py @@ -48,9 +48,7 @@ class gaudi_BartLearnedPositionalEmbedding(nn.Embedding): - """ - This module learns positional embeddings up to a fixed maximum size. - """ + """This module learns positional embeddings up to a fixed maximum size.""" def __init__(self, num_embeddings: int, embedding_dim: int): # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py index bfc0ba2408d..bc958571d64 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py @@ -281,10 +281,8 @@ def gaudi_bloom_block_forward( def gaudi_bloom_convert_to_standard_cache( self, past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]], batch_size: int, training: bool ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: - """ - Standardizes the format of the cache so as to match most implementations, i.e. to tuple(tuple([batch_size, - num_heads, ...])) - """ + """Standardizes the format of the cache so as to match most implementations, i.e. to tuple(tuple([batch_size, + num_heads, ...]))""" batch_size_times_num_heads, head_dim, seq_length = past_key_value[0][0].shape if training: num_heads = batch_size_times_num_heads // batch_size @@ -307,9 +305,7 @@ def gaudi_bloom_convert_to_standard_cache( def gaudi_bloom_convert_to_bloom_cache( self, past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]] ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: - """ - Converts the cache to the format expected by Bloom, i.e. to tuple(tuple([batch_size * num_heads, ...])) - """ + """Converts the cache to the format expected by Bloom, i.e. to tuple(tuple([batch_size * num_heads, ...]))""" batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape batch_size_times_num_heads = batch_size * num_heads # key: [batch_size, num_heads, head_dim, seq_length] -> [batch_size * num_heads, head_dim, seq_length] @@ -526,11 +522,11 @@ def forward( token_idx: Optional[torch.Tensor] = None, **deprecated_arguments, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set - `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` - are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + r"""Labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + + Labels for language modeling. 
Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ if deprecated_arguments.pop("position_ids", False) is not False: # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` @@ -589,8 +585,7 @@ def forward( def _reorder_cache( self, past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: - """ - This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + """This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct beam_idx at every generation step. diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/modeling_codegen.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/modeling_codegen.py index 0bf12e0676b..23fbb596890 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/modeling_codegen.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/modeling_codegen.py @@ -379,11 +379,11 @@ def forward( return_dict: Optional[bool] = None, token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set - `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` - are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + r"""Labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/modeling_esmfold.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/modeling_esmfold.py index 88b6dac0d24..b68feacf5ec 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/modeling_esmfold.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/modeling_esmfold.py @@ -239,8 +239,7 @@ def gaudi_esm_for_protein_folding_forward( def gaudi_rot_vec_mul(r: torch.Tensor, t: torch.Tensor) -> torch.Tensor: - """ - Applies a rotation to a vector. Written out by hand to avoid transfer to avoid AMP downcasting. + """Applies a rotation to a vector. Written out by hand to avoid transfer to avoid AMP downcasting. 
Args: r: [*, 3, 3] rotation matrices @@ -296,8 +295,7 @@ def gaudi_rot_vec_mul(r: torch.Tensor, t: torch.Tensor) -> torch.Tensor: def gaudi_rot_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: - """ - Performs matrix multiplication of two rotation matrix tensors. Written out by hand to avoid AMP downcasting. + """Performs matrix multiplication of two rotation matrix tensors. Written out by hand to avoid AMP downcasting. Args: a: [*, 3, 3] left multiplicand diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py index d88fc879685..89f92795f72 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py @@ -643,11 +643,11 @@ def forward( return_dict: Optional[bool] = None, token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set - `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` - are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + r"""Labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py index 82957d1c324..b7874bf8ca6 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -457,11 +457,11 @@ def forward( return_dict: Optional[bool] = None, token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: - r""" - labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set - `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` - are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + r"""Labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py index fcae80d4eaf..a4e279766b1 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py @@ -484,11 +484,11 @@ def forward( return_dict: Optional[bool] = None, token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set - `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` - are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + r"""Labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py index 59f9f9930c0..61fb4858baa 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math import warnings from typing import List, Optional, Tuple, Union @@ -278,7 +292,7 @@ def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): self.v_cache.allocate(inp_seq_len, dtype, device, cache_shape) def update_sincos_cache(self, seq_len): - # Call rotary emb forward() to update cos/sin cache when infering more than self.max_position_embeddings + # Call rotary emb forward() to update cos/sin cache when inferring more than self.max_position_embeddings # This helps in avoiding creation of these caches during actual model forward pass and # reduce memory consumption and improve performance. 
if seq_len > self.max_position_embeddings: diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py index 29fac0d171f..b7e0932b5a9 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py @@ -138,7 +138,7 @@ def allocate_kv_cache(self, batch_size, seq_len): self.past_value = torch.empty(kv_shape, dtype=dtype, device=device) def update_sincos_cache(self, seq_len): - # Call rotary emb forward() to update cos/sin cache when infering more than self.max_position_embeddings + # Call rotary emb forward() to update cos/sin cache when inferring more than self.max_position_embeddings # This helps in avoiding creation of these caches during actual model forward pass and # reduce memory consumption and improve performance. if seq_len > self.max_position_embeddings: diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py index c6f5f51ab7c..9e56ed4de8a 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py @@ -17,7 +17,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """PyTorch Mixtral model.""" import math diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_attn_mask_utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_attn_mask_utils.py index 4fe62170997..fe776330131 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_attn_mask_utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_attn_mask_utils.py @@ -35,9 +35,7 @@ def _make_causal_mask( past_key_values_length: int = 0, sliding_window: Optional[int] = None, ): - """ - Make causal mask used for bi-directional self-attention. 
- """ + """Make causal mask used for bi-directional self-attention.""" bsz, tgt_len = input_ids_shape mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) mask_cond = torch.arange(mask.size(-1), device=device) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py index 189317f80d2..fddc9580cce 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py @@ -93,15 +93,15 @@ def gaudi_T5Attention_forward( key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] def shape(states): - """projection""" + """projection.""" return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) def unshape(states): - """reshape""" + """reshape.""" return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) def project(hidden_states, proj_layer, key_value_states, past_key_value): - """projects hidden states correctly to key/query states""" + """Projects hidden states correctly to key/query states.""" if key_value_states is None: # self-attn # (batch_size, n_heads, seq_length, dim_per_head) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/vit/modeling_vit.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/vit/modeling_vit.py index 4fd5990e14e..a0615687b19 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/vit/modeling_vit.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/vit/modeling_vit.py @@ -23,8 +23,8 @@ def gaudi_vit_self_attention_forward( self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - """ - Same method as transformers.models.vit.modeling_vit.ViTSelfAttention.forward with a small tweak: + """Same method as transformers.models.vit.modeling_vit.ViTSelfAttention.forward with a small tweak: + the division is performed before the matmul for computing attention scores. This gives better performance on HPU. 
""" diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py index 983c5b5375b..5c05d65065a 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py @@ -50,7 +50,7 @@ def _gaudi_wav2vec2_compute_mask_indices( epsilon = torch.rand([], device="hpu") def compute_num_masked_span(input_length): - """Given input length, compute how many spans should be masked""" + """Given input length, compute how many spans should be masked.""" num_masked_span = int(mask_prob * input_length / mask_length + epsilon) num_masked_span = max(num_masked_span, min_masks) From 63f0fc00a284c72d34999b3c943602ec758c1549 Mon Sep 17 00:00:00 2001 From: "Meng, Hengyu" Date: Wed, 24 Apr 2024 16:23:29 +0800 Subject: [PATCH 07/25] Create requirements.txt Signed-off-by: Meng, Hengyu --- examples/habana/requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 examples/habana/requirements.txt diff --git a/examples/habana/requirements.txt b/examples/habana/requirements.txt new file mode 100644 index 00000000000..148bf03f02d --- /dev/null +++ b/examples/habana/requirements.txt @@ -0,0 +1 @@ +optimum_habana From 1da9dfbb79fb91775190d34ddda0e07ca39342f4 Mon Sep 17 00:00:00 2001 From: Chen Xi Date: Thu, 25 Apr 2024 03:33:28 +0000 Subject: [PATCH 08/25] add ppl measurement in gaudi Signed-off-by: Chen Xi --- examples/habana/run_generation.py | 118 +++++++++++++++++++++++------- examples/habana/utils.py | 57 +++++++++++++++ 2 files changed, 147 insertions(+), 28 deletions(-) diff --git a/examples/habana/run_generation.py b/examples/habana/run_generation.py index 5f4f3143e2e..42b42be2ae2 100644 --- a/examples/habana/run_generation.py +++ b/examples/habana/run_generation.py @@ -12,8 +12,9 @@ from pathlib import Path import torch +import datasets from utils import adjust_batch, count_hpu_graphs, initialize_model -from utils import print_memory_stats +from utils import print_memory_stats, compute_perplexity logging.basicConfig( format="%(asctime)s - %(levelname)s - %(message)s", @@ -26,6 +27,13 @@ def setup_parser(parser): # Arguments management parser.add_argument("--device", "-d", type=str, choices=["hpu"], help="Device to run", default="hpu") + parser.add_argument( + "--dataset_name", + type=str, choices=["tau/scrolls", "hoskinson-center/proof-pile", "pg19"], + help="Dataset to use", default=None) + parser.add_argument( + "--tokenized", + type=str, help="Dataset has been tokenized", default=None) parser.add_argument( "--model_name_or_path", default=None, @@ -138,32 +146,82 @@ def main(): import habana_frameworks.torch.hpu as torch_hpu # Benchmark over the prompts below - input_sentences = [p * args.size for p in args.prompt] - # (TODO) if we want to test multi-batch use this code - # input_sentences = [ - # "DeepSpeed is a machine learning framework", - # "He is working on", - # "He has a", - # "He got all", - # "Everyone is happy and I can", - # "The new movie that got Oscar this year", - # "In the far far distance from our galaxy,", - # "Peace is the only way", - # ] - - if args.batch_size > len(input_sentences): - # Dynamically extends to support larger batch sizes - num_sentences_to_add = args.batch_size - len(input_sentences) - for i in range(num_sentences_to_add): - 
input_sentences.append(input_sentences[i % len(input_sentences)]) - elif args.batch_size < len(input_sentences): - input_sentences = input_sentences[: args.batch_size] + if args.dataset_name: + if args.tokenized: + input_sentences = datasets.load_from_disk(args.tokenized) + else: + input_sentences = datasets.load_dataset(args.dataset_name, split='test') + tokenizer.pad_token = tokenizer.eos_token + feature = 'text' if column_name is None else column_name + def tokenize(example): + tokenized = tokenizer( + example[feature], + add_special_tokens=False, + padding=True, + truncation=False, + max_length=args.max_input_tokens, + return_attention_mask=True, + ) + example["input_ids"] = tokenized["input_ids"] + example["attention_mask"] = tokenized["attention_mask"] + example["tokenized_len"] = len(tokenized["input_ids"]) + return example + + input_sentences = input_sentences.map(tokenize) + # TODO you can save the tokenized results, this will save a lot of time + # inputs.save_to_disk("tokenized_path/token_1k") + + input_sentences = input_sentences.filter(lambda x: x["tokenized_len"] >= args.size * 1024) + input_sentences = input_sentences.filter(lambda x: x["tokenized_len"] <= (args.size + 2) * 1024) + + else: + input_sentences = [p * args.size for p in args.prompt] + # (TODO) if we want to test multi-batch use this code + # input_sentences = [ + # "DeepSpeed is a machine learning framework", + # "He is working on", + # "He has a", + # "He got all", + # "Everyone is happy and I can", + # "The new movie that got Oscar this year", + # "In the far far distance from our galaxy,", + # "Peace is the only way", + # ] + + if args.batch_size > len(input_sentences): + # Dynamically extends to support larger batch sizes + num_sentences_to_add = args.batch_size - len(input_sentences) + for i in range(num_sentences_to_add): + input_sentences.append(input_sentences[i % len(input_sentences)]) + elif args.batch_size < len(input_sentences): + input_sentences = input_sentences[: args.batch_size] + + def compute_ppl(inputs): + results = [] + result = [] + has_bos_token = tokenizer.bos_token is not None + samples_numbers = [10, 20, 50] + sliding_windows = [256, 512, 1024] + for samples_num in samples_numbers: + for sliding_window in sliding_windows: + ppl = compute_perplexity( + model, tokenizer, inputs, samples_num, + add_start_token=has_bos_token, max_length=args.size*1024, + sliding_window=sliding_window, truncate=True) + print("PPL result is {}".format(ppl)) + result.append(ppl) + + result.insert(0, model) + results.append(result) + return results def generate(inputs, size=None): """Generates sequences from the input sentences and returns them.""" - # Tokenization - if args.max_input_tokens > 0: + if args.dataset_name: + input_tokens = {'input_ids': torch.tensor([inputs[0]['input_ids']]), + 'attention_mask': torch.tensor([inputs[0]['attention_mask']])} + elif args.max_input_tokens > 0: input_tokens = tokenizer.batch_encode_plus( inputs, return_tensors="pt", @@ -180,7 +238,6 @@ def generate(inputs, size=None): for t in input_tokens: if torch.is_tensor(input_tokens[t]): input_tokens[t] = input_tokens[t].to(args.device) - outputs = model.generate( **input_tokens, generation_config=generation_config, @@ -200,7 +257,7 @@ def generate(inputs, size=None): t0 = time.perf_counter() # The first three iterations take longer because of graph compilation for _ in range(args.warmup): - generate(input_sentences, None) + warm_generated = generate(input_sentences, None) torch_hpu.synchronize() # print_memory_stats() 
compilation_duration = time.perf_counter() - t0 @@ -210,19 +267,24 @@ def generate(inputs, size=None): logger.info("Running generate...") t0 = time.perf_counter() print(f"Graph compilation duration = {compilation_duration} seconds") - generated = generate(input_sentences, None) + if args.dataset_name: + ppl_results = compute_ppl(input_sentences) + else: + generated = generate(input_sentences, None) duration = time.perf_counter() - t0 total_new_tokens_generated = args.batch_size * args.max_new_tokens throughput = total_new_tokens_generated / duration # (TODO) only open this when to check the accuracy of the output - # for i, input_sentence in enumerate(zip(input_sentences)): - # print(f"input {i+1}: {input_sentence}\noutput {i+1}: {generated[i]}") + # if not args.dataset_name: + # for i, input_sentence in enumerate(zip(input_sentences)): + # print(f"input {i+1}: {input_sentence}\noutput {i+1}: {generated[i]}") stats = f"Throughput (including tokenization) = {throughput} tokens/second" stats = stats + f"\nNumber of HPU graphs = {count_hpu_graphs()}" separator = "-" * 90 print(separator) + print("".format(duration)) print("The input token size is {}K ".format(args.size)) print(stats) print_memory_stats() diff --git a/examples/habana/utils.py b/examples/habana/utils.py index 9b1482d7346..3e5926411ac 100644 --- a/examples/habana/utils.py +++ b/examples/habana/utils.py @@ -1,6 +1,7 @@ import copy import glob import os +import sys import shutil import tempfile import time @@ -21,6 +22,62 @@ from optimum.habana.utils import check_optimum_habana_min_version from optimum.habana.utils import set_seed +"Compute 'sliding window' perplexity on a dataset. Validated against the calculations reported in arXiv 2306.15595" +def compute_perplexity(model, tokenizer, inputs, samples_num=None, add_start_token=True, max_length=None, sliding_window=256, truncate=False): + + if samples_num: + encodings = inputs[: samples_num] + + device='hpu' + max_tokenized_len = max_length - 1 if add_start_token else max_length + + encoded_texts = encodings["input_ids"] + attn_masks = encodings["attention_mask"] + + if max_length and truncate: + encoded_texts = [x[0:max_tokenized_len] for x in encoded_texts] + attn_masks = [x[0:max_tokenized_len] for x in attn_masks] + # sliding_window = max_tokenized_len + + nlls = [] + t_ppl = time.perf_counter() + for encoding_index in range(0, len(encoded_texts)): + labels = torch.tensor(encoded_texts[encoding_index:encoding_index+1]) + seq_len = labels.size(1) + + prev_end_loc = 0 + for begin_loc in range(0, seq_len, sliding_window): + + end_loc = min(begin_loc + max_tokenized_len, seq_len) + trg_len = end_loc - prev_end_loc + input_ids = labels[:, begin_loc:end_loc].to(device) + + if add_start_token: + bos_tokens_tensor = torch.tensor( + [[tokenizer.bos_token_id]] * input_ids.size(dim=0)).to(device) + input_ids = torch.cat( + [bos_tokens_tensor, input_ids], dim=1) + + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids, labels=target_ids) + neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + ppl = float(torch.exp(torch.stack(nlls).mean()).float().cpu()) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + ppl = float(torch.exp(torch.stack(nlls).mean()).float().cpu()) + ppl_duration = time.perf_counter() - t_ppl + return {'max_length': max_length, 'ppl': ppl, 'duration': ppl_duration, 'samples_num': samples_num, 'sliding_window': sliding_window} + + def print_memory_stats(p_info=""): from 
optimum.habana.utils import get_hpu_memory_stats separator = "-" * 90 From fb2f7ccb815fc43046f3a1c7193a5484c771d17c Mon Sep 17 00:00:00 2001 From: Chen Xi Date: Fri, 26 Apr 2024 02:00:24 +0000 Subject: [PATCH 09/25] fix the ppl acc issue Signed-off-by: Chen Xi --- examples/habana/run_generation.py | 36 +++++++++++------- examples/habana/run_llama.sh | 17 +++++---- examples/habana/run_measure.sh | 14 +++++++ examples/habana/run_ppl.sh | 23 +++++++++++ examples/habana/utils.py | 63 ++++++------------------------- 5 files changed, 80 insertions(+), 73 deletions(-) create mode 100644 examples/habana/run_measure.sh create mode 100644 examples/habana/run_ppl.sh diff --git a/examples/habana/run_generation.py b/examples/habana/run_generation.py index 42b42be2ae2..9e541719ca7 100644 --- a/examples/habana/run_generation.py +++ b/examples/habana/run_generation.py @@ -7,6 +7,7 @@ import logging import math import os +import sys import time from itertools import cycle from pathlib import Path @@ -34,6 +35,9 @@ def setup_parser(parser): parser.add_argument( "--tokenized", type=str, help="Dataset has been tokenized", default=None) + parser.add_argument( + "--save_tokenized", + type=str, help="Path to save tokenized file", default=None) parser.add_argument( "--model_name_or_path", default=None, @@ -47,7 +51,7 @@ def setup_parser(parser): help="Whether to perform generation in bf16 precision.", ) parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.") - parser.add_argument("--size", type=int, default=19, help="Enlarge the input prompt") + parser.add_argument("--size", type=int, default=1, help="Enlarge the input prompt") parser.add_argument( "--max_input_tokens", type=int, @@ -137,7 +141,7 @@ def setup_parser(parser): def main(): parser = argparse.ArgumentParser() args = setup_parser(parser) - model, tokenizer, generation_config = initialize_model(args, logger) + model, tokenizer, generation_config = initialize_model(args) # print_memory_stats() use_lazy_mode = True if args.torch_compile and model.config.model_type == "llama": @@ -152,14 +156,14 @@ def main(): else: input_sentences = datasets.load_dataset(args.dataset_name, split='test') tokenizer.pad_token = tokenizer.eos_token - feature = 'text' if column_name is None else column_name + feature = 'text' def tokenize(example): tokenized = tokenizer( example[feature], add_special_tokens=False, padding=True, truncation=False, - max_length=args.max_input_tokens, + max_length=sys.maxsize, return_attention_mask=True, ) example["input_ids"] = tokenized["input_ids"] @@ -168,11 +172,14 @@ def tokenize(example): return example input_sentences = input_sentences.map(tokenize) - # TODO you can save the tokenized results, this will save a lot of time - # inputs.save_to_disk("tokenized_path/token_1k") input_sentences = input_sentences.filter(lambda x: x["tokenized_len"] >= args.size * 1024) input_sentences = input_sentences.filter(lambda x: x["tokenized_len"] <= (args.size + 2) * 1024) + # TODO you can save the tokenized results, this will save a lot of time + if args.save_tokenized: + input_sentences.save_to_disk(args.save_tokenized) + print("Token has been save to {}".format(args.save_tokenized)) + return else: input_sentences = [p * args.size for p in args.prompt] @@ -200,16 +207,17 @@ def compute_ppl(inputs): results = [] result = [] has_bos_token = tokenizer.bos_token is not None - samples_numbers = [10, 20, 50] + # samples_number will influence the ppl + samples_numbers = [50] sliding_windows = [256, 512, 1024] for 
samples_num in samples_numbers: - for sliding_window in sliding_windows: - ppl = compute_perplexity( - model, tokenizer, inputs, samples_num, - add_start_token=has_bos_token, max_length=args.size*1024, - sliding_window=sliding_window, truncate=True) - print("PPL result is {}".format(ppl)) - result.append(ppl) + # for sliding_window in sliding_windows: + ppl = compute_perplexity( + model, tokenizer, inputs, samples_num, + add_start_token=has_bos_token, max_length=args.size*1024, + sliding_window=512, truncate=True) + print("PPL result is {}".format(ppl)) + result.append(ppl) result.insert(0, model) results.append(result) diff --git a/examples/habana/run_llama.sh b/examples/habana/run_llama.sh index 90a29747915..cc92523798c 100644 --- a/examples/habana/run_llama.sh +++ b/examples/habana/run_llama.sh @@ -1,17 +1,18 @@ -for i in {1..1..2} +for i in {10..10..2} do python run_generation.py \ --use_hpu_graphs \ --use_kv_cache \ - --max_new_tokens 32 \ + --limit_hpu_graphs \ --size $i \ - --max_input_tokens -1 \ - --model_name_or_path /chenxi/models--01-ai--Yi-34B/snapshots/533e00ce927b9e5711445a991284671ac61c6834 \ - --fp8 \ --batch_size 1 \ - --prompt "how are you ?" \ - # --prompt "It is done, and submitted. You can play 'Survival of the Tastiest' on the Android, and on the web. Playing on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. There is a lot I'd like to talk about. I will go through every topic, instead of making the typical what went right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of enemies/actors, simple graphics, maybe set in the space, controlled from a top-down view. I was confident that I could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that rubbed me the right way. Hence, my biggest dilemma when deciding what to create was not with what I wanted to create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call it evolution. This is a problem, of course, every other contestant also had to face it. And judging by the entries submitted, not many managed to work around it. I'd say the only real solution was through the use of artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. Alas, this is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed myself to pick whatever I thought would work out. My initial idea was to create something where humanity tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. 
I kind of had this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey (who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, each with their own plate. Your plate can spawn little pieces of pasta. You do so by 'ordering' them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', which are debited from your credits (you start with a number of credits). Once spawned, your pastas start flying around. Their instinct is to fly to other plates, in order to conquer them (the objective of the game is having your pasta conquer all the plates on the table). But they are really autonomous, so after being spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other pastas your own pasta kill." \ - # --model_name_or_path /chenxi/models--meta-llama--Llama-2-7b-hf/snapshots/8a0442e81540efaeb1a0fe3e95477b5e0edfd423 \ + --trim_logits \ + --model_name_or_path /chenxi/models--01-ai--Yi-34B/snapshots/f9cec17e8fcc054d6c8d98fd5a41ed14895caa8b \ + --max_input_tokens -1 \ + --fp8 \ + --bf16 \ + --prompt "It is done, and submitted. You can play 'Survival of the Tastiest' on the Android, and on the web. Playing on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. There is a lot I'd like to talk about. I will go through every topic, instead of making the typical what went right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of enemies/actors, simple graphics, maybe set in the space, controlled from a top-down view. I was confident that I could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. That control goes against what evolution is supposed to be like. 
If you allow the user to pick how to evolve something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that rubbed me the right way. Hence, my biggest dilemma when deciding what to create was not with what I wanted to create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call it evolution. This is a problem, of course, every other contestant also had to face it. And judging by the entries submitted, not many managed to work around it. I'd say the only real solution was through the use of artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. Alas, this is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed myself to pick whatever I thought would work out. My initial idea was to create something where humanity tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. I kind of had this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey (who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, each with their own plate. Your plate can spawn little pieces of pasta. You do so by 'ordering' them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', which are debited from your credits (you start with a number of credits). Once spawned, your pastas start flying around. Their instinct is to fly to other plates, in order to conquer them (the objective of the game is having your pasta conquer all the plates on the table). But they are really autonomous, so after being spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other pastas your own pasta kill." \ + # --prompt "how are you ?" \ sleep 1 done echo "Test Done...." 
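Editor's note on the `--size` knob swept by run_llama.sh above: a minimal sketch, assuming the semantics shown in run_generation.py (a plain `--prompt` is repeated `size` times, and with `--dataset_name` the samples are kept only if their tokenized length lands in roughly the `size`-K token bucket). The helper names below are illustrative, not part of the patch.

```python
# Illustrative sketch only (names are not from the patch): how --size scales
# the benchmark input in run_generation.py.

def enlarge_prompts(prompts, size):
    """Repeat each prompt string `size` times, mirroring `[p * args.size for p in args.prompt]`."""
    return [p * size for p in prompts]

def in_size_bucket(tokenized_len, size):
    """Keep dataset samples whose token count is roughly size K, mirroring the two filters."""
    return size * 1024 <= tokenized_len <= (size + 2) * 1024

if __name__ == "__main__":
    prompts = enlarge_prompts(["how are you ? "], size=10)
    print(len(prompts[0]))                 # 10x the original prompt length
    print(in_size_bucket(10500, size=10))  # True: falls in the ~10K-token bucket
```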
diff --git a/examples/habana/run_measure.sh b/examples/habana/run_measure.sh new file mode 100644 index 00000000000..b585931112e --- /dev/null +++ b/examples/habana/run_measure.sh @@ -0,0 +1,14 @@ +for i in {1..1..2} +do + python run_generation.py \ + --use_hpu_graphs \ + --use_kv_cache \ + --size $i \ + --trim_logits \ + --batch_size 1 \ + --bf16 \ + --model_name_or_path /chenxi/models--01-ai--Yi-34B/snapshots/f9cec17e8fcc054d6c8d98fd5a41ed14895caa8b \ + --prompt "It is done, and submitted. You can play 'Survival of the Tastiest' on the Android, and on the web. Playing on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. There is a lot I'd like to talk about. I will go through every topic, instead of making the typical what went right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of enemies/actors, simple graphics, maybe set in the space, controlled from a top-down view. I was confident that I could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that rubbed me the right way. Hence, my biggest dilemma when deciding what to create was not with what I wanted to create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call it evolution. This is a problem, of course, every other contestant also had to face it. And judging by the entries submitted, not many managed to work around it. I'd say the only real solution was through the use of artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. Alas, this is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed myself to pick whatever I thought would work out. My initial idea was to create something where humanity tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. I kind of had this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it sounded like it would work, so I decided to go with it. 
Conversations with my inspiring co-worker Roushey (who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, each with their own plate. Your plate can spawn little pieces of pasta. You do so by 'ordering' them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', which are debited from your credits (you start with a number of credits). Once spawned, your pastas start flying around. Their instinct is to fly to other plates, in order to conquer them (the objective of the game is having your pasta conquer all the plates on the table). But they are really autonomous, so after being spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other pastas your own pasta kill." \ + sleep 1 +done +echo "Test Done...." diff --git a/examples/habana/run_ppl.sh b/examples/habana/run_ppl.sh new file mode 100644 index 00000000000..f3213c639aa --- /dev/null +++ b/examples/habana/run_ppl.sh @@ -0,0 +1,23 @@ +for i in {2..2..2} +do + python run_generation.py \ + --use_hpu_graphs \ + --use_kv_cache \ + --limit_hpu_graphs \ + --size $i \ + --batch_size 1 \ + --trim_logits \ + --max_input_tokens -1 \ + --fp8 \ + --bf16 \ + --model_name_or_path /chenxi/models--01-ai--Yi-34B/snapshots/f9cec17e8fcc054d6c8d98fd5a41ed14895caa8b \ + --dataset_name "hoskinson-center/proof-pile" \ + --tokenized /chenxi/itrex/examples/habana/token_1k_20k \ + --prompt "how are you ?" \ + # --save_tokenized /chenxi/itrex/examples/habana/token_1k_20k \ + # --fp8 \ + # --prompt "It is done, and submitted. You can play 'Survival of the Tastiest' on the Android, and on the web. Playing on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. There is a lot I'd like to talk about. I will go through every topic, instead of making the typical what went right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of enemies/actors, simple graphics, maybe set in the space, controlled from a top-down view. I was confident that I could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by creationists to combat the idea of evolution. 
Being agnostic and a Pastafarian, that's not something that rubbed me the right way. Hence, my biggest dilemma when deciding what to create was not with what I wanted to create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call it evolution. This is a problem, of course, every other contestant also had to face it. And judging by the entries submitted, not many managed to work around it. I'd say the only real solution was through the use of artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. Alas, this is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed myself to pick whatever I thought would work out. My initial idea was to create something where humanity tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. I kind of had this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey (who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, each with their own plate. Your plate can spawn little pieces of pasta. You do so by 'ordering' them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', which are debited from your credits (you start with a number of credits). Once spawned, your pastas start flying around. Their instinct is to fly to other plates, in order to conquer them (the objective of the game is having your pasta conquer all the plates on the table). But they are really autonomous, so after being spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other pastas your own pasta kill." \ + # --model_name_or_path /chenxi/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9 \ + sleep 1 +done +echo "Test Done...." 
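Editor's note on what run_ppl.sh above measures: a minimal sketch of the sliding-window perplexity computed by `compute_perplexity` in examples/habana/utils.py, i.e. ppl = exp(mean per-window negative log-likelihood), with tokens already scored in earlier windows masked out via the -100 label. This is an illustration under those assumptions, not the patch's exact implementation (it omits the BOS handling, truncation, and HPU-specific details).

```python
# Illustrative sketch only: sliding-window perplexity in the spirit of
# compute_perplexity() in examples/habana/utils.py. `model` is any Hugging Face
# causal LM; `input_ids` is a LongTensor of shape (1, seq_len) on the model's device.
import torch

def sliding_window_ppl(model, input_ids, window_len=1024, stride=512):
    nlls = []
    prev_end = 0
    seq_len = input_ids.size(1)
    for begin in range(0, seq_len, stride):
        end = min(begin + window_len, seq_len)
        trg_len = end - prev_end                 # only the newly covered tokens are scored
        ids = input_ids[:, begin:end]
        labels = ids.clone()
        labels[:, :-trg_len] = -100              # ignore tokens scored in earlier windows
        with torch.no_grad():
            nlls.append(model(ids, labels=labels).loss)
        prev_end = end
        if end == seq_len:
            break
    # Perplexity is exp of the mean negative log-likelihood over all windows.
    return float(torch.exp(torch.stack(nlls).mean()))
```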
diff --git a/examples/habana/utils.py b/examples/habana/utils.py index 3e5926411ac..29dece41461 100644 --- a/examples/habana/utils.py +++ b/examples/habana/utils.py @@ -37,7 +37,7 @@ def compute_perplexity(model, tokenizer, inputs, samples_num=None, add_start_tok if max_length and truncate: encoded_texts = [x[0:max_tokenized_len] for x in encoded_texts] attn_masks = [x[0:max_tokenized_len] for x in attn_masks] - # sliding_window = max_tokenized_len + sliding_window = max_tokenized_len nlls = [] t_ppl = time.perf_counter() @@ -75,7 +75,7 @@ def compute_perplexity(model, tokenizer, inputs, samples_num=None, add_start_tok ppl = float(torch.exp(torch.stack(nlls).mean()).float().cpu()) ppl_duration = time.perf_counter() - t_ppl - return {'max_length': max_length, 'ppl': ppl, 'duration': ppl_duration, 'samples_num': samples_num, 'sliding_window': sliding_window} + return {'max_length': max_length, 'ppl': ppl, 'duration': ppl_duration, 'samples_num': samples_num} def print_memory_stats(p_info=""): @@ -106,39 +106,10 @@ def adjust_batch(batch, size): return adjusted_batch -def override_print(enable): - import builtins as __builtin__ - - builtin_print = __builtin__.print - - def print(*args, **kwargs): - force = kwargs.pop("force", False) - if force or enable: - builtin_print(*args, **kwargs) - - __builtin__.print = print - - -def override_logger(logger, enable): - logger_info = logger.info - - def info(*args, **kwargs): - force = kwargs.pop("force", False) - if force or enable: - logger_info(*args, **kwargs) - - logger.info = info - - def count_hpu_graphs(): return len(glob.glob(".graph_dumps/*PreGraph*")) -def override_prints(enable, logger): - override_print(enable) - override_logger(logger, enable) - - def setup_distributed(args): args.local_rank = int(os.getenv("LOCAL_RANK", "0")) args.world_size = int(os.getenv("WORLD_SIZE", "0")) @@ -207,8 +178,7 @@ def get_torch_compiled_model(model): return model -def setup_model(args, model_dtype, model_kwargs, logger): - logger.info("Single-device run.") +def setup_model(args, model_dtype, model_kwargs): model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) if args.quant_config: import habana_quantization_toolkit @@ -233,10 +203,9 @@ def setup_model(args, model_dtype, model_kwargs, logger): return model -def setup_distributed_model(args, model_dtype, model_kwargs, logger): +def setup_distributed_model(args, model_dtype, model_kwargs): import deepspeed - logger.info("DeepSpeed is enabled.") deepspeed.init_distributed(dist_backend="hccl") config = AutoConfig.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) load_to_meta = model_on_meta(config) @@ -255,7 +224,7 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger): if args.local_rank == 0: if Path(merged_model_dir).is_dir(): shutil.rmtree(merged_model_dir) - peft_model(args, model_dtype, logger, **model_kwargs).save_pretrained(merged_model_dir) + peft_model(args, model_dtype, **model_kwargs).save_pretrained(merged_model_dir) torch.distributed.barrier() write_checkpoints_json( @@ -268,7 +237,7 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger): # TODO: revisit placement on CPU when auto-injection is possible with deepspeed.OnDevice(dtype=model_dtype, device="cpu"): if args.peft_model is not None: - model = peft_model(args, model_dtype, logger, **model_kwargs) + model = peft_model(args, model_dtype, **model_kwargs) else: model = AutoModelForCausalLM.from_pretrained( 
args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs @@ -299,7 +268,7 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger): return model -def peft_model(args, model_dtype, logger, **model_kwargs): +def peft_model(args, model_dtype, **model_kwargs): import importlib.util if importlib.util.find_spec("peft") is None: @@ -326,12 +295,6 @@ def peft_model(args, model_dtype, logger, **model_kwargs): if base_model_is_local or base_model_is_remote: model = AutoPeftModelForCausalLM.from_pretrained(args.peft_model, torch_dtype=model_dtype, **model_kwargs) else: - # Since the base model doesn't exist locally nor remotely, use `args.model_name_or_path` as the base model - logger.warning( - f"The base model `{base_model_name}` of the LoRA configuration associated" - f" to `{args.peft_model}` does not exist locally or remotely. Using " - f"`--model_name_or_path {args.model_name_or_path}` as a fall back for the base model." - ) from peft import PeftModel model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) @@ -400,10 +363,11 @@ def setup_generation_config(args, model, tokenizer): return generation_config -def initialize_model(args, logger): +def initialize_model(args): + args.quant_config = os.getenv("QUANT_CONFIG", "") init_start = time.perf_counter() setup_distributed(args) - override_prints(args.global_rank == 0 or args.verbose_workers, logger) + # override_prints(args.global_rank == 0 or args.verbose_workers,) setup_env(args) setup_device(args) set_seed(27) @@ -425,9 +389,9 @@ def initialize_model(args, logger): model_kwargs["offload_folder"] = "/tmp/offload_folder/" model = ( - setup_model(args, model_dtype, model_kwargs, logger) + setup_model(args, model_dtype, model_kwargs) if not use_deepspeed - else setup_distributed_model(args, model_dtype, model_kwargs, logger) + else setup_distributed_model(args, model_dtype, model_kwargs) ) tokenizer, model = setup_tokenizer(args, model) generation_config = setup_generation_config(args, model, tokenizer) @@ -443,7 +407,4 @@ def initialize_model(args, logger): # TODO always initialize model htcore.hpu_initialize(model) init_end = time.perf_counter() - # logger.info(f"Args: {args}") - logger.info(f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}") - logger.info(f"Model initialization took {(init_end - init_start):.3f}s") return model, tokenizer, generation_config From 360e32fc0b98d809df5b6eeb9622473e4c26a777 Mon Sep 17 00:00:00 2001 From: zhentaoyu Date: Wed, 22 May 2024 09:57:51 +0800 Subject: [PATCH 10/25] [Gaudi] Add LLAMA Streaming LLM in Gaudi (#1558) * initial commit Signed-off-by: Yu, Zhentao * mv example Signed-off-by: Yu Zhentao * update model dtype Signed-off-by: Yu Zhentao * fix multi-round generation without streaming_llm Signed-off-by: Yu, Zhentao * add mem and token num log Signed-off-by: Yu, Zhentao * rebase Signed-off-by: Yu, Zhentao * initial fp8 Signed-off-by: Yu, Zhentao * add ppl eval scripts Signed-off-by: Yu, Zhentao * typo Signed-off-by: Yu, Zhentao * add llama2-13b ppl eval script (align paper) Signed-off-by: Yu, Zhentao * hide kv cache operation inside (v0.1) Signed-off-by: Yu, Zhentao * hide kv cache operation inside (v0.2) Signed-off-by: Yu, Zhentao * hide kv cache operation inside (v0.3) Signed-off-by: Yu, Zhentao * update scripts Signed-off-by: Yu, Zhentao * add README Signed-off-by: Yu, Zhentao * update test scripts Signed-off-by: Yu, Zhentao * remove useless code Signed-off-by: Yu, Zhentao * update README 
and rename shell scripts Signed-off-by: Yu, Zhentao --------- Signed-off-by: Yu, Zhentao Signed-off-by: Yu Zhentao --- examples/habana/streaming_llm/README.md | 61 ++++ .../streaming_llm/eval_bf16_streaming.sh | 33 ++ .../streaming_llm/eval_fp8_streaming.sh | 51 +++ examples/habana/streaming_llm/perplexity.py | 135 ++++++++ .../habana/streaming_llm/plot_perplexity.py | 141 ++++++++ .../streaming_llm/run_bf16_streaming.sh | 12 + .../habana/streaming_llm/run_fp8_streaming.sh | 27 ++ .../habana/streaming_llm/run_streaming_llm.py | 167 ++++++++++ examples/habana/utils.py | 16 +- .../generation/configuration_utils.py | 2 + .../modeling_gaudi/generation/utils.py | 6 + .../models/llama/modeling_llama.py | 73 ++++- .../models/llama/pos_shift_llama.py | 300 ++++++++++++++++++ .../modeling/modeling_gaudi/streaming_llm.py | 51 +++ 14 files changed, 1073 insertions(+), 2 deletions(-) create mode 100644 examples/habana/streaming_llm/README.md create mode 100644 examples/habana/streaming_llm/eval_bf16_streaming.sh create mode 100644 examples/habana/streaming_llm/eval_fp8_streaming.sh create mode 100644 examples/habana/streaming_llm/perplexity.py create mode 100644 examples/habana/streaming_llm/plot_perplexity.py create mode 100644 examples/habana/streaming_llm/run_bf16_streaming.sh create mode 100644 examples/habana/streaming_llm/run_fp8_streaming.sh create mode 100644 examples/habana/streaming_llm/run_streaming_llm.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_gaudi/streaming_llm.py diff --git a/examples/habana/streaming_llm/README.md b/examples/habana/streaming_llm/README.md new file mode 100644 index 00000000000..2218ad06dc0 --- /dev/null +++ b/examples/habana/streaming_llm/README.md @@ -0,0 +1,61 @@ +# Streaming LLM + +Streaming LLM is a useful approach for long-context generation and multi-round chat scenarios. In this example, we show how to enable it on Intel Gaudi devices. For more technical details, please refer to the [paper](https://arxiv.org/abs/2309.17453). + +> Note: Only the Llama model architecture and a single HPU card are supported. + +## Create Environment +Validated on Habana version 1.15.1 with its PyTorch 2.2.0 docker image. + +```shell +# start docker +docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -v : --name "streaming_llm" vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest + +# install related packages in docker +# install optimum-habana +pip install git+https://github.com/huggingface/optimum-habana.git@753da20f98ad6f874075701995428072159ba600 +# install intel-extension-for-transformers +git clone https://github.com/intel/intel-extension-for-transformers.git itrex +cd itrex && python setup.py install +# or just install from PyPI +pip install intel-extension-for-transformers +``` + +## Run +We provide [01-ai/Yi-34B-Chat](https://huggingface.co/01-ai/Yi-34B-Chat) as the default example model for demonstrating streaming outputs. + +1. bf16 data type: +```shell +bash run_bf16_streaming.sh +``` + +2. fp8 data type: +```shell +bash run_fp8_streaming.sh +``` + +You can change the input args values (like `attention_sink_window_size`, `num_sample` for fp8 calibration, etc.) or set the env var `MODEL=` to run more experiments.
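Editor's note on the window arguments used throughout these scripts: a minimal sketch of the attention-sink cache policy, assuming the behavior described in the StreamingLLM paper and reflected by `attention_sink_size` + `attention_sink_window_size` (the KV cache keeps the first few "sink" tokens plus a rolling window of the most recent tokens). The helper below is illustrative only, not the patched Llama/HPU implementation.

```python
# Illustrative sketch only (not the patched code): which token positions an
# attention-sink KV cache keeps after `seq_len` processed tokens.

def attention_sink_keep_positions(seq_len, sink_size=4, window_size=1020):
    budget = sink_size + window_size          # total KV-cache length, 1024 with the script defaults
    if seq_len <= budget:
        return list(range(seq_len))
    sinks = list(range(sink_size))            # the first tokens are always kept ("attention sinks")
    recent = list(range(seq_len - window_size, seq_len))  # plus a rolling window of recent tokens
    return sinks + recent

if __name__ == "__main__":
    kept = attention_sink_keep_positions(5000)
    print(len(kept))    # 1024 == attention_sink_size + attention_sink_window_size
    print(kept[:6])     # [0, 1, 2, 3, 3980, 3981]
```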
+ +## Evaluation (PPL) + +We follow the token-by-token perplexity evaluation approach used in [streaming llm](https://github.com/mit-han-lab/streaming-llm/blob/main/examples/eval_long_ppl.py#L81-L91). + +1. Test `llama2-13b` perplexity to align with the paper's result. + +```shell +HF_TOKEN= MODEL=meta-llama/Llama-2-13b-hf bash eval_bf16_streaming.sh +``` + +2. Test another model with the bf16 data type + +```shell +MODEL= bash eval_bf16_streaming.sh +``` + +3. Test a model with the fp8 data type + +```shell +MODEL= bash eval_fp8_streaming.sh +``` + +After the perplexity evaluation, the shell script plots `ppl_memory` and `ppl_latency` figures in SVG format for visualization. diff --git a/examples/habana/streaming_llm/eval_bf16_streaming.sh b/examples/habana/streaming_llm/eval_bf16_streaming.sh new file mode 100644 index 00000000000..958d39c9117 --- /dev/null +++ b/examples/habana/streaming_llm/eval_bf16_streaming.sh @@ -0,0 +1,33 @@ +MODEL_NAME_OR_PATH=${MODEL:-meta-llama/Llama-2-13b-hf} +echo "========== USING MODEL: ${MODEL_NAME_OR_PATH} ==========" +python run_streaming_llm.py \ + --model_name_or_path=${MODEL_NAME_OR_PATH} \ + --dataset=emozilla/pg19-test \ + --split=test \ + --attention_sink_window_size=1020 \ + --attention_sink_size=4 \ + --num_sample=1 \ + --num_tokens=65000 \ + --bf16 \ + --use_kv_cache \ + --use_hpu_graphs \ + --perplexity \ + --output_dir=benchmark/bf16_streaming_outputs \ + --overwrite + +echo "========== PLOTTING PERPLEXITY ==========" +python plot_perplexity.py \ + --features perplexity memory \ + --output_dir benchmark/bf16_streaming_outputs \ + --title "Log perplexity & memory of BF16 model in streaming_llm" \ + --log_perplexity_limit 5.0 \ + --skip_first 100 \ + --figure_dir bf16_streaming_ppl_memory.svg + +python plot_perplexity.py \ + --features perplexity latency \ + --output_dir benchmark/bf16_streaming_outputs \ + --title "Log perplexity & latency of BF16 model in streaming_llm" \ + --log_perplexity_limit 5.0 \ + --skip_first 100 \ + --figure_dir bf16_streaming_ppl_latency.svg diff --git a/examples/habana/streaming_llm/eval_fp8_streaming.sh b/examples/habana/streaming_llm/eval_fp8_streaming.sh new file mode 100644 index 00000000000..046112381db --- /dev/null +++ b/examples/habana/streaming_llm/eval_fp8_streaming.sh @@ -0,0 +1,51 @@ +MODEL_NAME_OR_PATH=${MODEL:-meta-llama/Llama-2-13b-hf} +echo "========== using model: ${MODEL_NAME_OR_PATH} ==========" +echo "========== START TO MEASURE ==========" +QUANT_CONFIG=../quantization_config/maxabs_measure.json python run_streaming_llm.py \ + --model_name_or_path=${MODEL_NAME_OR_PATH} \ + --dataset=emozilla/pg19-test \ + --split=test \ + --attention_sink_window_size=1020 \ + --attention_sink_size=4 \ + --num_sample=1 \ + --num_tokens=2000 \ + --bf16 \ + --use_kv_cache \ + --use_hpu_graphs \ + --perplexity \ + --output_dir=benchmark/fp8_streaming_outputs \ + --overwrite + +echo "========== START TO QUANT AND RUN ==========" +QUANT_CONFIG=../quantization_config/maxabs_quant.json python run_streaming_llm.py \ + --model_name_or_path=${MODEL_NAME_OR_PATH} \ + --dataset=emozilla/pg19-test \ + --split=test \ + --attention_sink_window_size=1020 \ + --attention_sink_size=4 \ + --num_sample=1 \ + --num_tokens=65000 \ + --bf16 \ + --use_kv_cache \ + --use_hpu_graphs \ + --perplexity \ + --output_dir=benchmark/fp8_streaming_outputs \ + --overwrite \ + --fp8 + +echo "========== PLOTTING PERPLEXITY ==========" +python plot_perplexity.py \ + --features perplexity memory \ + --output_dir benchmark/fp8_streaming_outputs \ + --title "Log perplexity & memory of FP8
model in streaming_llm" \ + --log_perplexity_limit 5.0 \ + --skip_first 100 \ + --figure_dir fp8_streaming_ppl_memory.svg + +python plot_perplexity.py \ + --features perplexity latency \ + --output_dir benchmark/fp8_streaming_outputs \ + --title "Log perplexity & latency of FP8 model in streaming_llm" \ + --log_perplexity_limit 5.0 \ + --skip_first 100 \ + --figure_dir fp8_streaming_ppl_latency.svg diff --git a/examples/habana/streaming_llm/perplexity.py b/examples/habana/streaming_llm/perplexity.py new file mode 100644 index 00000000000..d2fd677ba14 --- /dev/null +++ b/examples/habana/streaming_llm/perplexity.py @@ -0,0 +1,135 @@ +# !/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding=utf-8 +# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Adapted from https://github.com/tomaarsen/attention_sinks +""" + + +import itertools +import time +from collections import defaultdict +from pathlib import Path + +import pandas as pd +import torch +from torch.nn import CrossEntropyLoss +from tqdm import tqdm +from optimum.habana.utils import get_hpu_memory_stats + + +def compute_perplexity( + model, + tokenizer, + dataset, + kv_window_size=1024, + output_dir= "outputs", + data_column= "text", + num_samples = 1, + num_tokens= None, + overwrite= False, +): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + suffix = "attention_sink" + output_file = output_dir / f"{suffix}.csv" + + if output_file.exists() and not overwrite: + raise ValueError( + f"The {output_file!r} output file already exists - if you really want to override it, then use `--overwrite`." 
+ ) + + logs = defaultdict(list) + loss_fn = CrossEntropyLoss(reduction="none") + num_processed_tokens = 0 + + # allocate kv cache + model.allocate_kv_cache(1, kv_window_size, 1) + for text in itertools.islice(dataset, num_samples): + encodings = tokenizer(text[data_column], return_tensors="pt") + + seq_len = encodings.input_ids.size(1) + print(f"sequence length: {seq_len}") + pbar = tqdm(range(0, seq_len - 1)) + + for idx in pbar: + start_t = time.time() + input_ids = encodings.input_ids[:, idx : idx + 1].to(model.device) + attention_mask = torch.full((1, 1, 1, kv_window_size), + 1, + dtype=torch.int64, + device="cpu") + n_past = min(idx, kv_window_size -1) + attention_mask[:, :, :, n_past + 1:] = 0 + attention_mask = attention_mask.to(model.device) + pos_ids = torch.full((1,1), n_past, dtype=torch.int64, device=model.device) + cache_prune_num = 0 if idx < kv_window_size else 1 + with torch.no_grad(): + outputs = model(input_ids, + position_ids=pos_ids, + attention_mask=attention_mask, + attn_softmax_bf16=True, + use_cache=True, + reuse_cache=True, + cache_prune_num = cache_prune_num, + ) + logits = outputs.logits.view(-1, model.config.vocab_size).cpu() + logits = logits.to(torch.float32) + label = encodings.input_ids[:, idx + 1 : idx + 2].to(logits.device).view(-1) + neg_log_likelihood = loss_fn(logits, label) + perplexity = neg_log_likelihood.exp() + pbar.set_description(f"nll: {neg_log_likelihood.item():>5.2f}, ppl: {perplexity.item():>8.2f}") + + # Store data and save every 10 tokens + logs["latency"].append(time.time() - start_t) + logs["input_length"].append(idx + 1) + logs["nll"].append(neg_log_likelihood.item()) + logs["ppl"].append(perplexity.item()) + logs["overall_ppl"].append(torch.tensor(logs["nll"]).mean().exp().item()) + logs["hpu_ram_allocated"].append(get_hpu_memory_stats().get('memory_allocated (GB)')) # in GB + if num_processed_tokens % 10 == 0: + try: + pd.DataFrame(logs).to_csv(output_file, index=False) + except KeyboardInterrupt as ex: + # If there's a Keyboard Interrupt, still write the file, and then stop + pd.DataFrame(logs).to_csv(output_file, index=False) + raise ex + + num_processed_tokens += 1 + if num_tokens and num_processed_tokens >= num_tokens: + break + if num_tokens and num_processed_tokens >= num_tokens: + break + + pd.DataFrame(logs).to_csv(output_file, index=False) + print(f"overall_ppl: {logs['overall_ppl'][-1]: >8.2f}") + return diff --git a/examples/habana/streaming_llm/plot_perplexity.py b/examples/habana/streaming_llm/plot_perplexity.py new file mode 100644 index 00000000000..f7e305f058b --- /dev/null +++ b/examples/habana/streaming_llm/plot_perplexity.py @@ -0,0 +1,141 @@ +# !/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding=utf-8 +# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Adapted from https://github.com/tomaarsen/attention_sinks + +First run `run_streaming_llm.py` to generate one or more `csv` files. +This script can plot those csv files. + +Usage: +python benchmark/plot_perplexity.py +python benchmark/plot_perplexity.py --features perplexity latency --title "Log perplexity & latency of Llama 2 7B as a function of input lengths" +""" + + +import argparse +from pathlib import Path +from typing import List, Optional + +import numpy as np +import pandas as pd +from matplotlib import pyplot as plt + +FEATURE_DF_MAP = { + "perplexity": "overall_ppl", + "memory": "hpu_ram_allocated", + "latency": "latency", +} +FEATURE_STYLE_MAP = { + "perplexity": "-", + "memory": "--", + "latency": ":", +} +FEATURE_LABEL_MAP = { + "perplexity": "Perplexity (log), lower is better", + "memory": "HPU RAM Usage (GB), lower is better", + "latency": "Time per token (sec), lower is better", +} + + +def plot( + features: List[str], + output_dir: str = "outputs", + title: Optional[str] = None, + perplexity_limit: Optional[float] = None, + skip_first: int = 100, +): + output_dir = Path(output_dir) + + fig, ax = plt.subplots() + ax.set_xlabel("Input Sequence Length") + + for feature_i, feature in enumerate(features): + # If we already plotted on this ax, make a new one + if feature_i: + ax = ax.twinx() + + for file in output_dir.glob("*.csv"): + experiment = file.stem + df = pd.read_csv(file) + X = df["input_length"][skip_first:] + Y = df[FEATURE_DF_MAP[feature]][skip_first:] + if feature == "perplexity": + Y = np.log(Y) + if feature == "latency": + poly = np.polyfit(X, Y, 20) + poly_y = np.poly1d(poly)(X) + ax.plot(X, poly_y, FEATURE_STYLE_MAP[feature], label=f"{experiment} {feature}") + else: + ax.plot(X, Y, FEATURE_STYLE_MAP[feature], label=f"{experiment} {feature}") + + ax.set_ylabel(FEATURE_LABEL_MAP[feature]) + if perplexity_limit and feature == "perplexity": + ax.set_ylim(top=min(ax.get_ylim()[1], perplexity_limit)) + + ax.legend(loc=[1, 2, 7][feature_i]) # upper right, upper left, center right + + ax.set_title(title.replace("\\n", "\n") or "Log perplexity as a function of input lengths") + fig.tight_layout() + + return fig + + +def main(): + parser = argparse.ArgumentParser() + # Where csv files have been logged + parser.add_argument("--output_dir", type=str, default="benchmark/outputs") + parser.add_argument( + "--features", choices=["perplexity", "memory", "latency"], nargs="+", default=["perplexity", "memory"] + ) + parser.add_argument("--title", type=str, default="Log perplexity as a function of input lengths") + parser.add_argument("--log_perplexity_limit", type=float, default=5.0) + # Perplexity starts a bit unstable, so we skip the start + parser.add_argument("--skip_first", type=int, default=100) + parser.add_argument("--figure_dir", type=str, default="perplexity.svg") + + args = parser.parse_args() + + figure = plot( + args.features, + output_dir=args.output_dir, + title=args.title, + perplexity_limit=args.log_perplexity_limit, + skip_first=args.skip_first, + ) + + figure.savefig(args.figure_dir) + # plt.show() + + +if __name__ == "__main__": + main() diff 
--git a/examples/habana/streaming_llm/run_bf16_streaming.sh b/examples/habana/streaming_llm/run_bf16_streaming.sh new file mode 100644 index 00000000000..72410ce0c0b --- /dev/null +++ b/examples/habana/streaming_llm/run_bf16_streaming.sh @@ -0,0 +1,12 @@ +MODEL_NAME_OR_PATH=${MODEL:-01-ai/Yi-34B-Chat} +echo "========== using model: ${MODEL_NAME_OR_PATH} ==========" +python run_streaming_llm.py \ + --model_name_or_path=${MODEL_NAME_OR_PATH} \ + --dataset=HuggingFaceH4/mt_bench_prompts \ + --attention_sink_window_size=1020 \ + --attention_sink_size=4 \ + --max_new_token=512 \ + --num_sample=-1 \ + --bf16 \ + --use_kv_cache \ + --use_hpu_graphs diff --git a/examples/habana/streaming_llm/run_fp8_streaming.sh b/examples/habana/streaming_llm/run_fp8_streaming.sh new file mode 100644 index 00000000000..49a6c274666 --- /dev/null +++ b/examples/habana/streaming_llm/run_fp8_streaming.sh @@ -0,0 +1,27 @@ +MODEL_NAME_OR_PATH=${MODEL:-01-ai/Yi-34B-Chat} +echo "========== using model: ${MODEL_NAME_OR_PATH} ==========" +echo "========== START TO MEASURE ==========" +QUANT_CONFIG=../quantization_config/maxabs_measure.json python run_streaming_llm.py \ + --model_name_or_path=${MODEL_NAME_OR_PATH} \ + --dataset=HuggingFaceH4/mt_bench_prompts \ + --attention_sink_window_size=1020 \ + --attention_sink_size=4 \ + --max_new_token=32 \ + --num_sample=-1 \ + --bf16 \ + --use_kv_cache \ + --use_hpu_graphs +echo "========== FINISH MEASUREMENT ==========" + +echo "========== START TO QUANT AND RUN ==========" +QUANT_CONFIG=../quantization_config/maxabs_quant.json python run_streaming_llm.py \ + --model_name_or_path=${MODEL_NAME_OR_PATH} \ + --dataset=HuggingFaceH4/mt_bench_prompts \ + --attention_sink_window_size=1020 \ + --attention_sink_size=4 \ + --max_new_token=512 \ + --num_sample=-1 \ + --bf16 \ + --use_kv_cache \ + --use_hpu_graphs \ + --fp8 diff --git a/examples/habana/streaming_llm/run_streaming_llm.py b/examples/habana/streaming_llm/run_streaming_llm.py new file mode 100644 index 00000000000..0be0dfc4322 --- /dev/null +++ b/examples/habana/streaming_llm/run_streaming_llm.py @@ -0,0 +1,167 @@ +# !/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding=utf-8 +# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import os +import sys +import argparse +from typing import Any, Dict, List + +import torch +from datasets import load_dataset +from transformers import TextStreamer + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from run_generation import setup_parser +from utils import print_memory_stats, initialize_model + + +def create_prompts(samples: Dict[str, List[Any]]) -> Dict[str, Any]: + return {"prompt": [prompt for prompts in samples["prompt"] for prompt in prompts]} + + +@torch.no_grad() +def greedy_generate(model, tokenizer, dataset, args, generation_config, max_new_tokens=512, n_round=-1): + streamer = TextStreamer(tokenizer, skip_special_tokens=True) + new_line_tokens = tokenizer("\n\n", return_tensors="pt", add_special_tokens=False).input_ids + num_token = 0 + count_round = 0 + + generation_config.max_new_tokens = max_new_tokens + generation_config.do_sample = False + generation_config.top_p = None + generation_config.use_cache = True + generation_config.attn_softmax_bf16 = True + generation_config.reuse_cache = True + generation_config.ignore_eos=False + generation_config.bucket_size = -1 + generation_config.attention_sink_size = args.attention_sink_size + generation_config.attention_sink_window_size = args.attention_sink_window_size + print(generation_config) + use_lazy_mode = True + if args.torch_compile and model.config.model_type == "llama": + use_lazy_mode = False + for prompt_index, prompt in enumerate(dataset["prompt"]): + if tokenizer.chat_template is not None: + # Use the chat template initially, as it adds the system prompt if the model has one, + # and then use [INST] and [/INST] + if prompt_index: + prompt = f"[INST] {prompt} [/INST]" + else: + prompt = tokenizer.apply_chat_template([{"role": "user", "content": prompt}], tokenize=False) + input_ids = tokenizer(prompt, return_tensors="pt").input_ids + input_ids = input_ids.to(model.device) + + outputs = model.generate( + input_ids, + generation_config=generation_config, + streamer=streamer, + lazy_mode=use_lazy_mode, + hpu_graphs=args.use_hpu_graphs, + profiling_steps=args.profiling_steps, + profiling_warmup_steps=args.profiling_warmup_steps, + ).cpu() + + # ignore padding token + num_token += (outputs.shape[-1] - outputs[0].tolist().count(generation_config.pad_token_id)) + streamer.put(new_line_tokens) + print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") + print_memory_stats() + print("total token: {}k".format(num_token / 1000.0)) + print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") + count_round += 1 + if n_round > 0 and count_round >= n_round: + break + +def setup_streaming_llm_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Streaming LLM script for HPU" + ) + + # Dataset args, not recommended to change + # streaming demo: HuggingFaceH4/mt_bench_prompts + # ppl: emozilla/pg19-test + parser.add_argument("--dataset", type=str, default="HuggingFaceH4/mt_bench_prompts") + parser.add_argument("--data_column", type=str, default="text") + parser.add_argument("--task", type=str, default=None) + parser.add_argument("--split", type=str, default="test") + parser.add_argument("--num_sample", type=int, default=-1) + parser.add_argument("--num_tokens", type=int, default=8192) + + # Window size for attention_sinks + parser.add_argument("--attention_sink_window_size", type=int, default=1020) + # Attention Sink whole window size is calculated with args.attention_sink_window_size + 
args.attention_sink_size + parser.add_argument("--attention_sink_size", type=int, default=4) + + # compute perplexity and log + parser.add_argument("--perplexity", action="store_true") + parser.add_argument("--output_dir", type=str, default="benchmark/outputs") + parser.add_argument("--overwrite", action="store_true") + + args = setup_parser(parser) + + return args + +def main(): + args = setup_streaming_llm_parser() + model, tokenizer, generation_config = initialize_model(args) + + if args.perplexity: # compute perplexity + from perplexity import compute_perplexity + # Set up the dataset + dataset = load_dataset(args.dataset, args.task, split=args.split, streaming=True) + compute_perplexity( + model, + tokenizer, + dataset, + kv_window_size=args.attention_sink_window_size + args.attention_sink_size, + output_dir=args.output_dir, + data_column=args.data_column, + num_samples=1, # No support for more than one instance now + num_tokens=args.num_tokens, + overwrite=args.overwrite, + ) + else: # streaming generation demo + # Set up the dataset + dataset = load_dataset(args.dataset, split="train") + dataset = dataset.map(create_prompts, batched=True, remove_columns=dataset.column_names) + + greedy_generate(model, tokenizer, dataset, args, generation_config, + max_new_tokens=args.max_new_tokens, n_round=args.num_sample) + + if args.quant_config: + import habana_quantization_toolkit + habana_quantization_toolkit.finish_measurements(model) + + +if __name__ == "__main__": + main() diff --git a/examples/habana/utils.py b/examples/habana/utils.py index 29dece41461..1e3bfffab6e 100644 --- a/examples/habana/utils.py +++ b/examples/habana/utils.py @@ -334,6 +334,13 @@ def setup_generation_config(args, model, tokenizer): bad_words_ids = None force_words_ids = None + attention_sink_size = None + attention_sink_window_size = None + if hasattr(args, "attention_sink_size"): + attention_sink_size = args.attention_sink_size + if hasattr(args, "attention_sink_window_size"): + attention_sink_window_size = args.attention_sink_window_size + is_optimized = model_is_optimized(model.config) # Generation configuration generation_config = copy.deepcopy(model.generation_config) @@ -360,6 +367,9 @@ def setup_generation_config(args, model, tokenizer): assert generation_config.bucket_size > 0 # TODO this will also influence generation_config.use_flash_attention = False + # attention_sinks + generation_config.attention_sink_size = attention_sink_size + generation_config.attention_sink_window_size = attention_sink_window_size return generation_config @@ -382,12 +392,16 @@ def initialize_model(args): model_kwargs = { "revision": "main", - "token":None, + "token":None, } model_kwargs["device_map"] = "auto" model_kwargs["offload_folder"] = "/tmp/offload_folder/" + if hasattr(args, "attention_sink_size") and hasattr(args, "attention_sink_window_size"): + model_kwargs["attention_sink_size"] = args.attention_sink_size + model_kwargs["attention_sink_window_size"] = args.attention_sink_window_size + model = ( setup_model(args, model_dtype, model_kwargs) if not use_deepspeed diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py index 8130f375fc1..b9bc86f1c56 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py +++ 
b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py @@ -66,3 +66,5 @@ def __init__(self, **kwargs): self.flash_attention_recompute = kwargs.get("flash_attention_recompute", None) self.flash_attention_causal_mask = kwargs.get("flash_attention_causal_mask", None) self.use_fused_rope = kwargs.get("use_fused_rope", None) + self.attention_sink_size = kwargs.get("attention_sink_size", None) + self.attention_sink_window_size = kwargs.get("attention_sink_window_size", None) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py index 62fed0fbb14..aa05cc65aba 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py @@ -744,6 +744,12 @@ def generate( calculated_max_length = input_ids.shape[-1] + generation_config.max_new_tokens if generation_config.use_cache and generation_config.reuse_cache: bs, _ = input_ids.shape + # attention_sinks has fixed kv_cache_len + if generation_config.attention_sink_size is not None and \ + generation_config.attention_sink_window_size is not None: + attn_window_len = generation_config.attention_sink_size + \ + generation_config.attention_sink_window_size + calculated_max_length = max(calculated_max_length, attn_window_len) if not is_greedy_or_beam_and_bucket: unwrap_deepspeed_model(self).allocate_kv_cache( bs * generation_config.num_beams, calculated_max_length, token_idx diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py index 61fb4858baa..8fb9ef66ee2 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py @@ -329,6 +329,7 @@ def pre_attn_forward( flash_attention_recompute: Optional[bool] = False, flash_attention_causal_mask: Optional[bool] = False, cache_idx: int = None, + cache_prune_num: int = 0, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """ @@ -341,6 +342,7 @@ def pre_attn_forward( - add new args use_flash_attention - add new arg flash_attention_recompute - add new arg flash_attention_causal_mask + - add new arg cache_prune_num for attention_sinks """ bsz, q_len, _ = hidden_states.size() @@ -521,6 +523,7 @@ def forward( flash_attention_recompute: Optional[bool] = False, flash_attention_causal_mask: Optional[bool] = False, cache_idx: int = None, + cache_prune_num: int = 0, **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -532,6 +535,7 @@ def forward( - add new args use_flash_attention - add new arg flash_attention_recompute - add new arg flash_attention_causal_mask + - add new arg cache_prune_num for attention_sinks """ if "padding_mask" in kwargs: warnings.warn( @@ -554,6 +558,7 @@ def forward( flash_attention_recompute=flash_attention_recompute, flash_attention_causal_mask=flash_attention_causal_mask, cache_idx=cache_idx, + cache_prune_num = cache_prune_num, **kwargs, ) self.self_attn.attention_all_reduce(hidden_states) @@ -586,6 +591,7 @@ def pre_attn( flash_attention_recompute: Optional[bool] = False, 
flash_attention_causal_mask: Optional[bool] = False, cache_idx: int = None, + cache_prune_num: int = 0, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: hidden_states = self.input_layernorm(hidden_states) hidden_states, attn_weights, present_key_value = self.self_attn.pre_attn_forward( @@ -603,6 +609,7 @@ def pre_attn( flash_attention_recompute, flash_attention_causal_mask, cache_idx=cache_idx, + cache_prune_num = cache_prune_num, ) return hidden_states, attn_weights, present_key_value @@ -697,6 +704,7 @@ def forward( flash_attention_causal_mask: Optional[bool] = False, cache_idx: int = None, lazy_mode: Optional[bool] = True, + cache_prune_num: int = 0, ) -> Union[Tuple, BaseModelOutputWithPast]: """ Copied from LlamaModel.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py @@ -708,6 +716,7 @@ def forward( - add new arg flash_attention_recompute - add new arg flash_attention_causal_mask - add new arg lazy_mode + - add new arg cache_prune_num for attention_sinks """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -768,6 +777,9 @@ def forward( # HPU specific mask generation if ignore_cache_position: + # workaround for attention_sinks attention_mask which has fixed seq_len at dim -1 + if hasattr(self, "attention_sink_size") and hasattr(self, "attention_sink_window_size"): + past_seen_tokens = self.attention_sink_size + self.attention_sink_window_size - seq_length causal_mask = _gaudi_prepare_4d_causal_attention_mask( attention_mask, input_ids.shape if input_ids is not None else (batch_size, seq_length), @@ -831,6 +843,7 @@ def forward( flash_attention_recompute=flash_attention_recompute, flash_attention_causal_mask=flash_attention_causal_mask, cache_idx=cache_idx, + cache_prune_num = cache_prune_num, ) hidden_states = layer_outputs[0] @@ -871,6 +884,7 @@ class GaudiLlamaForCausalLM(LlamaForCausalLM): - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx - add new args attn_softmax_bf16 - add new args reuse_cache + - add new arg cache_prune_num for attention_sinks """ def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): @@ -904,6 +918,7 @@ def forward( flash_attention_causal_mask: Optional[bool] = False, cache_idx: int = None, lazy_mode: Optional[bool] = True, + cache_prune_num: int = 0, ) -> Union[Tuple, CausalLMOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -934,6 +949,7 @@ def forward( flash_attention_causal_mask=flash_attention_causal_mask, cache_idx=cache_idx, lazy_mode=lazy_mode, + cache_prune_num=cache_prune_num, ) hidden_states = outputs[0] _, seq_len, _ = hidden_states.shape @@ -980,6 +996,10 @@ def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, token_idx=None, **kwargs ): past_length = 0 + if not hasattr(self, "kv_past_token_length"): + self.kv_past_token_length = 0 + using_attention_sinks = (hasattr(self, "attention_sink_size") and + hasattr(self, "attention_sink_window_size")) reuse_cache = kwargs.get("reuse_cache") if past_key_values is not None: @@ -1018,8 +1038,32 @@ def prepare_inputs_for_generation( input_ids = input_ids[:, :token_idx] attention_mask = attention_mask[:, :token_idx] + # prepare postion_ids and attention_mask for attention_sinks + cache_prune_num = 0 + 
kv_cache_len = kwargs.get("kv_cache_len", None) position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: + q_len = input_ids.shape[-1] + if using_attention_sinks: + assert (kv_cache_len and kv_cache_len == self.attention_sink_size + self.attention_sink_window_size) + self.kv_past_token_length = min(self.kv_past_token_length, kv_cache_len) + position_ids = torch.arange(self.kv_past_token_length, + self.kv_past_token_length + q_len, + device=input_ids.device) + attn_sink_mask = torch.ones((q_len, kv_cache_len), device=input_ids.device) + if self.kv_past_token_length < kv_cache_len: + attn_sink_mask[:, self.kv_past_token_length:] = 0 + mask = torch.zeros((q_len, q_len), device=input_ids.device) + mask_cond = torch.arange(mask.size(-1), device=mask.device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 1) + if self.kv_past_token_length + q_len > kv_cache_len: + cache_prune_num = (self.kv_past_token_length + q_len) - kv_cache_len + position_ids = position_ids - cache_prune_num + attn_sink_mask.index_copy_(-1, position_ids, mask) + attention_mask= attn_sink_mask[None, None, :, :].expand(input_ids.shape[0], 1, q_len, kv_cache_len) + position_ids = position_ids.unsqueeze(0) + self.kv_past_token_length += q_len + + if attention_mask is not None and position_ids is None and not using_attention_sinks: # create position_ids on the fly for batch generation position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) @@ -1069,10 +1113,37 @@ def prepare_inputs_for_generation( "flash_attention_causal_mask": kwargs.get("flash_attention_causal_mask"), "cache_idx": kwargs.get("cache_idx"), "lazy_mode": kwargs.get("lazy_mode"), + "cache_prune_num": cache_prune_num, } ) return model_inputs + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + # Separate Attention Sink kwargs from regular kwargs + attention_sink_kwargs = {key: value for key, value in kwargs.items() if key.startswith("attention_sink")} + for key in attention_sink_kwargs: + v = kwargs.pop(key) + assert isinstance(v, int) + + model = super().from_pretrained( + pretrained_model_name_or_path, + *model_args, + **kwargs, + ) + + if len(attention_sink_kwargs) > 0: + from intel_extension_for_transformers.transformers.modeling.modeling_gaudi.streaming_llm \ + import enable_streaming_llm + + enable_streaming_llm(model, **attention_sink_kwargs) + model.attention_sink_size = attention_sink_kwargs.get("attention_sink_size") + model.attention_sink_window_size = attention_sink_kwargs.get("attention_sink_window_size") + model.model.attention_sink_size = attention_sink_kwargs.get("attention_sink_size") + model.model.attention_sink_window_size = attention_sink_kwargs.get("attention_sink_window_size") + + return model + def apply_customized_rope(q, k, cos, sin, position_ids): if q.device.type == "hpu" and has_fused_rope: diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py new file mode 100644 index 00000000000..e9e8235c794 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py @@ -0,0 +1,300 @@ +# !/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this 
file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# coding=utf-8
+# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Adapted from https://github.com/tomaarsen/attention_sinks
+Note (accelerate inference with hpu graphs in V1.15.1):
+    1. avoid using data dependent dynamic flow
+    2. avoid updating tensor by in-place view (a[:, idx] = c)
+    3. make all shapes static
+"""
+
+
+from typing import Optional, Tuple
+import types
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from .modeling_llama import (GaudiLlamaAttention,
+                             gaudi_llama_repeat_kv,
+                             has_fused_rope,
+                             FusedSDPA,
+                             KVCache)
+from transformers.models.llama.modeling_llama import rotate_half
+
+__all__ = ["enable_gaudi_llama_pos_shift_attention", "enable_gaudi_llama_pos_shift_kv_cache"]
+
+
+def gaudi_apply_rotary_pos_emb_single(x, cos, sin, position_ids):
+    # TODO shape dimension check
+    if x.device.type == "hpu" and has_fused_rope:
+        from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 as FusedRoPE
+        return FusedRoPE.apply(
+            x, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids
+        )
+    else:
+        # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
+        cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
+        sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
+        cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+        sin = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+        x_embed = (x * cos) + (rotate_half(x) * sin)
+        return x_embed
+
+def gaudi_llama_pos_shift_kv_cache_allocate(self, inp_seq_len, dtype, device, shape):
+    assert (
+        self.window_size > inp_seq_len
+    ), f"inp_seq_len ({inp_seq_len}) must be less than window_size ({self.window_size})."
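+    # The sink cache is allocated once with a fixed sequence length of window_size
+    # (attention_sink_size + attention_sink_window_size), so the KV-cache shape stays
+    # static across decoding steps for HPU graphs; repeat calls only refresh inp_seq_len.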
+ if self.cache is None: + self.inp_seq_len = inp_seq_len + bs, num_heads, seq_len, head_dim = shape + sink_shape = (bs, num_heads, self.window_size, head_dim) + self.cache = torch.zeros(sink_shape, dtype=dtype, device=device) + else: + self.inp_seq_len = inp_seq_len + +def gaudi_llama_pos_shift_kv_cache_update(self, prev, cur, dim, idx, inp_seq_len): + if idx is not None: + prev.index_copy_(dim, idx, cur) + return prev + else: + return torch.cat((prev, cur), dim=dim) + +def gaudi_llama_pos_shift_kv_cache_forward(self, cur, dim, idx, prune_num): + update_idx = torch.arange(self.attention_sink_size, self.window_size - prune_num, device=idx.device) + shift_idx = torch.arange(self.attention_sink_size + prune_num, self.window_size, device=idx.device) + shift_cache = torch.index_select(self.cache, dim, shift_idx) + self.cache.index_copy_(dim, update_idx, shift_cache) + return self.update(self.cache, cur, dim, idx, self.inp_seq_len) + +def gaudi_llama_pos_shift_pre_attn_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + token_idx: Optional[torch.Tensor] = None, + attn_softmax_bf16: Optional[bool] = False, + reuse_cache: Optional[bool] = False, + use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, + flash_attention_causal_mask: Optional[bool] = False, + cache_idx: int = None, + cache_prune_num: int = 0, + **kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """ + Copied from LlamaAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py + The only differences are: + - add new args token_idx + - optimize KV cache + - add new args attn_softmax_bf16 + - add new args reuse_cache + - add new args use_flash_attention + - add new arg flash_attention_recompute + - add new arg flash_attention_causal_mask + - add new arg cache_prune_num for attention_sinks + """ + bsz, q_len, _ = hidden_states.size() + + if self.config.pretraining_tp > 1: + key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp + query_slices = self.q_proj.weight.split( + (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 + ) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] + query_states = torch.cat(query_states, dim=-1) + + key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] + key_states = torch.cat(key_states, dim=-1) + + value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] + value_states = torch.cat(value_states, dim=-1) + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # TODO: update when auto mp params is enabled in DeepSpeed (cf. 
https://github.com/HabanaAI/DeepSpeed/blob/94309c7b5dfc1a69858f5c9f25737b2f81a332a5/deepspeed/module_inject/replace_module.py#L440) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + + kv_seq_len = self.kv_cache_max_sl + if past_key_value is not None: + if token_idx is None: + if reuse_cache: + kv_seq_len = past_key_value[0][-2] + elif hasattr(past_key_value, "get_usable_length"): + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + else: + kv_seq_len += past_key_value[0].shape[-2] + else: + if reuse_cache: + kv_seq_len = past_key_value[0][-2] + else: + kv_seq_len = past_key_value[0].shape[-2] + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states = gaudi_apply_rotary_pos_emb_single(query_states, cos, sin, position_ids) + + if use_cache: + # reuse k, v, self_attention + if reuse_cache: + key_states = self.k_cache(key_states, 2, position_ids.squeeze(0), cache_prune_num) + value_states = self.v_cache(value_states, 2, position_ids.squeeze(0), cache_prune_num) + past_key_value = (self.k_cache.get_shape(), self.v_cache.get_shape()) + else: + if past_key_value is None: + past_key = torch.zeros(key_states.shape, dtype=self.k_proj.weight.dtype, device=key_states.device) + past_value = torch.zeros( + key_states.shape, dtype=self.k_proj.weight.dtype, device=key_states.device + ) + past_key_value = (past_key, past_value) + key_states = self.k_cache.update(past_key_value[0], key_states, 2, token_idx, self.inp_seq_len) + value_states = self.v_cache.update(past_key_value[1], value_states, 2, token_idx, self.inp_seq_len) + if token_idx is None: + past_key_value = (key_states, value_states) + + if cache_idx is not None and q_len == 1: + key_states = key_states[:, :, :cache_idx, :] + value_states = value_states[:, :, :cache_idx, :] + if attention_mask is not None: + attention_mask = attention_mask[:, :, :, :cache_idx] + kv_seq_len = key_states.shape[-2] + else: + past_key_value = None + + ### Shift Pos: key pos is the pos in cache + key_position_ids = torch.arange(kv_seq_len, device=position_ids.device).unsqueeze(0) + key_states = gaudi_apply_rotary_pos_emb_single(key_states, cos, sin, key_position_ids) + + if use_flash_attention and FusedSDPA: + import habana_frameworks.torch.hpu as ht + + if q_len == 1: + # next token + with ht.sdp_kernel(enable_recompute=False): + attn_output = FusedSDPA.apply( + query_states, key_states, value_states, attention_mask, 0.0, False, None + ) + else: + # first token + if flash_attention_causal_mask: + # causal masking on first token requires inputs to be of the same length + with ht.sdp_kernel(enable_recompute=flash_attention_recompute): + attn_output = FusedSDPA.apply(query_states, key_states, value_states, None, 0.0, True, None) + else: + with ht.sdp_kernel(enable_recompute=flash_attention_recompute): + attn_output = FusedSDPA.apply( + query_states, key_states, value_states, attention_mask, 0.0, False, None + ) + + else: + query_states, key_states, value_states, attention_mask = gaudi_llama_repeat_kv( + query_states, key_states, value_states, attention_mask, self.num_key_value_groups + ) + + attn_weights = self.matmul_qk(query_states, key_states.transpose(-2, -1)) * self.norm_factor + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask + if cache_position is not None: + causal_mask = attention_mask[:, :, cache_position, : key_states.shape[-2]] + attn_weights = 
attn_weights + causal_mask + + if attn_softmax_bf16: + attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=query_states.dtype) + else: + # upcast attention to fp32 + attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to( + query_states.dtype + ) + attn_output = self.matmul_av(attn_weights, value_states) + attn_output = attn_output.reshape(bsz, -1, q_len, self.head_dim) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + +def enable_gaudi_llama_pos_shift_attention(model, max_attention_window_size): + for name, module in reversed(model._modules.items()): + if len(list(module.children())) > 0: + enable_gaudi_llama_pos_shift_attention( + module, max_attention_window_size + ) + + if isinstance(module, GaudiLlamaAttention): + model._modules[name].pre_attn_forward = types.MethodType( + gaudi_llama_pos_shift_pre_attn_forward, model._modules[name] + ) + model._modules[name].kv_cache_max_sl = max_attention_window_size + +def enable_gaudi_llama_pos_shift_kv_cache(model, attention_sink_size, window_size): + for name, module in reversed(model._modules.items()): + if len(list(module.children())) > 0: + enable_gaudi_llama_pos_shift_kv_cache( + module, attention_sink_size, window_size + ) + + if isinstance(module, KVCache): + model._modules[name].allocate = types.MethodType( + gaudi_llama_pos_shift_kv_cache_allocate, model._modules[name] + ) + model._modules[name].update = types.MethodType( + gaudi_llama_pos_shift_kv_cache_update, model._modules[name] + ) + model._modules[name].forward = types.MethodType( + gaudi_llama_pos_shift_kv_cache_forward, model._modules[name] + ) + model._modules[name].attention_sink_size = attention_sink_size + model._modules[name].window_size = window_size diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/streaming_llm.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/streaming_llm.py new file mode 100644 index 00000000000..db71e0524d0 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/streaming_llm.py @@ -0,0 +1,51 @@ +# !/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding=utf-8 +# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Adapted from https://github.com/mit-han-lab/streaming-llm/tree/main +""" + + +def enable_streaming_llm(model, attention_sink_size=4, attention_sink_window_size=1020): + max_attention_window_size = attention_sink_window_size + attention_sink_size + if "llama" in model.config.model_type: + from .models.llama.pos_shift_llama import ( + enable_gaudi_llama_pos_shift_attention, + enable_gaudi_llama_pos_shift_kv_cache + ) + + enable_gaudi_llama_pos_shift_attention(model, max_attention_window_size) + enable_gaudi_llama_pos_shift_kv_cache(model, + attention_sink_size, + max_attention_window_size) + else: + raise ValueError(f"got {model.config.model_type}") From 3a934c5eca373c47b8b53434fbbda56ac51e343a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 22 May 2024 01:58:22 +0000 Subject: [PATCH 11/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../modeling/modeling_gaudi/models/llama/pos_shift_llama.py | 1 - .../transformers/modeling/modeling_gaudi/streaming_llm.py | 1 - 2 files changed, 2 deletions(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py index e9e8235c794..12fa9181a94 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py @@ -29,7 +29,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ Adapted from https://github.com/tomaarsen/attention_sinks Note (accelerate inference with hpu graphs in V1.15.1): diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/streaming_llm.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/streaming_llm.py index db71e0524d0..a297aca21c8 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/streaming_llm.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/streaming_llm.py @@ -29,7 +29,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- """ Adapted from https://github.com/mit-han-lab/streaming-llm/tree/main """ From fb2966e4328807deee13a6b3b3c10be3bbfe9e35 Mon Sep 17 00:00:00 2001 From: Clark Chin Date: Wed, 22 May 2024 10:40:49 +0800 Subject: [PATCH 12/25] fix the pylint issue Signed-off-by: Clark Chin --- .../generation/configuration_utils.py | 3 +- .../modeling_gaudi/generation/utils.py | 103 +++++++++++------- .../models/albert/modeling_albert.py | 1 - .../models/bart/modeling_bart.py | 1 - .../models/blip/modeling_blip.py | 5 +- .../models/blip/modeling_blip_text.py | 7 -- .../models/bloom/modeling_bloom.py | 9 +- .../models/codegen/modeling_codegen.py | 6 +- .../models/falcon/modeling_falcon.py | 42 ++++--- .../models/gpt2/modeling_gpt2.py | 8 +- .../gpt_bigcode/modeling_gpt_bigcode.py | 13 ++- .../models/gpt_neox/modeling_gpt_neox.py | 8 +- .../models/gptj/modeling_gptj.py | 16 +-- .../models/llama/modeling_llama.py | 64 +++++------ .../models/mistral/modeling_mistral.py | 46 ++++---- .../models/mixtral/modeling_mixtral.py | 48 ++++---- .../models/modeling_all_models.py | 14 +-- .../models/modeling_attn_mask_utils.py | 7 +- .../modeling_gaudi/models/mpt/modeling_mpt.py | 10 +- .../modeling_gaudi/models/opt/modeling_opt.py | 15 +-- .../modeling_gaudi/models/phi/modeling_phi.py | 26 +++-- .../models/speecht5/modeling_speecht5.py | 10 +- .../models/swin/modeling_swin.py | 2 +- .../modeling_gaudi/models/t5/modeling_t5.py | 9 +- .../models/wav2vec2/modeling_wav2vec2.py | 18 +-- 25 files changed, 253 insertions(+), 238 deletions(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py index b9bc86f1c56..47a63fa5f33 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/configuration_utils.py @@ -17,7 +17,8 @@ class GaudiGenerationConfig(GenerationConfig): """ - This class extends [`transformers.generation.GenerationConfig`](https://github.com/huggingface/transformers/blob/main/src/transformers/generation/configuration_utils.py) + This class extends [`transformers.generation.GenerationConfig`] + (https://github.com/huggingface/transformers/blob/main/src/transformers/generation/configuration_utils.py) to add HPU-specific arguments for generation. Arg: diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py index aa05cc65aba..d1748bc4c76 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py @@ -125,7 +125,6 @@ def _expand_inputs_for_generation( ) -> Tuple[torch.LongTensor, Dict[str, Any]]: """Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]. - Copied from Transformers: https://github.com/huggingface/transformers/blob/527ab894e59b6582578008e3b47648a65063f73d/src/transformers/generation/utils.py#L704 The tensor `token_idx` is not expanded. 
""" @@ -202,8 +201,8 @@ def _prepare_decoder_input_ids_for_generation( if token_idx is None: if isinstance(decoder_start_token_id, list): if len(decoder_start_token_id) != batch_size: - raise ValueError( - f"`decoder_start_token_id` expected to have length {batch_size} but got {len(decoder_start_token_id)}" + raise ValueError(f"`decoder_start_token_id` expected to have \ + length {batch_size} but got {len(decoder_start_token_id)}" ) decoder_input_ids_start = torch.tensor(decoder_start_token_id, dtype=torch.long, device=device) decoder_input_ids_start = decoder_input_ids_start.view(-1, 1) @@ -212,7 +211,8 @@ def _prepare_decoder_input_ids_for_generation( torch.ones((batch_size, 1), dtype=torch.long, device=device) * decoder_start_token_id ) else: - # creating padded decoder_input_ids to achieve static shapes. Later new tokens once generated are copied in to decoder_input_ids based on token_idx + # creating padded decoder_input_ids to achieve static shapes. + # Later new tokens once generated are copied in to decoder_input_ids based on token_idx max_length = max_new_tokens + 1 if max_new_tokens is not None else self.generation_config.max_length decoder_input_ids_start = ( torch.ones((batch_size, max_length), dtype=torch.long, device=device) * decoder_start_token_id @@ -253,8 +253,6 @@ def _update_model_kwargs_for_generation( standardize_cache_format: bool = False, ) -> Dict[str, Any]: """ - Copied from Transformers: https://github.com/huggingface/transformers/blob/527ab894e59b6582578008e3b47648a65063f73d/src/transformers/generation/utils.py#L745 - Adds support for `token_idx`, which is necessary for using static shapes. """ # mark to identify starting from second token @@ -312,7 +310,8 @@ def update_model_kwargs_for_bucketing( self, params, input_ids, model_kwargs, pad_token_id, bucket_size, reduce_recompile=False ): if params["need_expansion"]: - # Pad inputs to have static shapes during generation, this gives better performance than dynamic shapes on HPUs + # Pad inputs to have static shapes during generation, + # this gives better performance than dynamic shapes on HPUs pad_amount = params["allocated_space"] - input_ids.shape[-1] input_ids = torch.nn.functional.pad(input_ids, (0, pad_amount), value=pad_token_id) if model_kwargs["attention_mask"] is not None: @@ -360,8 +359,10 @@ def create_pad_arg(pad_amount, i, j): pad_tuple = create_pad_arg(pad_amount, i, j) # Different models might have different shapes of kv-cache # create_pad_arg handles them on a per-model basis - # This is a necessary (but not sufficient) condition: what ever dimension we are padding, should be a multiple of bucket_size - # This check is added in case we get a new model with a new kv-cache structure, and we attempt to pad some wrong dimension + # This is a necessary (but not sufficient) condition: + # what ever dimension we are padding, should be a multiple of bucket_size + # This check is added in case we get a new model with a new kv-cache structure, + # and we attempt to pad some wrong dimension assert ( model_kwargs["past_key_values"][i][j].shape[-(len(pad_tuple) // 2)] % bucket_size == 0 ) @@ -398,8 +399,9 @@ def generate( - Most generation-controlling parameters are set in [`transformers.generation.generation_config`] which, if not passed, will be set to the - model's default generation configuration. 
You can override any `generation_config` by passing the corresponding + Most generation-controlling parameters are set in [`transformers.generation.generation_config`] + which, if not passed, will be set to the model's default generation configuration. + You can override any `generation_config` by passing the corresponding parameters to generate, e.g. `.generate(inputs, num_beams=4, do_sample=True)`. For an overview of generation strategies and code examples, check out the [following @@ -470,14 +472,15 @@ def generate( specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*. Return: - [`transformers.utils.ModelOutput`] or `torch.LongTensor`: A [`transformers.generationutils.ModelOutput`] (if `return_dict_in_generate=True` + [`transformers.utils.ModelOutput`] or `torch.LongTensor`: + A [`transformers.generationutils.ModelOutput`] (if `return_dict_in_generate=True` or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. - If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible - [`transformers.generationutils.ModelOutput`] types are: + If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), + the possible [`transformers.generationutils.ModelOutput`] types are: - [`transformers.generation.GenerateDecoderOnlyOutput`], - [`transformers.generation.GenerateBeamDecoderOnlyOutput`] - If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible - [`transformers.generationutils.ModelOutput`] types are: + If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), + the possible [`transformers.generationutils.ModelOutput`] types are: - [`transformers.generation.GenerateEncoderDecoderOutput`], - [`transformers.generation.GenerateBeamEncoderDecoderOutput`] """ @@ -509,10 +512,8 @@ def generate( new_generation_config = GaudiGenerationConfig.from_model_config(self.config) if new_generation_config != self.generation_config: warnings.warn( - "You have modified the pretrained model configuration to control generation. This is a" - " deprecated strategy to control generation and will be removed soon, in a future version." - " Please use and modify the model generation configuration (see" - " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )" + "You have modified the pretrained model configuration to control generation." + "This is a deprecated strategy to control generation and will be removed in a future version." ) self.generation_config = new_generation_config generation_config = self.generation_config @@ -604,13 +605,16 @@ def generate( assert generation_config.bucket_size >= 0, "please set valid bucket_size to use bucket_internal" if generation_config.static_shapes: - # Pad inputs to have static shapes during generation, this gives better performance than dynamic shapes on HPUs + # Pad inputs to have static shapes during generation, + # this gives better performance than dynamic shapes on HPUs # In encoder_decoder models, Inputs are already padded if not self.config.is_encoder_decoder: - # only pad if bucket_size < -1. If we are bucketing (bucket_size > 0), then that is taken care in greedy_search() + # only pad if bucket_size < -1. 
If we are bucketing (bucket_size > 0), + # then that is taken care in greedy_search() if not is_greedy_or_beam_and_bucket: - # token_idx is the current index in the generation process, it is incremented each time a new token is generated + # token_idx is the current index in the generation process, + # it is incremented each time a new token is generated token_idx = inputs_tensor.shape[-1] model_kwargs["token_idx"] = torch.tensor(token_idx, device=inputs_tensor.device) model_kwargs["token_idx_cpu"] = token_idx @@ -1143,8 +1147,8 @@ def contrastive_search( profiling_steps: Optional[int] = 0, **model_kwargs, ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: - r"""Generates sequences of token ids for models with a language modeling head using **contrastive search** and can - be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + r"""Generates sequences of token ids for models with a language modeling head using **contrastive search** + and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -1231,7 +1235,11 @@ def contrastive_search( ... **input_ids, penalty_alpha=0.6, top_k=4, stopping_criteria=stopping_criteria ... ) >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['DeepMind Company is a company that focuses on the development and commercialization of artificial intelligence (AI). DeepMind’s mission is to help people understand and solve problems that are difficult to solve in the world today.\n\nIn this post, we talk about the benefits of deep learning in business and how it'] + ['DeepMind Company is a company that focuses on the development and \ + commercialization of artificial intelligence (AI). \ + DeepMind’s mission is to help people understand and solve problems \ + that are difficult to solve in the world today.\n\n\ + In this post, we talk about the benefits of deep learning in business and how it'] ``` """ @@ -1258,8 +1266,8 @@ def greedy_search( profiling_steps: Optional[int] = 0, **model_kwargs, ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: - r"""Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be - used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + r"""Generates sequences of token ids for models with a language modeling head using **greedy decoding** and + can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -1317,7 +1325,8 @@ def greedy_search( If model is an encoder-decoder model the kwargs should include `encoder_outputs`. Return: - [`transformers.generation.GenerateDecoderOnlyOutput`], [`transformers.generation.GenerateEncoderDecoderOutput`] + [`transformers.generation.GenerateDecoderOnlyOutput`], + [`transformers.generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a [`transformers.generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a [`transformers.generation.GenerateEncoderDecoderOutput`] if @@ -1667,7 +1676,8 @@ def sample( an encoder-decoder model the kwargs should include `encoder_outputs`. 
Return: - [`transformers.generation.GenerateDecoderOnlyOutput`], [`transformers.generation.GenerateEncoderDecoderOutput`] or + [`transformers.generation.GenerateDecoderOnlyOutput`], + [`transformers.generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a [`transformers.generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a [`transformers.generation.GenerateEncoderDecoderOutput`] if @@ -2001,7 +2011,8 @@ def beam_search( an encoder-decoder model the kwargs should include `encoder_outputs`. Return: - [`transformers.generation.utils.GenerateBeamDecoderOnlyOutput`], [`transformers.generation.GenerateBeamEncoderDecoderOutput`] or + [`transformers.generation.utils.GenerateBeamDecoderOnlyOutput`], + [`transformers.generation.GenerateBeamEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a [`transformers.generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a [`transformers.generation.GenerateBeamEncoderDecoderOutput`] if @@ -2590,7 +2601,8 @@ def beam_sample( an encoder-decoder model the kwargs should include `encoder_outputs`. Return: - [`transformers.generation.GenerateBeamDecoderOnlyOutput`], [`transformers.generation.GenerateBeamEncoderDecoderOutput`] or + [`transformers.generation.GenerateBeamDecoderOnlyOutput`], + [`transformers.generation.GenerateBeamEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a [`transformers.generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a [`transformers.generation.GenerateBeamEncoderDecoderOutput`] if @@ -2736,11 +2748,15 @@ def group_beam_search( model is an encoder-decoder model the kwargs should include `encoder_outputs`. Return: - [`transformers.generation.GenerateBeamDecoderOnlyOutput`], [`transformers.generation.GenerateBeamEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`transformers.generation.GenerateBeamDecoderOnlyOutput`] if [`transformers.generation.BeamSearchDecoderOnlyOutput`] if - `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a - [`transformers.generation.GenerateBeamEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. + [`transformers.generation.GenerateBeamDecoderOnlyOutput`], + [`transformers.generation.GenerateBeamEncoderDecoderOutput`] or + `torch.LongTensor`: + A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`transformers.generation.GenerateBeamDecoderOnlyOutput`] if + [`transformers.generation.BeamSearchDecoderOnlyOutput`] if + `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or + a [`transformers.generation.GenerateBeamEncoderDecoderOutput`] + if `model.config.is_encoder_decoder=True`. Examples: @@ -2883,7 +2899,8 @@ def constrained_beam_search( an encoder-decoder model the kwargs should include `encoder_outputs`. 
Return: - [`transformers.generation.utils.GenerateBeamDecoderOnlyOutput`], [`transformers.generation.GenerateBeamEncoderDecoderOutput`] or + [`transformers.generation.utils.GenerateBeamDecoderOnlyOutput`], + [`transformers.generation.GenerateBeamEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a [`transformers.generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a [`transformers.generation.GenerateBeamEncoderDecoderOutput`] if @@ -3214,8 +3231,8 @@ def assisted_decoding( - In most cases, you do not need to call [`transformers.generation.GenerationMixin.candidate_decoding`] directly. Use - generate() instead. For an overview of generation strategies and code examples, check the [following + In most cases, you do not need to call [`transformers.generation.GenerationMixin.candidate_decoding`]. + Use generate() instead. For an overview of generation strategies and code examples, check the [following guide](../generation_strategies). @@ -3225,7 +3242,8 @@ def assisted_decoding( The sequence used as a prompt for the generation. candidate_generator (`CandidateGenerator`, *optional*): A derived instance of [`CandidateGenerator`] that defines how candidate sequences are generated. For - more information, the documentation of [`CandidateGenerator`] should be read. Only one of `assistant_model` or `candidate_generator` should be passed as input to this function. + more information, the documentation of [`CandidateGenerator`] should be read. + Only one of `assistant_model` or `candidate_generator` should be passed as input to this function. assistant_model (`PreTrainedModel`, *optional*): An assistant model that can be used to accelerate generation. The assistant model must have the exact same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model @@ -3276,7 +3294,8 @@ def assisted_decoding( If model is an encoder-decoder model the kwargs should include `encoder_outputs`. 
Return: - [`transformers.generation.GenerateDecoderOnlyOutput`], [`transformers.generation.GenerateEncoderDecoderOutput`] or + [`transformers.generation.GenerateDecoderOnlyOutput`], + [`transformers.generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a [`transformers.generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a [`transformers.generation.GenerateEncoderDecoderOutput`] if diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/albert/modeling_albert.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/albert/modeling_albert.py index 6ac9b80073b..a9b84baf1ef 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/albert/modeling_albert.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/albert/modeling_albert.py @@ -33,7 +33,6 @@ def gaudi_albert_forward( return_dict: Optional[bool] = None, ) -> Union[BaseModelOutputWithPooling, Tuple]: """ - Same as https://github.com/huggingface/transformers/blob/a9eee2ffecc874df7dd635b2c6abb246fdb318cc/src/transformers/models/albert/modeling_albert.py#L689 except that mixed precision is disabled for computing: extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min """ diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/modeling_bart.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/modeling_bart.py index cab69760e15..c5b958463cf 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/modeling_bart.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/modeling_bart.py @@ -43,7 +43,6 @@ logger = logging.get_logger(__name__) -# Copied from modeling_bart.py: https://raw.githubusercontent.com/huggingface/transformers/648d0deb1dd28a5d9956e63d8cf8c18f96a6a2aa/src/transformers/models/bart/modeling_bart.py # The difference is: modified dynamic shapes to static shapes with `mark_step` for performance improvement. diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip.py index 2a31547669e..fcfe3b18c9a 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip.py @@ -30,7 +30,6 @@ def gaudi_BlipForConditionalGeneration_generate( **generate_kwargs, ) -> torch.LongTensor: """ - Copied from BlipForQuestionAnswering.generate: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip.py#L1022 The only differences are: - wrap hpu graph for each part """ @@ -83,10 +82,10 @@ def gaudi_BlipForQuestionAnswering_generate( **generate_kwargs, ) -> torch.LongTensor: """ - Copied from BlipForQuestionAnswering.generate: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip.py#L1236 The only differences are: - wrap hpu graph for each part - - torch.full add dtype=torch.int64, or else the default type is torch.float32. 
lead to coredump in embedding layer + - torch.full add dtype=torch.int64, or else the default type is torch.float32. + lead to coredump in embedding layer """ if generate_kwargs.get("hpu_graphs", True): from habana_frameworks.torch.hpu import wrap_in_hpu_graph diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip_text.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip_text.py index 386d50a3d37..9428ff068aa 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip_text.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip_text.py @@ -42,7 +42,6 @@ def gaudi_BlipTextSelfAttention_forward( token_idx: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor]: """ - Copied from BlipTextSelfAttention.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L143 The only differences are: - add token_idx """ @@ -135,7 +134,6 @@ def gaudi_BlipTextAttention_forward( token_idx: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor]: """ - Copied from BlipTextAttention.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L265 The only differences are: - add token_idx """ @@ -166,7 +164,6 @@ def gaudi_BlipTextLayer_forward( token_idx: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor]: """ - Copied from BlipTextLayer.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L333 The only differences are: - add token_idx """ @@ -220,7 +217,6 @@ def gaudi_BlipTextEncoder_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: """ - Copied from BlipTextEncoder.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L391 The only differences are: - add token_idx """ @@ -317,7 +313,6 @@ def gaudi_BlipTextModel_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: """ - Copied from BlipTextModel.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L666 The only differences are: - add token_idx """ @@ -448,7 +443,6 @@ def gaudi_BlipTextLMHead_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: """ - Copied from BlipTextLMHeadModel.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L820 The only differences are: - add token_idx """ @@ -507,7 +501,6 @@ def gaudi_BlipTextLMHead_prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, token_idx=None, **model_kwargs ): """ - Copied from BlipTextLMHeadModel.prepare_inputs_for_generation: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L910 The only differences are: - add token_idx support, add position_ids """ diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py index bc958571d64..21a569ade10 100644 --- 
a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py @@ -173,7 +173,8 @@ def gaudi_bloom_attention_forward( # change view to [batch_size, num_heads, q_length, kv_length] attention_scores = matmul_result.view(batch_size, self.num_heads, q_length, kv_length) - # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] + # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype + # - [batch_size, num_heads, q_length, kv_length] input_dtype = attention_scores.dtype attn_weights = torch.masked_fill(attention_scores, attention_mask, torch.finfo(attention_scores.dtype).min) attention_probs = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(input_dtype) @@ -334,7 +335,8 @@ def gaudi_bloom_model_forward( **deprecated_arguments, ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: if deprecated_arguments.pop("position_ids", False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` + # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to + # `False` allows to detect if users were passing explicitly `None` warnings.warn( "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" " passing `position_ids`.", @@ -529,7 +531,8 @@ def forward( are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ if deprecated_arguments.pop("position_ids", False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` + # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to + # `False` allows to detect if users were passing explicitly `None` warnings.warn( "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" " passing `position_ids`.", diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/modeling_codegen.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/modeling_codegen.py index 23fbb596890..b7d67d5d544 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/modeling_codegen.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/codegen/modeling_codegen.py @@ -42,7 +42,6 @@ def forward( Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]], ]: """ - Copied from CodeGenAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/codegen/modeling_codegen.py The only differences are: - add new args token_idx - optimize KV cache @@ -59,7 +58,7 @@ def forward( value = self._split_heads(value, self.num_attention_heads, self.head_dim, mp_num=mp_num) value = value.permute(0, 2, 1, 3) - embed_positions = self.embed_positions + embed_positions = self.embed_positions # pylint: disable=E0203 if embed_positions.device != position_ids.device: embed_positions = embed_positions.to(position_ids.device) self.embed_positions = embed_positions @@ -129,7 +128,6 @@ def gaudi_codegen_block_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: """ - Copied from CodeGenBlock.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/codegen/modeling_codegen.py The only differences are: - add new args token_idx """ @@ -175,7 +173,6 @@ def gaudi_codegen_model_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: """ - Copied from CodeGenBlock.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/codegen/modeling_codegen.py The only differences are: - add new args token_idx """ @@ -319,7 +316,6 @@ def gaudi_codegen_model_forward( class GaudiCodeGenForCausalLM(CodeGenForCausalLM): """ - Inherits from CodeGenForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/codegen/modeling_codegen.py The only differences are: - add new args token_idx - add token_idx into model_inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py index 89f92795f72..10a6e9829bb 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py @@ -82,8 +82,8 @@ def gaudi_falcon_attention_split_heads( self, fused_qkv: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ - Copied from FalconAttention._split_heads https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/falcon/modeling_falcon.py - Changing index operation of qkv[:::] to use torch.index_select to work around gradient accuracy issue and improve performance. + Changing index operation of qkv[:::] to use torch.index_select to work around gradient + accuracy issue and improve performance. 
""" if self.new_decoder_architecture: batch, seq_len, _ = fused_qkv.shape @@ -139,16 +139,10 @@ def gaudi_falcon_attention_forward( **kwargs, ): """ - Copied from FalconAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py The only differences are: - add new args token_idx and position_ids - replace F.scaled_dot_product_attention with Habana torch's version """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] # 3 x [batch_size, seq_length, num_heads, head_dim] (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) @@ -209,7 +203,9 @@ def gaudi_falcon_attention_forward( value_layer, attention_mask, 0.0, - # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1. + # The query_length > 1 is necessary to match with + # AttentionMaskConverter.to_causal_4d that does not create a + # causal mask in case query_length == 1. self.is_causal and attention_mask is None and query_length > 1, ) else: @@ -223,7 +219,9 @@ def gaudi_falcon_attention_forward( value_layer, attention_mask, 0.0, - # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1. + # The query_length > 1 is necessary to match with + # AttentionMaskConverter.to_causal_4d that does not create a causal + # mask in case query_length == 1. is_causal=self.is_causal and attention_mask is None and query_length > 1, ) # Performance improvement for HPU @@ -273,9 +271,11 @@ def gaudi_falcon_attention_forward( # change view to [batch_size, num_heads, q_length, kv_length] attention_scores = matmul_result.view(batch_size, self.num_heads, query_length, kv_length) - # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] + # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype + # - [batch_size, num_heads, q_length, kv_length] input_dtype = attention_scores.dtype - # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38` + # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` + # have a minimum value of `-3.4e+38` if input_dtype == torch.float16 or input_dtype == torch.bfloat16: attention_scores = attention_scores.to(torch.float32) @@ -319,16 +319,11 @@ def gaudi_falcon_decoder_layer_forward( **kwargs, ): """ - Copied from FalconDecoderLayer.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py + The only differences are: - add new args token_idx and position_ids - add token_idx and position_ids into attention inputs """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - residual = hidden_states if self.config.new_decoder_architecture: @@ -380,7 +375,7 @@ def gaudi_falcon_decoder_layer_forward( class GaudiFalconModel(FalconModel): """ - Inherits from FalconModel: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py + The only differences are: - add new args token_idx and position_ids - add token_idx and position_ids into decoder inputs @@ -477,7 +472,8 @@ def forward( alibi = alibi.reshape(batch_size, -1, *alibi.shape[1:]) attention_mask_2d = attention_mask - # We don't call _prepare_4d_causal_attention_mask_for_sdpa as we need to mask alibi using the 4D attention_mask untouched. + # We don't call _prepare_4d_causal_attention_mask_for_sdpa as + # we need to mask alibi using the 4D attention_mask untouched. attention_mask = _gaudi_prepare_4d_causal_attention_mask( attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length ) @@ -492,8 +488,8 @@ def forward( torch.finfo(alibi.dtype).min, ) - # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with the memory-efficient attention backend - # produces nans if sequences are completely unattended in the attention mask. Details: https://github.com/pytorch/pytorch/issues/110213 + # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with + # the memory-efficient attention backend if seq_length > 1: attention_mask = GaudiAttentionMaskConverter._unmask_unattended( attention_mask, attention_mask_2d, unmasked_value=0.0 @@ -571,7 +567,7 @@ def forward( class GaudiFalconForCausalLM(FalconForCausalLM): """ - Inherits from FalconForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py + The only differences are: - add new args token_idx and position_ids - add token_idx and position_ids into model inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/modeling_gpt2.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/modeling_gpt2.py index 793b79fc0fb..c23734947e3 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/modeling_gpt2.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/modeling_gpt2.py @@ -22,7 +22,7 @@ class GaudiGPT2Attention(GPT2Attention): """ - Copied from GPT2Attention: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py + The only differences are: - optimize KV cache """ @@ -195,7 +195,7 @@ def gaudi_gpt2_block_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: """ - Copied from GPT2Block.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py + The only differences are: - add new args token_idx """ @@ -272,7 +272,7 @@ def gaudi_gpt2_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: """ - Copied from GPT2Model.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py + The only differences are: - disable autocast for attention_mask - add new args token_idx @@ -456,7 +456,7 @@ def gaudi_gpt2_forward( class GaudiGPT2LMHeadModel(GPT2LMHeadModel): """ - Copied from GPT2LMHeadModel: 
https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py + The only differences are: - add new args token_idx """ diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py index b7874bf8ca6..48afcea668b 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -39,7 +39,7 @@ def gaudi_gpt_bigcode_attention_forward( Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]], ]: """ - Copied from GPTBigCodeAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py + The only differences are: - add new args token_idx - optimize KV cache @@ -109,7 +109,7 @@ def gaudi_gpt_bigcode_block_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: """ - Copied from GPTBigCodeBlock.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py + The only differences are: - add new args token_idx """ @@ -183,7 +183,7 @@ def gaudi_gpt_bigcode_model_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: """ - Copied from GPTBigCodeModel.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py + The only differences are: - add new args token_idx - if token_idx and past_key_values are passed, set self_attention_mask based on the static shape of past_key_values @@ -259,12 +259,13 @@ def gaudi_gpt_bigcode_model_forward( if query_length > 1 and attention_mask is not None: # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with the memory-efficient attention backend - # produces nans if sequences are completely unattended in the attention mask. Details: https://github.com/pytorch/pytorch/issues/110213 + # self_attention_mask = GaudiAttentionMaskConverter._unmask_unattended( self_attention_mask, attention_mask, unmasked_value=True ) - # SDPA with a custom mask is much faster in fp16/fp32 dtype rather than bool. Cast here to floating point instead of at every layer. + # SDPA with a custom mask is much faster in fp16/fp32 dtype rather than bool. + # Cast here to floating point instead of at every layer. 
dtype = self.wte.weight.dtype self_attention_mask = torch.where( self_attention_mask, @@ -371,7 +372,7 @@ def gaudi_gpt_bigcode_model_forward( class GaudiGPTBigCodeForCausalLM(GPTBigCodeForCausalLM): """ - Inherits from GPTBigCodeForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py + The only differences are: - add new args token_idx - add token_idx into model_inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/modeling_gpt_neox.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/modeling_gpt_neox.py index 161d0ac2e20..b65e03c23a4 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/modeling_gpt_neox.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/modeling_gpt_neox.py @@ -40,7 +40,7 @@ def gaudi_gpt_neox_attention_forward( token_idx: Optional[torch.Tensor] = None, ): """ - Copied from GPTNeoXAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py + The only differences are: - add new args token_idx - optimize KV cache @@ -124,7 +124,7 @@ def gaudi_gpt_neox_layer_forward( token_idx: Optional[torch.Tensor] = None, ): """ - Copied from GPTNeoxLayer.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py + The only differences are: - add new args token_idx """ @@ -180,7 +180,7 @@ def gaudi_gpt_neox_model_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: """ - Copied from GPTNeoxModel.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py + The only differences are: - add new args token_idx """ @@ -306,7 +306,7 @@ def gaudi_gpt_neox_model_forward( class GaudiGPTNeoXForCausalLM(GPTNeoXForCausalLM): """ - Inherits from GPTNeoXForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt_neox/modeling_gpt_neox.py + The only differences are: - add new args token_idx - add token_idx into model_inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py index a4e279766b1..0c53554fc35 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py @@ -87,7 +87,7 @@ def forward( Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]], ]: """ - Copied from GPTJAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gptj/modeling_gptj.py + The only differences are: - add new args token_idx - remove is_torch_fx_proxy @@ -108,7 +108,9 @@ def forward( q_rot = query[:, :, :, : self.rotary_dim] q_pass = query[:, :, :, self.rotary_dim :] - # Note: it appears that if we use bf16 RoPE(whether use fused kernel or not), there could be acc issue, hence use fp32 RoPE here Fused kernel feasibility needs to be confirmed in the future + # Note: it appears that if we use bf16 RoPE(whether use fused kernel or not), + # there could be acc issue, hence use fp32 RoPE here Fused kernel feasibility + # needs to be confirmed in the 
future k_rot = apply_rotary_pos_emb(k_rot.to(torch.float32), sin, cos).to(torch.bfloat16) q_rot = apply_rotary_pos_emb(q_rot.to(torch.float32), sin, cos).to(torch.bfloat16) @@ -135,8 +137,8 @@ def forward( value = torch.cat([past_value, value], dim=-2) if use_cache is True: - # Note that this cast is quite ugly, but is not implemented before ROPE as the original codebase keeps the key in float32 all along the computation. - # Reference: https://github.com/kingoflolz/mesh-transformer-jax/blob/f8315e3003033b23f21d78361b288953064e0e76/mesh_transformer/layers.py#L128 + # Note that this cast is quite ugly, but is not implemented before ROPE as the + # original codebase keeps the key in float32 all along the computation. present = (key.to(hidden_states.dtype), value) else: present = None @@ -169,7 +171,7 @@ def gaudi_gptj_block_forward( cos: Optional[torch.Tensor] = None, ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: """ - Copied from GPTJBlock.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gptj/modeling_gptj.py + The only differences are: - add new args token_idx - pass sin and cos from upper level as they are identical for each attn block @@ -220,7 +222,7 @@ def gaudi_gptj_model_forward( cos: Optional[torch.Tensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: """ - Copied from GPTJModel.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gptj/modeling_gptj.py + The only differences are: - add new args token_idx - pass sin and cos from upper level as they are identical for each attn block @@ -401,7 +403,7 @@ def gaudi_gptj_model_forward( class GaudiGPTJForCausalLM(GPTJForCausalLM): """ - Inherits from GPTJForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gptj/modeling_gptj.py + The only differences are: - add new args token_idx - add token_idx into model_inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py index 8fb9ef66ee2..665c385758f 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py @@ -64,7 +64,7 @@ def gaudi_llama_rmsnorm_forward(self, hidden_states): """ - Copied from LlamaRMSNorm.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py + The only differences are: - override RMSNorm with Habana fused RMSNorm """ @@ -128,12 +128,16 @@ def gaudi_llama_repeat_kv( n_rep: int, ): """ - Copied from repeat_kv: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py + The only differences are: - - Append num_key_value_heads == 1 check as kv states can be broadcasted during matmuls so need to expand and reshape them. - - Add new args query_states, key_states, value_states and attention_mask and update the logic for expansion. 
- The query states go from (batch, num_heads, seqlen, head_dim) to (batch, num_key_value_heads, n_rep, seqlen, head_dim) - The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads, 1, seqlen, head_dim) + - Append num_key_value_heads == 1 check as kv states can be broadcasted during + matmuls so need to expand and reshape them. + - Add new args query_states, key_states, value_states and attention_mask and + update the logic for expansion. + The query states go from (batch, num_heads, seqlen, head_dim) to + (batch, num_key_value_heads, n_rep, seqlen, head_dim) + The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to + (batch, num_key_value_heads, 1, seqlen, head_dim) """ batch, num_key_value_heads, kv_len, head_dim = key_states.shape if n_rep == 1 or num_key_value_heads == 1: @@ -295,7 +299,7 @@ def update_sincos_cache(self, seq_len): # Call rotary emb forward() to update cos/sin cache when inferring more than self.max_position_embeddings # This helps in avoiding creation of these caches during actual model forward pass and # reduce memory consumption and improve performance. - if seq_len > self.max_position_embeddings: + if seq_len > self.max_position_embeddings: # pylint: disable=E0203 self.max_position_embeddings = seq_len _, _ = self.rotary_emb(self.k_proj.weight, seq_len=seq_len) @@ -333,7 +337,7 @@ def pre_attn_forward( **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """ - Copied from LlamaAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py + The only differences are: - add new args token_idx - optimize KV cache @@ -368,7 +372,6 @@ def pre_attn_forward( key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # TODO: update when auto mp params is enabled in DeepSpeed (cf. https://github.com/HabanaAI/DeepSpeed/blob/94309c7b5dfc1a69858f5c9f25737b2f81a332a5/deepspeed/module_inject/replace_module.py#L440) key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) @@ -489,7 +492,7 @@ def post_attn_forward(self, attn_output): class GaudiLlamaDecoderLayer(LlamaDecoderLayer): def __init__(self, config: LlamaConfig, layer_idx: int): - super(LlamaDecoderLayer, self).__init__() + super(GaudiLlamaDecoderLayer, self).__init__() self.hidden_size = config.hidden_size self.self_attn = GaudiLlamaAttention(config=config, layer_idx=layer_idx) @@ -527,7 +530,7 @@ def forward( **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ - Copied from LlamaDecoderLayer.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py + The only differences are: - add new args token_idx - add new args attn_softmax_bf16 @@ -537,11 +540,6 @@ def forward( - add new arg flash_attention_causal_mask - add new arg cache_prune_num for attention_sinks """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - residual = hidden_states hidden_states, self_attn_weights, present_key_value = self.pre_attn( hidden_states, @@ -642,16 +640,16 @@ def post_mlp(self, hidden_states, residual): class GaudiLlamaModel(LlamaModel): """ - Copied from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L909 + """ def __init__(self, config: LlamaConfig): """ - Copied from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L917 + 1. set fill_value to 1 instead of True 2. add device=self.device """ - super(LlamaModel, self).__init__(config) + super(GaudiLlamaModel, self).__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -663,7 +661,6 @@ def __init__(self, config: LlamaConfig): self.gradient_checkpointing = False # Register a causal mask to separate causal and padding mask creation. Merging happens in the attention class. - # NOTE: This is not friendly with TorchScript, ONNX, ExportedProgram serialization for very large `max_position_embeddings`. causal_mask = torch.full( (config.max_position_embeddings, config.max_position_embeddings), fill_value=1, @@ -707,7 +704,7 @@ def forward( cache_prune_num: int = 0, ) -> Union[Tuple, BaseModelOutputWithPast]: """ - Copied from LlamaModel.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py + The only differences are: - add new args token_idx - add new args attn_softmax_bf16 @@ -876,7 +873,7 @@ def forward( class GaudiLlamaForCausalLM(LlamaForCausalLM): """ - Inherits from LlamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py + The only differences are: - add new args token_idx - add token_idx into model_inputs @@ -1015,18 +1012,21 @@ def prepare_inputs_for_generation( max_cache_length = None # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) + # 1 - If the length of the attention_mask exceeds the length of input_ids, + # then we are in a setting where + # some of the inputs are exclusively passed as part of the cache + # (e.g. when passing input_embeds as input) if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. + # 2 - If the past_length is smaller than input_ids', + # then input_ids holds all input tokens. We can discard input_ids based on the past_length. elif past_length < input_ids.shape[1]: input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + # 3 - Otherwise (past_length >= input_ids.shape[1]), + # let's assume input_ids only has unprocessed tokens. - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + # If we are about to go beyond the maximum cache length, + # we need to crop the input attention mask. 
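+ # A small worked example of the three cases above (illustrative only; the
+ # concrete numbers are assumptions, not taken from this patch):
+ #   past_length = 4, input_ids holds all 6 tokens, attention_mask covers 6 tokens
+ #     -> case 1 is skipped (6 > 6 is False) and case 2 keeps input_ids[:, 4:],
+ #        i.e. the 2 tokens that have not been processed yet.
+ #   past_length = 4, input_ids holds only the 2 new tokens (the prompt was passed
+ #   as inputs_embeds), attention_mask covers 6 tokens
+ #     -> case 1 keeps input_ids[:, -(6 - 4):], i.e. the same 2 new tokens.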
if ( max_cache_length is not None and attention_mask is not None @@ -1034,7 +1034,8 @@ def prepare_inputs_for_generation( ): attention_mask = attention_mask[:, -max_cache_length:] elif reuse_cache and token_idx is not None: - # With reuse_cache, KV cache is pre allocated hence for the 1st token we can slice the inputs till token idx for the fwd pass + # With reuse_cache, KV cache is pre allocated hence for the 1st token + # we can slice the inputs till token idx for the fwd pass input_ids = input_ids[:, :token_idx] attention_mask = attention_mask[:, :token_idx] @@ -1093,7 +1094,6 @@ def prepare_inputs_for_generation( model_inputs = {"inputs_embeds": inputs_embeds} else: # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise - # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114 # TODO: use `next_tokens` directly instead. model_inputs = {"input_ids": input_ids.contiguous()} @@ -1154,5 +1154,5 @@ def apply_customized_rope(q, k, cos, sin, position_ids): k, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids ) else: - # keep the same implementation as Transformers v4.37.2 + return apply_rotary_pos_emb(q, k, cos[position_ids], sin[position_ids]) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py index b7e0932b5a9..ec82b896544 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py @@ -68,7 +68,7 @@ def update(prev, cur, dim, idx): def gaudi_mistral_rmsnorm_forward(self, hidden_states): """ - Copied from MistralRMSNorm.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral/modeling_mistral.py + The only differences are: - override RMSNorm with Habana fused RMSNorm """ @@ -97,12 +97,15 @@ def gaudi_mistral_repeat_kv( n_rep: int, ): """ - Copied from repeat_kv: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/mistral/modeling_mistral.py + The only differences are: - - Append num_key_value_heads == 1 check as kv states can be broadcasted during matmuls so need to expand and reshape them. + - Append num_key_value_heads == 1 check as kv states can be broadcasted during + matmuls so need to expand and reshape them. - Add new args query_states, key_states, value_states and attention_mask and update the logic for expansion. 
- The query states go from (batch, num_heads, seqlen, head_dim) to (batch, num_key_value_heads, n_rep, seqlen, head_dim) - The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads, 1, seqlen, head_dim) + The query states go from (batch, num_heads, seqlen, head_dim) to + (batch, num_key_value_heads, n_rep, seqlen, head_dim) + The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to + (batch, num_key_value_heads, 1, seqlen, head_dim) """ batch, num_key_value_heads, kv_len, head_dim = key_states.shape if n_rep == 1 or num_key_value_heads == 1: @@ -122,7 +125,6 @@ def gaudi_mistral_repeat_kv( return query_states, key_states, value_states, attention_mask - class GaudiMistralAttention(MistralAttention): def __init__(self, config: MistralConfig, layer_idx: Optional[int] = None): super().__init__(config, layer_idx) @@ -141,7 +143,7 @@ def update_sincos_cache(self, seq_len): # Call rotary emb forward() to update cos/sin cache when inferring more than self.max_position_embeddings # This helps in avoiding creation of these caches during actual model forward pass and # reduce memory consumption and improve performance. - if seq_len > self.max_position_embeddings: + if seq_len > self.max_position_embeddings: # pylint: disable=E0203 self.max_position_embeddings = seq_len _, _ = self.rotary_emb(self.k_proj.weight, seq_len=seq_len) @@ -174,7 +176,7 @@ def forward( **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """ - Copied from MistralAttention.forward: https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/mistral/modeling_mistral.py + The only differences are: - add new args token_idx - add new args reuse_cache @@ -251,8 +253,8 @@ def forward( if attention_mask is not None: if attention_mask.size() not in [(bsz, 1, q_len, kv_seq_len), (bsz, 1, 1, q_len, kv_seq_len)]: raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)} or {(bsz, 1, 1, q_len, kv_seq_len)}," - f" but is {attention_mask.size()}" + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)} or " + f"{(bsz, 1, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" ) attn_weights = attn_weights + attention_mask @@ -312,7 +314,7 @@ def forward( **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ - Copied from MistralDecoderLayer.forward: https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/mistral/modeling_mistral.py + The only differences are: - add new args token_idx """ @@ -378,7 +380,7 @@ def forward( attn_softmax_bf16: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutputWithPast]: """ - Copied from MistralModel.forward: https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/mistral/modeling_mistral.py + The only differences are: - add new args token_idx """ @@ -546,7 +548,7 @@ def forward( attn_softmax_bf16: Optional[bool] = False, ) -> Union[Tuple, CausalLMOutputWithPast]: """ - Inherits from MistralForCausalLM: https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/mistral/modeling_mistral.py + The only differences are: - add new args token_idx """ @@ -612,7 +614,7 @@ def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs ): """ - Inherits from MistralForCausalLM: 
https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/mistral/modeling_mistral.py + The only differences are: - add new args token_idx - add token_idx into model_inputs @@ -633,18 +635,22 @@ def prepare_inputs_for_generation( max_cache_length = None # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) + # 1 - If the length of the attention_mask exceeds the length of input_ids, + # then we are in a setting where + # some of the inputs are exclusively passed as part of the cache + # (e.g. when passing input_embeds as input) if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # 2 - If the past_length is smaller than input_ids', + # then input_ids holds all input tokens. We can discard # input_ids based on the past_length. elif past_length < input_ids.shape[1]: input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + # 3 - Otherwise (past_length >= input_ids.shape[1]), + # let's assume input_ids only has unprocessed tokens. - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + # If we are about to go beyond the maximum cache length, + # we need to crop the input attention mask. if ( max_cache_length is not None and attention_mask is not None diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py index 9e56ed4de8a..6c7b2212653 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py @@ -94,7 +94,7 @@ def apply_customized_rope(q, k, cos, sin, position_ids): def gaudi_mixtral_rmsnorm_forward(self, hidden_states): """ - Copied from MixtralRMSNorm.forward: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py + The only differences are: - override RMSNorm with Habana fused RMSNorm """ @@ -123,12 +123,16 @@ def gaudi_mixtral_repeat_kv( n_rep: int, ): """ - Copied from repeat_kv: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py + The only differences are: - - Append num_key_value_heads == 1 check as kv states can be broadcasted during matmuls so need to expand and reshape them. - - Add new args query_states, key_states, value_states and attention_mask and update the logic for expansion. - The query states go from (batch, num_heads, seqlen, head_dim) to (batch, num_key_value_heads, n_rep, seqlen, head_dim) - The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads, 1, seqlen, head_dim) + - Append num_key_value_heads == 1 check as kv states can be broadcasted during matmuls + so need to expand and reshape them. + - Add new args query_states, key_states, value_states and attention_mask and + update the logic for expansion. 
+ The query states go from (batch, num_heads, seqlen, head_dim) to + (batch, num_key_value_heads, n_rep, seqlen, head_dim) + The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to + (batch, num_key_value_heads, 1, seqlen, head_dim) """ batch, num_key_value_heads, kv_len, head_dim = key_states.shape if n_rep == 1 or num_key_value_heads == 1: @@ -188,15 +192,11 @@ def gaudi_mixtral_attention_forward( **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """ - Copied from MixtralAttention.forward: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py + The only differences are: - add new args token_idx - optimize KV cache """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) bsz, q_len, _ = hidden_states.size() query_states = self.q_proj(hidden_states) @@ -279,7 +279,7 @@ def gaudi_mixtral_attention_forward( def gaudi_mixtral_block_sparse_moe_forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ - Copied from MixtralSparseMoeBlock.forward: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py + The only differences are: - optimize expert forward, remove dynamic control and dynamic shape """ @@ -339,15 +339,10 @@ def gaudi_mixtral_decoder_layer_forward( **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ - Copied from MixtralDecoderLayer.forward: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py + The only differences are: - add new args token_idx """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - htcore.mark_step() residual = hidden_states @@ -402,7 +397,7 @@ def gaudi_mixtral_model_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, MoeModelOutputWithPast]: """ - Copied from MixtralModel.forward: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py#L1069 + The only differences are: - add new args token_idx """ @@ -558,7 +553,7 @@ def gaudi_mixtral_model_forward( class GaudiMixtralForCausalLM(MixtralForCausalLM): """ - Inherits from MixtralForCausalLM: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py#L1231 + The only differences are: - add new args token_idx - add token_idx into model_inputs @@ -667,16 +662,19 @@ def prepare_inputs_for_generation( max_cache_length = None # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) + # 1 - If the length of the attention_mask exceeds the length of input_ids, + # then we are in a setting where + # some of the inputs are exclusively passed as part of the cache + # (e.g. when passing input_embeds as input) if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard + # 2 - If the past_length is smaller than input_ids', + # then input_ids holds all input tokens. We can discard # input_ids based on the past_length. elif past_length < input_ids.shape[1]: input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + # 3 - Otherwise (past_length >= input_ids.shape[1]), + # let's assume input_ids only has unprocessed tokens. # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. if ( diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_all_models.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_all_models.py index c95284cafd5..8f8d1825ccc 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_all_models.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_all_models.py @@ -24,7 +24,6 @@ def gaudi_invert_attention_mask(self, encoder_attention_mask: torch.Tensor) -> torch.Tensor: """ - Same as https://github.com/huggingface/transformers/blob/a9eee2ffecc874df7dd635b2c6abb246fdb318cc/src/transformers/modeling_utils.py#L640 except that mixed precision is disabled for computing: encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min """ @@ -53,7 +52,6 @@ def gaudi_get_extended_attention_mask( self, attention_mask: torch.Tensor, input_shape: Tuple[int], device: torch.device = None, dtype: torch.float = None ) -> torch.Tensor: """ - Same as https://github.com/huggingface/transformers/blob/a9eee2ffecc874df7dd635b2c6abb246fdb318cc/src/transformers/modeling_utils.py#L692 except that mixed precision is disabled for computing: extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min """ @@ -101,7 +99,6 @@ def gaudi_get_extended_attention_mask( def gaudi_conv1d_forward(self, x): """ - Same as https://github.com/huggingface/transformers/blob/3335724376319a0c453049d0cd883504f530ff52/src/transformers/pytorch_utils.py#L100 but moves reshape before view for tpc auto fusion. """ size_out = x.size()[:-1] + (self.nf,) @@ -125,13 +122,16 @@ def gaudi_check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> P return config # Otherwise, fallback to original implementation - # https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/modeling_utils.py#L1542 + if hard_check_only: if not cls._supports_sdpa: raise ValueError( - f"{cls.__name__} does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention yet." - " Please request the support for this architecture: https://github.com/huggingface/transformers/issues/28005. If you believe" - ' this error is a bug, please open an issue in Transformers GitHub repository and load your model with the argument `attn_implementation="eager"` meanwhile. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`' + f"{cls.__name__} does not support an attention implementation through " + "torch.nn.functional.scaled_dot_product_attention yet." + " Please request support: https://github.com/huggingface/transformers/issues/28005. " + 'this error is a bug, please open an issue in Transformers GitHub repository.' + 'load your model with the argument `attn_implementation="eager"` meanwhile.' 
+ ' Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`' ) if not is_torch_sdpa_available(): raise ImportError("PyTorch SDPA requirements in Transformers are not met. Please install torch>=2.1.1.") diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_attn_mask_utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_attn_mask_utils.py index fe776330131..58d542b48e7 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_attn_mask_utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_attn_mask_utils.py @@ -21,10 +21,9 @@ @dataclass class GaudiAttentionMaskConverter(AttentionMaskConverter): """ - Adapted from: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/modeling_attn_mask_utils.py#L21 - Differences: - - replace `triu` with similar logic here: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/modeling_attn_mask_utils.py#L169 + - replace `triu` with similar logic here: + """ @staticmethod @@ -70,8 +69,6 @@ def _gaudi_prepare_4d_causal_attention_mask( sliding_window: Optional[int] = None, ): """ - Adapted from: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/modeling_attn_mask_utils.py#L278 - Differences: - replace `AttentionMaskConverter` by `GaudiAttentionMaskConverter` """ diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/modeling_mpt.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/modeling_mpt.py index 294371700b8..c1ea826fd18 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/modeling_mpt.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/modeling_mpt.py @@ -39,7 +39,7 @@ def gaudi_mpt_attention_forward( token_idx: Optional[torch.Tensor] = None, ): """ - Copied from MptAttention.forward: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py + The only differences are: - add new args token_idx - optimize KV cache @@ -112,7 +112,7 @@ def gaudi_mpt_block_forward( token_idx: Optional[torch.Tensor] = None, ): """ - Copied from MptBlock.forward: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py + The only differences are: - add new args token_idx """ @@ -165,7 +165,7 @@ def forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: """ - Copied from MptModel.forward: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py + The only differences are: - add new args token_idx """ @@ -284,7 +284,7 @@ def prepare_inputs_for_generation( **kwargs, ) -> dict: """ - Inherits from MptForCausalLM: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py + The only differences are: - add new args token_idx - add token_idx into model_inputs @@ -336,7 +336,7 @@ def forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: """ - Inherits from MptForCausalLM: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py + The only differences are: - add new args token_idx """ diff --git 
a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/modeling_opt.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/modeling_opt.py index a670eebdaa6..1de0168e4fe 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/modeling_opt.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/modeling_opt.py @@ -24,7 +24,7 @@ class GaudiOPTLearnedPositionalEmbedding(OPTLearnedPositionalEmbedding): """ - Inherits from OPTLearnedPositionalEmbedding: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py + The only differences are: - add new args token_idx - compute embedding using token_idx if past_key_values_length not 0 @@ -44,7 +44,8 @@ def forward( positions = positions[:, past_key_values_length:] return torch.nn.Embedding.forward(self, positions + self.offset) else: - # if not 0, kv cache is enabled and from step = 2, past_key_values_length is equal to the final length of outputs + # if not 0, kv cache is enabled and from step = 2, + # past_key_values_length is equal to the final length of outputs return torch.nn.Embedding.forward(self, token_idx + self.offset) @@ -59,7 +60,7 @@ def gaudi_opt_attention_forward( token_idx: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """ - Copied from OPTAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py + The only differences are: - add new args token_idx - optimize KV cache @@ -187,7 +188,7 @@ def gaudi_opt_decoder_layer_forward( token_idx: Optional[torch.Tensor] = None, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ - Copied from OPTDecoderLayer.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py + The only differences are: - add new args token_idx """ @@ -259,7 +260,7 @@ def gaudi_opt_decoder_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: """ - Copied from OPTDecoder.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py + The only differences are: - add new args token_idx - update calculation of mask_seq_length @@ -412,7 +413,7 @@ def gaudi_opt_model_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: """ - Copied from OPTModel.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py + The only differences are: - add new args token_idx """ @@ -450,7 +451,7 @@ def gaudi_opt_model_forward( class GaudiOPTForCausalLM(OPTForCausalLM): """ - Inherits from OPTForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py + The only differences are: - add new args token_idx - add token_idx into model_inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/modeling_phi.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/modeling_phi.py index a59aadc0505..f5742529d86 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/modeling_phi.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/modeling_phi.py @@ -50,7 +50,7 @@ def gaudi_phi_attention_forward( **kwargs, ) -> 
Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """ - Copied from PhiAttention.forward: https://github.com/huggingface/transformers/blob/v4.37.1/src/transformers/models/phi/modeling_phi.py + The only differences are: - add new args token_idx """ @@ -170,7 +170,7 @@ def gaudi_phi_decoder_layer_forward( **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ - Copied from PhiDecoderLayer.forward: https://github.com/huggingface/transformers/blob/v4.37.1/src/transformers/models/phi/modeling_phi.py + The only differences are: - add new args token_idx """ @@ -218,7 +218,7 @@ def gaudi_phi_model_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: """ - Copied from PhiModel.forward: https://github.com/huggingface/transformers/blob/v4.37.1/src/transformers/models/phi/modeling_phi.py + The only differences are: - add new args token_idx """ @@ -346,7 +346,7 @@ def forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, CausalLMOutputWithPast]: """ - Inherits from PhiForCausalLM: https://github.com/huggingface/transformers/blob/v4.37.1/src/transformers/models/phi/modeling_phi.py + The only differences are: - add new args token_idx """ @@ -404,7 +404,7 @@ def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs ): """ - Inherits from PhiForCausalLM: https://github.com/huggingface/transformers/blob/v4.37.1/src/transformers/models/phi/modeling_phi.py + The only differences are: - add new args token_idx - add token_idx into model_inputs @@ -425,18 +425,22 @@ def prepare_inputs_for_generation( max_cache_length = None # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) + # 1 - If the length of the attention_mask exceeds the length of input_ids, + # then we are in a setting where + # some of the inputs are exclusively passed as part of the cache + # (e.g. when passing input_embeds as input) if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # 2 - If the past_length is smaller than input_ids', + # then input_ids holds all input tokens. We can discard # input_ids based on the past_length. elif past_length < input_ids.shape[1]: input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + # 3 - Otherwise (past_length >= input_ids.shape[1]), + # let's assume input_ids only has unprocessed tokens. - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + # If we are about to go beyond the maximum cache length, + # we need to crop the input attention mask. 
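+ # Why token_idx is threaded into these model_inputs (illustrative sketch only;
+ # the exact cache-update call below is an assumption, not code from this patch):
+ # with a KV cache pre-allocated to its maximum length, e.g. of shape
+ # (batch, num_heads, max_seq_len, head_dim), a one-element LongTensor token_idx
+ # marking the current write position lets each decode step update the cache in
+ # place with static shapes, roughly:
+ #     past_key.index_copy_(2, token_idx - 1, key_states)
+ #     past_value.index_copy_(2, token_idx - 1, value_states)
+ # so no tensor ever changes size and the compiled HPU graph can be reused.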
if ( max_cache_length is not None and attention_mask is not None diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py index 138d7d234ac..72d137b3437 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py @@ -37,7 +37,7 @@ def gaudi_SpeechT5SpeechDecoderPrenet_forward( speaker_embeddings: Optional[torch.Tensor] = None, ): """ - Copied from SpeechT5SpeechDecoderPrenet.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/speecht5/modeling_speecht5.py + The only differences are: - disable dropout in inference, or else hpu graph could not be used """ @@ -72,7 +72,7 @@ def gaudi_SpeechT5Attention_forward( token_idx: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """ - Copied from SpeechT5Attention.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/speecht5/modeling_speecht5.py + The only differences are: - add new args token_idx """ @@ -209,7 +209,7 @@ def gaudi_SpeechT5DecoderLayer_forward( token_idx: Optional[torch.Tensor] = None, ): """ - Copied from SpeechT5DecoderLayer.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/speecht5/modeling_speecht5.py + The only differences are: - add token_idx in self-attention """ @@ -285,7 +285,7 @@ def gaudi_SpeechT5Decoder_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: """ - Copied from SpeechT5Decoder.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/speecht5/modeling_speecht5.py + The only differences are: - add token_idx args - use _gaudi_prepare_4d_causal_attention_mask @@ -420,7 +420,7 @@ def gaudi_generate_speech( return_output_lengths: bool = False, ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, torch.FloatTensor]]: """ - Copied from _generate_speech: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/speecht5/modeling_speecht5.py + The only differences are: - add hpu graph wrap - add static shape support in kv-cache in _generate_speech diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/modeling_swin.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/modeling_swin.py index 9ea3b9d28cb..8dc5066d2e2 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/modeling_swin.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/modeling_swin.py @@ -20,7 +20,7 @@ def gaudi_swin_get_attn_mask(self, height, width, dtype): """ - Copied from SwinLayer.get_attn_mask : https://github.com/huggingface/transformers/blob/main/src/transformers/models/swin/modeling_swin.py + The only difference is moving img_mask to hpu for performance """ if self.shift_size > 0: diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py index fddc9580cce..bff64e72d74 100644 --- 
a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py @@ -41,7 +41,7 @@ def gaudi_t5_layernorm_forward(self, hidden_states): """ - Copied from T5LayerNorm.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py + The only differences are: - override RMSNorm with Habana fused RMSNorm """ @@ -313,7 +313,7 @@ def gaudi_T5Block_forward( else: outputs = outputs + attention_outputs - return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + return outputs def gaudi_T5Stack_forward( @@ -452,7 +452,8 @@ def gaudi_T5Stack_forward( ) # layer_outputs is a tuple with: - # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + # hidden-states, key-value-states, (self-attention position bias), + # (self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] @@ -576,7 +577,6 @@ def gaudi_T5ForConditionalGeneration_forward( if self.config.tie_word_embeddings: # Rescale output before projecting on vocab - # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 sequence_output = sequence_output * (self.model_dim**-0.5) lm_logits = self.lm_head(sequence_output) @@ -587,7 +587,6 @@ def gaudi_T5ForConditionalGeneration_forward( # move labels to correct device to enable PP labels = labels.to(lm_logits.device) loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) - # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 if not return_dict: output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py index 5c05d65065a..7a84045904c 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py @@ -32,7 +32,7 @@ def _gaudi_wav2vec2_compute_mask_indices( min_masks: int = 0, ) -> torch.Tensor: """ - Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L135 + The only difference is that the processing is performed with PyTorch on HPUs (Numpy is used in Transformers). """ batch_size, sequence_length = shape @@ -135,7 +135,7 @@ def _gaudi_wav2vec2_sample_negative_indices( features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[torch.Tensor] = None ): """ - Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L254 + The only difference is that the processing is performed with PyTorch on HPUs (Numpy is used in Transformers). 
""" batch_size, sequence_length = features_shape @@ -179,8 +179,9 @@ def _gaudi_wav2vec2_mask_hidden_states( attention_mask: Optional[torch.LongTensor] = None, ): """ - Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L1227 - Differences are that (1) `mask_time_indices` is not moved to the current device and converted into boolean because this is already done in _compute_mask_indices. + + Differences are that (1) `mask_time_indices` is not moved to the current device and + converted into boolean because this is already done in _compute_mask_indices. (2) index_put operation on hidden_states is replaced by combination of simpler ops (more suitable for HPU graphs) """ @@ -203,7 +204,8 @@ def _gaudi_wav2vec2_mask_hidden_states( min_masks=self.config.mask_time_min_masks, ) # replacement of index_put with combination of simpler ops. Assumption made about sizes of hidden_states (3d), - # mask_time_indices (2d), self.masked_spec_embed (1d), for any other combination better to go back to original code using index_put. + # mask_time_indices (2d), self.masked_spec_embed (1d), + # for any other combination better to go back to original code using index_put. # hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) inverse_mask_time_indices = torch.bitwise_not(mask_time_indices) hidden_states = hidden_states * inverse_mask_time_indices.unsqueeze(2) + self.masked_spec_embed.to( @@ -234,7 +236,7 @@ def gaudi_wav2vec2_encoder_forward( return_dict: bool = True, ): """ - Copied from Transformers: https://github.com/huggingface/transformers/blob/7790943c91411f4234d11dfbf4c2f21ce7caf088/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L755 + The only difference is that torch.rand device is set to 'hpu' (required to capture operation as part of HPU graph) """ all_hidden_states = () if output_hidden_states else None @@ -310,7 +312,7 @@ def gaudi_wav2vec2_forward( return_dict: Optional[bool] = None, ) -> Union[Tuple, Wav2Vec2BaseModelOutput]: """ - Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L1282 + The only difference is that a clone of `hidden_states` is given to _mask_hidden_states to avoid an error. """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -359,7 +361,7 @@ def gaudi_wav2vec2_forward( def gaudi_wav2vec2_tdnnlayer_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: """ - Copied from Transformers: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L2290 + v4.38.2 implementation caused accuracy issue to run pytest Wav2Vec2RobustModelTest. 
""" hidden_states = hidden_states.unsqueeze(1) From de64700b6affeeed5f4aecd9c02a11825a23aa59 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 22 May 2024 02:42:57 +0000 Subject: [PATCH 13/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../modeling_gaudi/generation/utils.py | 34 +++++++++---------- .../models/bloom/modeling_bloom.py | 4 +-- .../models/falcon/modeling_falcon.py | 20 +++++------ .../models/gpt2/modeling_gpt2.py | 8 ++--- .../gpt_bigcode/modeling_gpt_bigcode.py | 10 +++--- .../models/gpt_neox/modeling_gpt_neox.py | 8 ++--- .../models/gptj/modeling_gptj.py | 8 ++--- .../models/llama/modeling_llama.py | 28 +++++++-------- .../models/mistral/modeling_mistral.py | 22 ++++++------ .../models/mixtral/modeling_mixtral.py | 14 ++++---- .../models/modeling_all_models.py | 4 +-- .../models/modeling_attn_mask_utils.py | 2 +- .../modeling_gaudi/models/mpt/modeling_mpt.py | 10 +++--- .../modeling_gaudi/models/opt/modeling_opt.py | 14 ++++---- .../modeling_gaudi/models/phi/modeling_phi.py | 10 +++--- .../models/speecht5/modeling_speecht5.py | 10 +++--- .../models/swin/modeling_swin.py | 5 +-- .../modeling_gaudi/models/t5/modeling_t5.py | 4 +-- .../models/wav2vec2/modeling_wav2vec2.py | 30 ++++------------ 19 files changed, 110 insertions(+), 135 deletions(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py index d1748bc4c76..0160ead63dc 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py @@ -211,7 +211,7 @@ def _prepare_decoder_input_ids_for_generation( torch.ones((batch_size, 1), dtype=torch.long, device=device) * decoder_start_token_id ) else: - # creating padded decoder_input_ids to achieve static shapes. + # creating padded decoder_input_ids to achieve static shapes. # Later new tokens once generated are copied in to decoder_input_ids based on token_idx max_length = max_new_tokens + 1 if max_new_tokens is not None else self.generation_config.max_length decoder_input_ids_start = ( @@ -252,9 +252,7 @@ def _update_model_kwargs_for_generation( is_encoder_decoder: bool = False, standardize_cache_format: bool = False, ) -> Dict[str, Any]: - """ - Adds support for `token_idx`, which is necessary for using static shapes. 
- """ + """Adds support for `token_idx`, which is necessary for using static shapes.""" # mark to identify starting from second token model_kwargs["first_token"] = False # update past_key_values @@ -310,7 +308,7 @@ def update_model_kwargs_for_bucketing( self, params, input_ids, model_kwargs, pad_token_id, bucket_size, reduce_recompile=False ): if params["need_expansion"]: - # Pad inputs to have static shapes during generation, + # Pad inputs to have static shapes during generation, # this gives better performance than dynamic shapes on HPUs pad_amount = params["allocated_space"] - input_ids.shape[-1] input_ids = torch.nn.functional.pad(input_ids, (0, pad_amount), value=pad_token_id) @@ -359,9 +357,9 @@ def create_pad_arg(pad_amount, i, j): pad_tuple = create_pad_arg(pad_amount, i, j) # Different models might have different shapes of kv-cache # create_pad_arg handles them on a per-model basis - # This is a necessary (but not sufficient) condition: + # This is a necessary (but not sufficient) condition: # what ever dimension we are padding, should be a multiple of bucket_size - # This check is added in case we get a new model with a new kv-cache structure, + # This check is added in case we get a new model with a new kv-cache structure, # and we attempt to pad some wrong dimension assert ( model_kwargs["past_key_values"][i][j].shape[-(len(pad_tuple) // 2)] % bucket_size == 0 @@ -399,8 +397,8 @@ def generate( - Most generation-controlling parameters are set in [`transformers.generation.generation_config`] - which, if not passed, will be set to the model's default generation configuration. + Most generation-controlling parameters are set in [`transformers.generation.generation_config`] + which, if not passed, will be set to the model's default generation configuration. You can override any `generation_config` by passing the corresponding parameters to generate, e.g. `.generate(inputs, num_beams=4, do_sample=True)`. @@ -472,10 +470,10 @@ def generate( specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*. Return: - [`transformers.utils.ModelOutput`] or `torch.LongTensor`: + [`transformers.utils.ModelOutput`] or `torch.LongTensor`: A [`transformers.generationutils.ModelOutput`] (if `return_dict_in_generate=True` or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. - If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), + If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible [`transformers.generationutils.ModelOutput`] types are: - [`transformers.generation.GenerateDecoderOnlyOutput`], - [`transformers.generation.GenerateBeamDecoderOnlyOutput`] @@ -605,12 +603,12 @@ def generate( assert generation_config.bucket_size >= 0, "please set valid bucket_size to use bucket_internal" if generation_config.static_shapes: - # Pad inputs to have static shapes during generation, + # Pad inputs to have static shapes during generation, # this gives better performance than dynamic shapes on HPUs # In encoder_decoder models, Inputs are already padded if not self.config.is_encoder_decoder: - # only pad if bucket_size < -1. If we are bucketing (bucket_size > 0), + # only pad if bucket_size < -1. 
If we are bucketing (bucket_size > 0), # then that is taken care in greedy_search() if not is_greedy_or_beam_and_bucket: # token_idx is the current index in the generation process, @@ -1147,7 +1145,7 @@ def contrastive_search( profiling_steps: Optional[int] = 0, **model_kwargs, ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: - r"""Generates sequences of token ids for models with a language modeling head using **contrastive search** + r"""Generates sequences of token ids for models with a language modeling head using **contrastive search** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -1325,7 +1323,7 @@ def greedy_search( If model is an encoder-decoder model the kwargs should include `encoder_outputs`. Return: - [`transformers.generation.GenerateDecoderOnlyOutput`], + [`transformers.generation.GenerateDecoderOnlyOutput`], [`transformers.generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a [`transformers.generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and @@ -2750,11 +2748,11 @@ def group_beam_search( Return: [`transformers.generation.GenerateBeamDecoderOnlyOutput`], [`transformers.generation.GenerateBeamEncoderDecoderOutput`] or - `torch.LongTensor`: + `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`transformers.generation.GenerateBeamDecoderOnlyOutput`] if + [`transformers.generation.GenerateBeamDecoderOnlyOutput`] if [`transformers.generation.BeamSearchDecoderOnlyOutput`] if - `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or + `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a [`transformers.generation.GenerateBeamEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py index 21a569ade10..b05fa721aec 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py @@ -173,7 +173,7 @@ def gaudi_bloom_attention_forward( # change view to [batch_size, num_heads, q_length, kv_length] attention_scores = matmul_result.view(batch_size, self.num_heads, q_length, kv_length) - # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype + # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype # - [batch_size, num_heads, q_length, kv_length] input_dtype = attention_scores.dtype attn_weights = torch.masked_fill(attention_scores, attention_mask, torch.finfo(attention_scores.dtype).min) @@ -335,7 +335,7 @@ def gaudi_bloom_model_forward( **deprecated_arguments, ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: if deprecated_arguments.pop("position_ids", False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to + # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to # `False` allows to detect if users were passing explicitly `None` warnings.warn( "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py index 10a6e9829bb..68bbebe7de2 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py @@ -82,7 +82,7 @@ def gaudi_falcon_attention_split_heads( self, fused_qkv: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ - Changing index operation of qkv[:::] to use torch.index_select to work around gradient + Changing index operation of qkv[:::] to use torch.index_select to work around gradient accuracy issue and improve performance. """ if self.new_decoder_architecture: @@ -203,8 +203,8 @@ def gaudi_falcon_attention_forward( value_layer, attention_mask, 0.0, - # The query_length > 1 is necessary to match with - # AttentionMaskConverter.to_causal_4d that does not create a + # The query_length > 1 is necessary to match with + # AttentionMaskConverter.to_causal_4d that does not create a # causal mask in case query_length == 1. self.is_causal and attention_mask is None and query_length > 1, ) @@ -219,7 +219,7 @@ def gaudi_falcon_attention_forward( value_layer, attention_mask, 0.0, - # The query_length > 1 is necessary to match with + # The query_length > 1 is necessary to match with # AttentionMaskConverter.to_causal_4d that does not create a causal # mask in case query_length == 1. is_causal=self.is_causal and attention_mask is None and query_length > 1, @@ -274,7 +274,7 @@ def gaudi_falcon_attention_forward( # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype # - [batch_size, num_heads, q_length, kv_length] input_dtype = attention_scores.dtype - # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` + # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` # have a minimum value of `-3.4e+38` if input_dtype == torch.float16 or input_dtype == torch.bfloat16: attention_scores = attention_scores.to(torch.float32) @@ -319,7 +319,7 @@ def gaudi_falcon_decoder_layer_forward( **kwargs, ): """ - + The only differences are: - add new args token_idx and position_ids - add token_idx and position_ids into attention inputs @@ -375,7 +375,7 @@ def gaudi_falcon_decoder_layer_forward( class GaudiFalconModel(FalconModel): """ - + The only differences are: - add new args token_idx and position_ids - add token_idx and position_ids into decoder inputs @@ -472,7 +472,7 @@ def forward( alibi = alibi.reshape(batch_size, -1, *alibi.shape[1:]) attention_mask_2d = attention_mask - # We don't call _prepare_4d_causal_attention_mask_for_sdpa as + # We don't call _prepare_4d_causal_attention_mask_for_sdpa as # we need to mask alibi using the 4D attention_mask untouched. 
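A hedged sketch of what the comment above describes: deriving a 4D causal/padding mask from the 2D attention_mask and using it to mask out alibi bias positions. The helper below is illustrative only and is not the `_gaudi_prepare_4d_causal_attention_mask` used by the patch; shapes are example assumptions.

import torch

def make_4d_causal_mask(attention_mask_2d, dtype):
    bsz, seq_len = attention_mask_2d.shape
    causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
    keep = causal[None, None, :, :] & attention_mask_2d[:, None, None, :].bool()
    bias = torch.zeros(bsz, 1, seq_len, seq_len, dtype=dtype)
    return bias.masked_fill(~keep, torch.finfo(dtype).min)

alibi = torch.randn(2, 4, 5, 5)                                   # (batch, num_heads, q_len, kv_len)
mask_4d = make_4d_causal_mask(torch.ones(2, 5, dtype=torch.long), alibi.dtype)
masked_alibi = torch.masked_fill(alibi, mask_4d.bool(), torch.finfo(alibi.dtype).min)
print(masked_alibi.shape)                                         # torch.Size([2, 4, 5, 5])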
attention_mask = _gaudi_prepare_4d_causal_attention_mask( attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length @@ -488,7 +488,7 @@ def forward( torch.finfo(alibi.dtype).min, ) - # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with + # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with # the memory-efficient attention backend if seq_length > 1: attention_mask = GaudiAttentionMaskConverter._unmask_unattended( @@ -567,7 +567,7 @@ def forward( class GaudiFalconForCausalLM(FalconForCausalLM): """ - + The only differences are: - add new args token_idx and position_ids - add token_idx and position_ids into model inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/modeling_gpt2.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/modeling_gpt2.py index c23734947e3..00cbe7d4960 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/modeling_gpt2.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt2/modeling_gpt2.py @@ -22,7 +22,7 @@ class GaudiGPT2Attention(GPT2Attention): """ - + The only differences are: - optimize KV cache """ @@ -195,7 +195,7 @@ def gaudi_gpt2_block_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: """ - + The only differences are: - add new args token_idx """ @@ -272,7 +272,7 @@ def gaudi_gpt2_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: """ - + The only differences are: - disable autocast for attention_mask - add new args token_idx @@ -456,7 +456,7 @@ def gaudi_gpt2_forward( class GaudiGPT2LMHeadModel(GPT2LMHeadModel): """ - + The only differences are: - add new args token_idx """ diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py index 48afcea668b..16a431d82a6 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -39,7 +39,7 @@ def gaudi_gpt_bigcode_attention_forward( Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]], ]: """ - + The only differences are: - add new args token_idx - optimize KV cache @@ -109,7 +109,7 @@ def gaudi_gpt_bigcode_block_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: """ - + The only differences are: - add new args token_idx """ @@ -183,7 +183,7 @@ def gaudi_gpt_bigcode_model_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: """ - + The only differences are: - add new args token_idx - if token_idx and past_key_values are passed, set self_attention_mask based on the static shape of past_key_values @@ -259,7 +259,7 @@ def gaudi_gpt_bigcode_model_forward( if query_length > 1 and attention_mask is not None: # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with the memory-efficient attention backend - # + # self_attention_mask = GaudiAttentionMaskConverter._unmask_unattended( self_attention_mask, 
attention_mask, unmasked_value=True ) @@ -372,7 +372,7 @@ def gaudi_gpt_bigcode_model_forward( class GaudiGPTBigCodeForCausalLM(GPTBigCodeForCausalLM): """ - + The only differences are: - add new args token_idx - add token_idx into model_inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/modeling_gpt_neox.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/modeling_gpt_neox.py index b65e03c23a4..23560fe9d12 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/modeling_gpt_neox.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gpt_neox/modeling_gpt_neox.py @@ -40,7 +40,7 @@ def gaudi_gpt_neox_attention_forward( token_idx: Optional[torch.Tensor] = None, ): """ - + The only differences are: - add new args token_idx - optimize KV cache @@ -124,7 +124,7 @@ def gaudi_gpt_neox_layer_forward( token_idx: Optional[torch.Tensor] = None, ): """ - + The only differences are: - add new args token_idx """ @@ -180,7 +180,7 @@ def gaudi_gpt_neox_model_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: """ - + The only differences are: - add new args token_idx """ @@ -306,7 +306,7 @@ def gaudi_gpt_neox_model_forward( class GaudiGPTNeoXForCausalLM(GPTNeoXForCausalLM): """ - + The only differences are: - add new args token_idx - add token_idx into model_inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py index 0c53554fc35..ce2587765c1 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/gptj/modeling_gptj.py @@ -87,7 +87,7 @@ def forward( Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]], ]: """ - + The only differences are: - add new args token_idx - remove is_torch_fx_proxy @@ -171,7 +171,7 @@ def gaudi_gptj_block_forward( cos: Optional[torch.Tensor] = None, ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: """ - + The only differences are: - add new args token_idx - pass sin and cos from upper level as they are identical for each attn block @@ -222,7 +222,7 @@ def gaudi_gptj_model_forward( cos: Optional[torch.Tensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: """ - + The only differences are: - add new args token_idx - pass sin and cos from upper level as they are identical for each attn block @@ -403,7 +403,7 @@ def gaudi_gptj_model_forward( class GaudiGPTJForCausalLM(GPTJForCausalLM): """ - + The only differences are: - add new args token_idx - add token_idx into model_inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py index 665c385758f..5d7174a244f 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py @@ -64,7 +64,7 @@ def gaudi_llama_rmsnorm_forward(self, hidden_states): """ - + The only differences are: - override RMSNorm with Habana fused RMSNorm """ @@ -128,13 
+128,13 @@ def gaudi_llama_repeat_kv( n_rep: int, ): """ - + The only differences are: - - Append num_key_value_heads == 1 check as kv states can be broadcasted during + - Append num_key_value_heads == 1 check as kv states can be broadcasted during matmuls so need to expand and reshape them. - Add new args query_states, key_states, value_states and attention_mask and update the logic for expansion. - The query states go from (batch, num_heads, seqlen, head_dim) to + The query states go from (batch, num_heads, seqlen, head_dim) to (batch, num_key_value_heads, n_rep, seqlen, head_dim) The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads, 1, seqlen, head_dim) @@ -337,7 +337,7 @@ def pre_attn_forward( **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """ - + The only differences are: - add new args token_idx - optimize KV cache @@ -530,7 +530,7 @@ def forward( **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ - + The only differences are: - add new args token_idx - add new args attn_softmax_bf16 @@ -639,13 +639,11 @@ def post_mlp(self, hidden_states, residual): class GaudiLlamaModel(LlamaModel): - """ - - """ + """""" def __init__(self, config: LlamaConfig): """ - + 1. set fill_value to 1 instead of True 2. add device=self.device """ @@ -704,7 +702,7 @@ def forward( cache_prune_num: int = 0, ) -> Union[Tuple, BaseModelOutputWithPast]: """ - + The only differences are: - add new args token_idx - add new args attn_softmax_bf16 @@ -873,7 +871,7 @@ def forward( class GaudiLlamaForCausalLM(LlamaForCausalLM): """ - + The only differences are: - add new args token_idx - add token_idx into model_inputs @@ -1012,9 +1010,9 @@ def prepare_inputs_for_generation( max_cache_length = None # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, + # 1 - If the length of the attention_mask exceeds the length of input_ids, # then we are in a setting where - # some of the inputs are exclusively passed as part of the cache + # some of the inputs are exclusively passed as part of the cache # (e.g. when passing input_embeds as input) if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] @@ -1022,7 +1020,7 @@ def prepare_inputs_for_generation( # then input_ids holds all input tokens. We can discard input_ids based on the past_length. elif past_length < input_ids.shape[1]: input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), + # 3 - Otherwise (past_length >= input_ids.shape[1]), # let's assume input_ids only has unprocessed tokens. 
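A small sketch of the shape change described in the `gaudi_llama_repeat_kv` docstring above: rather than materializing `n_rep` copies of the key/value states, the query is viewed as (batch, num_key_value_heads, n_rep, seqlen, head_dim) and key/value get a broadcastable singleton dimension. The concrete sizes below are assumptions for the example.

import torch

batch, num_heads, num_kv_heads, seq_len, head_dim = 2, 8, 2, 16, 64
n_rep = num_heads // num_kv_heads

query = torch.randn(batch, num_heads, seq_len, head_dim)
key = torch.randn(batch, num_kv_heads, seq_len, head_dim)

query_grouped = query.reshape(batch, num_kv_heads, n_rep, seq_len, head_dim)
key_grouped = key.unsqueeze(2)                  # (batch, num_kv_heads, 1, seq_len, head_dim)

# matmul broadcasts over the n_rep dimension instead of repeating key states
attn_scores = torch.matmul(query_grouped, key_grouped.transpose(-2, -1))
print(attn_scores.shape)                        # torch.Size([2, 2, 4, 16, 16])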
# If we are about to go beyond the maximum cache length, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py index ec82b896544..177276d302b 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py @@ -68,7 +68,7 @@ def update(prev, cur, dim, idx): def gaudi_mistral_rmsnorm_forward(self, hidden_states): """ - + The only differences are: - override RMSNorm with Habana fused RMSNorm """ @@ -97,14 +97,14 @@ def gaudi_mistral_repeat_kv( n_rep: int, ): """ - + The only differences are: - - Append num_key_value_heads == 1 check as kv states can be broadcasted during + - Append num_key_value_heads == 1 check as kv states can be broadcasted during matmuls so need to expand and reshape them. - Add new args query_states, key_states, value_states and attention_mask and update the logic for expansion. - The query states go from (batch, num_heads, seqlen, head_dim) to + The query states go from (batch, num_heads, seqlen, head_dim) to (batch, num_key_value_heads, n_rep, seqlen, head_dim) - The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to + The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads, 1, seqlen, head_dim) """ batch, num_key_value_heads, kv_len, head_dim = key_states.shape @@ -176,7 +176,7 @@ def forward( **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """ - + The only differences are: - add new args token_idx - add new args reuse_cache @@ -314,7 +314,7 @@ def forward( **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ - + The only differences are: - add new args token_idx """ @@ -380,7 +380,7 @@ def forward( attn_softmax_bf16: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutputWithPast]: """ - + The only differences are: - add new args token_idx """ @@ -548,7 +548,7 @@ def forward( attn_softmax_bf16: Optional[bool] = False, ) -> Union[Tuple, CausalLMOutputWithPast]: """ - + The only differences are: - add new args token_idx """ @@ -614,7 +614,7 @@ def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs ): """ - + The only differences are: - add new args token_idx - add token_idx into model_inputs @@ -637,7 +637,7 @@ def prepare_inputs_for_generation( # Keep only the unprocessed tokens: # 1 - If the length of the attention_mask exceeds the length of input_ids, # then we are in a setting where - # some of the inputs are exclusively passed as part of the cache + # some of the inputs are exclusively passed as part of the cache # (e.g. 
when passing input_embeds as input) if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py index 6c7b2212653..1b1abec9ebc 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py @@ -94,7 +94,7 @@ def apply_customized_rope(q, k, cos, sin, position_ids): def gaudi_mixtral_rmsnorm_forward(self, hidden_states): """ - + The only differences are: - override RMSNorm with Habana fused RMSNorm """ @@ -123,7 +123,7 @@ def gaudi_mixtral_repeat_kv( n_rep: int, ): """ - + The only differences are: - Append num_key_value_heads == 1 check as kv states can be broadcasted during matmuls so need to expand and reshape them. @@ -192,7 +192,7 @@ def gaudi_mixtral_attention_forward( **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """ - + The only differences are: - add new args token_idx - optimize KV cache @@ -279,7 +279,7 @@ def gaudi_mixtral_attention_forward( def gaudi_mixtral_block_sparse_moe_forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ - + The only differences are: - optimize expert forward, remove dynamic control and dynamic shape """ @@ -339,7 +339,7 @@ def gaudi_mixtral_decoder_layer_forward( **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ - + The only differences are: - add new args token_idx """ @@ -397,7 +397,7 @@ def gaudi_mixtral_model_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, MoeModelOutputWithPast]: """ - + The only differences are: - add new args token_idx """ @@ -553,7 +553,7 @@ def gaudi_mixtral_model_forward( class GaudiMixtralForCausalLM(MixtralForCausalLM): """ - + The only differences are: - add new args token_idx - add token_idx into model_inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_all_models.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_all_models.py index 8f8d1825ccc..984cc60f7e7 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_all_models.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_all_models.py @@ -98,9 +98,7 @@ def gaudi_get_extended_attention_mask( def gaudi_conv1d_forward(self, x): - """ - but moves reshape before view for tpc auto fusion. 
- """ + """But moves reshape before view for tpc auto fusion.""" size_out = x.size()[:-1] + (self.nf,) x = torch.mm(x.view(-1, x.size(-1)), self.weight) x = x.view(size_out) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_attn_mask_utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_attn_mask_utils.py index 58d542b48e7..04cd0a390b7 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_attn_mask_utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_attn_mask_utils.py @@ -22,7 +22,7 @@ class GaudiAttentionMaskConverter(AttentionMaskConverter): """ Differences: - - replace `triu` with similar logic here: + - replace `triu` with similar logic here: """ diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/modeling_mpt.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/modeling_mpt.py index c1ea826fd18..f6f1594a33e 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/modeling_mpt.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mpt/modeling_mpt.py @@ -39,7 +39,7 @@ def gaudi_mpt_attention_forward( token_idx: Optional[torch.Tensor] = None, ): """ - + The only differences are: - add new args token_idx - optimize KV cache @@ -112,7 +112,7 @@ def gaudi_mpt_block_forward( token_idx: Optional[torch.Tensor] = None, ): """ - + The only differences are: - add new args token_idx """ @@ -165,7 +165,7 @@ def forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: """ - + The only differences are: - add new args token_idx """ @@ -284,7 +284,7 @@ def prepare_inputs_for_generation( **kwargs, ) -> dict: """ - + The only differences are: - add new args token_idx - add token_idx into model_inputs @@ -336,7 +336,7 @@ def forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: """ - + The only differences are: - add new args token_idx """ diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/modeling_opt.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/modeling_opt.py index 1de0168e4fe..0fb95bdc0a1 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/modeling_opt.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/opt/modeling_opt.py @@ -24,7 +24,7 @@ class GaudiOPTLearnedPositionalEmbedding(OPTLearnedPositionalEmbedding): """ - + The only differences are: - add new args token_idx - compute embedding using token_idx if past_key_values_length not 0 @@ -44,7 +44,7 @@ def forward( positions = positions[:, past_key_values_length:] return torch.nn.Embedding.forward(self, positions + self.offset) else: - # if not 0, kv cache is enabled and from step = 2, + # if not 0, kv cache is enabled and from step = 2, # past_key_values_length is equal to the final length of outputs return torch.nn.Embedding.forward(self, token_idx + self.offset) @@ -60,7 +60,7 @@ def gaudi_opt_attention_forward( token_idx: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """ - + The only differences are: - add new args token_idx - optimize KV cache @@ -188,7 +188,7 
@@ def gaudi_opt_decoder_layer_forward( token_idx: Optional[torch.Tensor] = None, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ - + The only differences are: - add new args token_idx """ @@ -260,7 +260,7 @@ def gaudi_opt_decoder_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: """ - + The only differences are: - add new args token_idx - update calculation of mask_seq_length @@ -413,7 +413,7 @@ def gaudi_opt_model_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: """ - + The only differences are: - add new args token_idx """ @@ -451,7 +451,7 @@ def gaudi_opt_model_forward( class GaudiOPTForCausalLM(OPTForCausalLM): """ - + The only differences are: - add new args token_idx - add token_idx into model_inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/modeling_phi.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/modeling_phi.py index f5742529d86..77101cb5b9b 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/modeling_phi.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/modeling_phi.py @@ -50,7 +50,7 @@ def gaudi_phi_attention_forward( **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """ - + The only differences are: - add new args token_idx """ @@ -170,7 +170,7 @@ def gaudi_phi_decoder_layer_forward( **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ - + The only differences are: - add new args token_idx """ @@ -218,7 +218,7 @@ def gaudi_phi_model_forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: """ - + The only differences are: - add new args token_idx """ @@ -346,7 +346,7 @@ def forward( token_idx: Optional[torch.Tensor] = None, ) -> Union[Tuple, CausalLMOutputWithPast]: """ - + The only differences are: - add new args token_idx """ @@ -404,7 +404,7 @@ def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs ): """ - + The only differences are: - add new args token_idx - add token_idx into model_inputs diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py index 72d137b3437..d0f63ea3c51 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py @@ -37,7 +37,7 @@ def gaudi_SpeechT5SpeechDecoderPrenet_forward( speaker_embeddings: Optional[torch.Tensor] = None, ): """ - + The only differences are: - disable dropout in inference, or else hpu graph could not be used """ @@ -72,7 +72,7 @@ def gaudi_SpeechT5Attention_forward( token_idx: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """ - + The only differences are: - add new args token_idx """ @@ -209,7 +209,7 @@ def gaudi_SpeechT5DecoderLayer_forward( token_idx: Optional[torch.Tensor] = None, ): """ - + The only differences are: - add token_idx in self-attention """ @@ -285,7 +285,7 @@ def gaudi_SpeechT5Decoder_forward( token_idx: 
Optional[torch.Tensor] = None, ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: """ - + The only differences are: - add token_idx args - use _gaudi_prepare_4d_causal_attention_mask @@ -420,7 +420,7 @@ def gaudi_generate_speech( return_output_lengths: bool = False, ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, torch.FloatTensor]]: """ - + The only differences are: - add hpu graph wrap - add static shape support in kv-cache in _generate_speech diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/modeling_swin.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/modeling_swin.py index 8dc5066d2e2..2706fe3a3bf 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/modeling_swin.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/swin/modeling_swin.py @@ -19,10 +19,7 @@ def gaudi_swin_get_attn_mask(self, height, width, dtype): - """ - - The only difference is moving img_mask to hpu for performance - """ + """The only difference is moving img_mask to hpu for performance.""" if self.shift_size > 0: # calculate attention mask for SW-MSA img_mask = torch.zeros((1, height, width, 1), dtype=dtype, device="hpu") diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py index bff64e72d74..42c09baa9eb 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py @@ -41,7 +41,7 @@ def gaudi_t5_layernorm_forward(self, hidden_states): """ - + The only differences are: - override RMSNorm with Habana fused RMSNorm """ @@ -452,7 +452,7 @@ def gaudi_T5Stack_forward( ) # layer_outputs is a tuple with: - # hidden-states, key-value-states, (self-attention position bias), + # hidden-states, key-value-states, (self-attention position bias), # (self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py index 7a84045904c..967b6cc2e2d 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py @@ -31,10 +31,7 @@ def _gaudi_wav2vec2_compute_mask_indices( attention_mask: Optional[torch.LongTensor] = None, min_masks: int = 0, ) -> torch.Tensor: - """ - - The only difference is that the processing is performed with PyTorch on HPUs (Numpy is used in Transformers). - """ + """The only difference is that the processing is performed with PyTorch on HPUs (Numpy is used in Transformers).""" batch_size, sequence_length = shape if mask_length < 1: @@ -134,10 +131,7 @@ def compute_num_masked_span(input_length): def _gaudi_wav2vec2_sample_negative_indices( features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[torch.Tensor] = None ): - """ - - The only difference is that the processing is performed with PyTorch on HPUs (Numpy is used in Transformers). 
- """ + """The only difference is that the processing is performed with PyTorch on HPUs (Numpy is used in Transformers).""" batch_size, sequence_length = features_shape # generate indices of the positive vectors themselves, repeat them `num_negatives` times @@ -178,10 +172,9 @@ def _gaudi_wav2vec2_mask_hidden_states( mask_time_indices: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.LongTensor] = None, ): - """ - - Differences are that (1) `mask_time_indices` is not moved to the current device and + """Differences are that (1) `mask_time_indices` is not moved to the current device and converted into boolean because this is already done in _compute_mask_indices. + (2) index_put operation on hidden_states is replaced by combination of simpler ops (more suitable for HPU graphs) """ @@ -235,10 +228,7 @@ def gaudi_wav2vec2_encoder_forward( output_hidden_states: bool = False, return_dict: bool = True, ): - """ - - The only difference is that torch.rand device is set to 'hpu' (required to capture operation as part of HPU graph) - """ + """The only difference is that torch.rand device is set to 'hpu' (required to capture operation as part of HPU graph)""" all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -311,10 +301,7 @@ def gaudi_wav2vec2_forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, Wav2Vec2BaseModelOutput]: - """ - - The only difference is that a clone of `hidden_states` is given to _mask_hidden_states to avoid an error. - """ + """The only difference is that a clone of `hidden_states` is given to _mask_hidden_states to avoid an error.""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -360,10 +347,7 @@ def gaudi_wav2vec2_forward( def gaudi_wav2vec2_tdnnlayer_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - """ - - v4.38.2 implementation caused accuracy issue to run pytest Wav2Vec2RobustModelTest. 
- """ + """v4.38.2 implementation caused accuracy issue to run pytest Wav2Vec2RobustModelTest.""" hidden_states = hidden_states.unsqueeze(1) hidden_states = torch.nn.functional.unfold( hidden_states, From 455e5c34f48c4ecffe6422a66347370afff901dc Mon Sep 17 00:00:00 2001 From: Chen Xi Date: Wed, 22 May 2024 07:16:38 +0000 Subject: [PATCH 14/25] fix the pylint issue Signed-off-by: Chen Xi --- .../modeling/modeling_gaudi/generation/utils.py | 7 ++++--- .../modeling_gaudi/models/falcon/modeling_falcon.py | 2 +- .../modeling_gaudi/models/llama/pos_shift_llama.py | 2 -- .../modeling_gaudi/models/mistral/modeling_mistral.py | 3 ++- .../modeling/modeling_gaudi/models/modeling_all_models.py | 6 +++--- .../modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py index 0160ead63dc..245e00ab0f9 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py @@ -1450,7 +1450,7 @@ def greedy_search( if bucket_size > 0 and not bucket_internal: # it will not have been padded if bucket_size > 0 - params = next(inc) + params = next(inc) # pylint: disable=E0601 input_ids, model_kwargs = self.update_model_kwargs_for_bucketing( params, input_ids, model_kwargs, pad_token_id, bucket_size, reduce_recompile ) @@ -1517,7 +1517,8 @@ def greedy_search( if not ignore_eos and eos_token_id is not None: if pad_token_id is None: raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") - next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + next_tokens = next_tokens * unfinished_sequences + \ + pad_token_id * (1 - unfinished_sequences) # pylint: disable=E0606 # update generated ids, model inputs, and length for next step if token_idx is not None: @@ -2252,7 +2253,7 @@ def expand_if_needed(tensor, new_size, value, dim=-1): if bucket_size > 0: # it will not have been padded if bucket_size > 0 - params = next(inc) + params = next(inc) # pylint: disable=E0606 input_ids, model_kwargs = self.update_model_kwargs_for_bucketing( params, input_ids, model_kwargs, pad_token_id, bucket_size, reduce_recompile ) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py index 68bbebe7de2..4fff8c7e05d 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py @@ -358,7 +358,7 @@ def gaudi_falcon_decoder_layer_forward( outputs = attn_outputs[1:] # MLP. 
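For context on the `unfinished_sequences` expression touched in the greedy-search hunk above, here is a minimal sketch of the masking trick (values are made up for the example): once a sequence has emitted EOS, its entry in `unfinished_sequences` is 0, so later steps write `pad_token_id` instead of the sampled token.

import torch

pad_token_id = 0
next_tokens = torch.tensor([17, 42, 9])
unfinished_sequences = torch.tensor([1, 0, 1])   # the second sequence already hit EOS

next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
print(next_tokens)                               # tensor([17,  0,  9])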
- mlp_output = self.mlp(mlp_layernorm_out) + mlp_output = self.mlp(mlp_layernorm_out) # pylint: disable=E0606 if self.config.new_decoder_architecture or self.config.parallel_attn: mlp_output += attention_output diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py index 12fa9181a94..ede2c57768b 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py @@ -116,7 +116,6 @@ def gaudi_llama_pos_shift_pre_attn_forward( **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """ - Copied from LlamaAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py The only differences are: - add new args token_idx - optimize KV cache @@ -151,7 +150,6 @@ def gaudi_llama_pos_shift_pre_attn_forward( key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # TODO: update when auto mp params is enabled in DeepSpeed (cf. https://github.com/HabanaAI/DeepSpeed/blob/94309c7b5dfc1a69858f5c9f25737b2f81a332a5/deepspeed/module_inject/replace_module.py#L440) key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py index 177276d302b..4ba9e6c2f7d 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py @@ -507,7 +507,8 @@ def forward( next_cache = ( next_decoder_cache if not use_new_cache - else (next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache) + else (next_decoder_cache.to_legacy_cache() # pylint: disable=E1101 + if use_legacy_cache else next_decoder_cache) ) if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_all_models.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_all_models.py index 984cc60f7e7..969e2b1a3e6 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_all_models.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/modeling_all_models.py @@ -37,7 +37,7 @@ def gaudi_invert_attention_mask(self, encoder_attention_mask: torch.Tensor) -> t # encoder_extended_attention_mask = (encoder_extended_attention_mask == # encoder_extended_attention_mask.transpose(-1, -2)) # torch.finfo must take the dtype of encoder_extended_attention_mask - encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # bf16 compatibility + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # pylint: disable=E0601 
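A standalone sketch of the mask inversion this hunk adjusts: a {1, 0} keep/pad mask becomes an additive bias of {0, finfo.min} in the working dtype (the patched code additionally disables autocast around the multiply, per its comment). Dtype and shapes here are assumptions for illustration.

import torch

def invert_attention_mask(mask_2d, dtype=torch.float32):
    extended = mask_2d[:, None, None, :].to(dtype)   # (batch, 1, 1, kv_len)
    return (1.0 - extended) * torch.finfo(dtype).min

mask = torch.tensor([[1, 1, 1, 0]])
bias = invert_attention_mask(mask)
print(bias[0, 0, 0])   # the padded position gets a large negative additive bias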
encoder_extended_attention_mask = 1.0 - encoder_extended_attention_mask # Fixes issue where the model is not in bf16 and mul is casting it to values out of range resulting in nan with torch.autocast(enabled=False, device_type="hpu"): @@ -122,7 +122,7 @@ def gaudi_check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> P # Otherwise, fallback to original implementation if hard_check_only: - if not cls._supports_sdpa: + if not cls._supports_sdpa: # pylint: disable=E1101 raise ValueError( f"{cls.__name__} does not support an attention implementation through " "torch.nn.functional.scaled_dot_product_attention yet." @@ -134,7 +134,7 @@ def gaudi_check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> P if not is_torch_sdpa_available(): raise ImportError("PyTorch SDPA requirements in Transformers are not met. Please install torch>=2.1.1.") - if not is_torch_sdpa_available() or not cls._supports_sdpa: + if not is_torch_sdpa_available() or not cls._supports_sdpa: # pylint: disable=E1101 return config _is_bettertransformer = getattr(cls, "use_bettertransformer", False) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py index 967b6cc2e2d..cef7962fb74 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py @@ -228,7 +228,7 @@ def gaudi_wav2vec2_encoder_forward( output_hidden_states: bool = False, return_dict: bool = True, ): - """The only difference is that torch.rand device is set to 'hpu' (required to capture operation as part of HPU graph)""" + """The only difference is that torch.rand device is set to 'hpu'""" all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None From 83a42b24a109f7344e09481f83ad3981e6ec7890 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 22 May 2024 07:23:22 +0000 Subject: [PATCH 15/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py index cef7962fb74..20b4547b179 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/wav2vec2/modeling_wav2vec2.py @@ -228,7 +228,7 @@ def gaudi_wav2vec2_encoder_forward( output_hidden_states: bool = False, return_dict: bool = True, ): - """The only difference is that torch.rand device is set to 'hpu'""" + """The only difference is that torch.rand device is set to 'hpu'.""" all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None From 80476f19abdc4a7593d40532f0544e32210fca7d Mon Sep 17 00:00:00 2001 From: Clark Chin Date: Wed, 22 May 2024 15:25:33 +0800 Subject: [PATCH 16/25] add optimum-habana when pylint Signed-off-by: Clark Chin --- 
.github/workflows/script/formatScan/pylint.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/script/formatScan/pylint.sh b/.github/workflows/script/formatScan/pylint.sh index a3ddaddceeb..d9df5f29f7c 100644 --- a/.github/workflows/script/formatScan/pylint.sh +++ b/.github/workflows/script/formatScan/pylint.sh @@ -31,6 +31,8 @@ fi pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@83dbfbf6070324f3e5872f63e49d49ff7ef4c9b3 pip install accelerate nlpaug nltk schema optimum-intel optimum peft pip install --upgrade --force-reinstall transformers==4.36.2 +pip install optimum-habana +pip install deepspeed echo "[DEBUG] list pipdeptree..." pip install pipdeptree From 5aedccb12f7060fefc7beed6db37b2b8ae5c7e22 Mon Sep 17 00:00:00 2001 From: Chen Xi Date: Wed, 22 May 2024 09:21:41 +0000 Subject: [PATCH 17/25] add pylint comment Signed-off-by: Chen Xi --- .../modeling_gaudi/generation/utils.py | 12 ++++---- .../models/bart/modeling_bart.py | 4 +-- .../models/blip/modeling_blip.py | 4 +-- .../models/bloom/modeling_bloom.py | 6 ++-- .../models/esm/modeling_esmfold.py | 2 +- .../models/falcon/modeling_falcon.py | 8 ++--- .../models/llama/modeling_llama.py | 29 +++++++++++-------- .../models/llama/pos_shift_llama.py | 9 ++++-- .../models/mistral/modeling_mistral.py | 4 +-- .../models/mixtral/modeling_mixtral.py | 4 +-- .../modeling_gaudi/models/phi/modeling_phi.py | 2 +- .../models/speecht5/modeling_speecht5.py | 2 +- .../modeling_gaudi/models/t5/modeling_t5.py | 2 +- 13 files changed, 48 insertions(+), 40 deletions(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py index 245e00ab0f9..c1f68dd05a8 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py @@ -810,7 +810,7 @@ def generate( # In lazy mode, import Habana torch to be able to add mark_step() if lazy_mode: - import habana_frameworks.torch.core as htcore + import habana_frameworks.torch.core as htcore # pylint: disable=E0401 self.htcore_generation = htcore @@ -1462,7 +1462,7 @@ def greedy_search( hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) # forward pass to get next token - outputs = self( + outputs = self( # pylint: disable=E1102 **model_inputs, return_dict=True, output_attentions=output_attentions, @@ -1816,7 +1816,7 @@ def sample( hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) # forward pass to get next token - outputs = self( + outputs = self( # pylint: disable=E1102 **model_inputs, return_dict=True, output_attentions=output_attentions, @@ -2285,7 +2285,7 @@ def expand_if_needed(tensor, new_size, value, dim=-1): model_inputs, split_size=batch_size, full_batch_size=batch_beam_size ) outputs_per_sub_batch = [ - self( + self( # pylint: disable=E1102 **inputs_per_sub_batch, return_dict=True, output_attentions=output_attentions, @@ -2297,7 +2297,7 @@ def expand_if_needed(tensor, new_size, value, dim=-1): outputs = stack_model_outputs(outputs_per_sub_batch) else: hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) - outputs = self( + outputs = self( # pylint: disable=E1102 **model_inputs, return_dict=True, output_attentions=output_attentions, @@ -3050,7 +3050,7 @@ def constrained_beam_search( hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) - outputs = self( + outputs = 
self( #pylint: disable=E1102 **model_inputs, return_dict=True, output_attentions=output_attentions, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/modeling_bart.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/modeling_bart.py index c5b958463cf..2b065979ad4 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/modeling_bart.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bart/modeling_bart.py @@ -342,7 +342,7 @@ def gaudi_BartEncoder_forward( inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale embed_pos = self.embed_positions(input) - import habana_frameworks.torch.core as htcore + import habana_frameworks.torch.core as htcore # pylint: disable=E0401 # pylint: disable=E0401 htcore.mark_step() embed_pos = embed_pos.to(inputs_embeds.device) @@ -493,7 +493,7 @@ def gaudi_BartDecoder_forward( ) # embed positions - import habana_frameworks.torch.core as htcore + import habana_frameworks.torch.core as htcore # pylint: disable=E0401 # pylint: disable=E0401 htcore.mark_step() positions = self.embed_positions(input, tensor_past_key_values_length) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip.py index fcfe3b18c9a..ddbfc7474dc 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/blip/modeling_blip.py @@ -34,7 +34,7 @@ def gaudi_BlipForConditionalGeneration_generate( - wrap hpu graph for each part """ if generate_kwargs.get("hpu_graphs", True): - from habana_frameworks.torch.hpu import wrap_in_hpu_graph + from habana_frameworks.torch.hpu import wrap_in_hpu_graph # pylint: disable=E0401 # pylint: disable=E0401 if not hasattr(self.vision_model, "clear_cache"): self.vision_model = wrap_in_hpu_graph(self.vision_model) @@ -88,7 +88,7 @@ def gaudi_BlipForQuestionAnswering_generate( lead to coredump in embedding layer """ if generate_kwargs.get("hpu_graphs", True): - from habana_frameworks.torch.hpu import wrap_in_hpu_graph + from habana_frameworks.torch.hpu import wrap_in_hpu_graph # pylint: disable=E0401 # pylint: disable=E0401 if not hasattr(self.vision_model, "clear_cache"): self.vision_model = wrap_in_hpu_graph(self.vision_model) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py index b05fa721aec..eb6a5f5392a 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/bloom/modeling_bloom.py @@ -199,7 +199,7 @@ def gaudi_bloom_attention_forward( slices = self.hidden_size / self.pretraining_tp output_tensor = torch.zeros_like(context_layer) for i in range(self.pretraining_tp): - output_tensor = output_tensor + F.linear( + output_tensor = output_tensor + F.linear( # pylint: disable=E1102 context_layer[:, :, int(i * slices) : int((i + 1) * slices)], self.dense.weight[:, int(i * slices) : int((i + 1) * slices)], ) @@ -468,7 +468,7 @@ def gaudi_bloom_model_forward( class GaudiBloomForCausalLM(BloomForCausalLM): inference_tp_size = None 
- def set_tp_for_inference(tp_for_inference: int): + def set_tp_for_inference(self, tp_for_inference: int): world = int(os.environ.get("WORLD_SIZE", 1)) assert tp_for_inference == 1 or tp_for_inference == world, "only setting 1 (no tp) or world size is supported" GaudiBloomForCausalLM.inference_tp_size = tp_for_inference @@ -594,7 +594,7 @@ def _reorder_cache( Output shares the same memory storage as `past`. """ - standardized_past = self._convert_to_standard_cache(past, batch_size=len(beam_idx), training=self.training) + standardized_past = self._convert_to_standard_cache(past, batch_size=len(beam_idx)) # Get a copy of `beam_idx` on all the devices where we need those indices. device_to_beam_idx = { diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/modeling_esmfold.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/modeling_esmfold.py index b68feacf5ec..011c8b82a42 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/modeling_esmfold.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/esm/modeling_esmfold.py @@ -58,7 +58,7 @@ def trunk_iter(s, z, residx, mask): for block in self.blocks: s, z = block(s, z, mask=mask, residue_index=residx, chunk_size=self.chunk_size) if s.device.type == "hpu": - import habana_frameworks.torch.core as htcore + import habana_frameworks.torch.core as htcore # pylint: disable=E0401 htcore.mark_step() return s, z diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py index 4fff8c7e05d..36de983baf4 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/falcon/modeling_falcon.py @@ -27,7 +27,7 @@ FusedSDPA = None try: - from habana_frameworks.torch.hpu import sdp_kernel + from habana_frameworks.torch.hpu import sdp_kernel # pylint: disable=E0401 SDPContext = True except ImportError: @@ -40,7 +40,7 @@ FusedRoPE = None -import habana_frameworks.torch.core as htcore +import habana_frameworks.torch.core as htcore # pylint: disable=E0401 from torch.nn import CrossEntropyLoss from torch.nn import functional as F from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa @@ -213,7 +213,7 @@ def gaudi_falcon_attention_forward( if self.training is True and query_layer.shape != key_layer.shape: key_layer = torch.broadcast_to(key_layer, query_layer.shape) value_layer = torch.broadcast_to(value_layer, query_layer.shape) - attn_output = F.scaled_dot_product_attention( + attn_output = F.scaled_dot_product_attention( # pylint: disable=E1102 query_layer, key_layer, value_layer, @@ -253,7 +253,7 @@ def gaudi_falcon_attention_forward( self.is_causal and attention_mask is None and query_length > 1, ) else: - attn_output = F.scaled_dot_product_attention( + attn_output = F.scaled_dot_product_attention( # pylint: disable=E1102 query_layer, key_layer, value_layer, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py index 5d7174a244f..b10429ca673 100755 --- 
a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py @@ -59,7 +59,7 @@ print("Not using HPU fused scaled dot-product attention kernel.") FusedSDPA = None -import habana_frameworks.torch.core as htcore +import habana_frameworks.torch.core as htcore # pylint: disable=E0401 def gaudi_llama_rmsnorm_forward(self, hidden_states): @@ -94,14 +94,15 @@ def pre_mlp_forward(self, x): down_proj_slices = self.down_proj.weight.split(slice, dim=1) gate_proj = torch.cat( - [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 - ) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) + [F.linear(x, gate_proj_slices[i]) \ + for i in range(self.config.pretraining_tp)], dim=-1) # pylint: disable=E1102 + up_proj = torch.cat([F.linear(x, up_proj_slices[i]) \ + for i in range(self.config.pretraining_tp)], dim=-1) # pylint: disable=E1102 intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) down_proj = [ - F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) - ] + F.linear(intermediate_states[i], down_proj_slices[i]) \ + for i in range(self.config.pretraining_tp)] # pylint: disable=E1102 output = sum(down_proj) else: input = self.act_fn(self.gate_proj(x)) * self.up_proj(x) @@ -358,13 +359,16 @@ def pre_attn_forward( key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] + query_states = [F.linear(hidden_states, query_slices[i]) \ + for i in range(self.config.pretraining_tp)] # pylint: disable=E1102 query_states = torch.cat(query_states, dim=-1) - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] + key_states = [F.linear(hidden_states, key_slices[i]) \ + for i in range(self.config.pretraining_tp)] # pylint: disable=E1102 key_states = torch.cat(key_states, dim=-1) - value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] + value_states = [F.linear(hidden_states, value_slices[i]) \ + for i in range(self.config.pretraining_tp)] # pylint: disable=E1102 value_states = torch.cat(value_states, dim=-1) else: @@ -492,7 +496,7 @@ def post_attn_forward(self, attn_output): class GaudiLlamaDecoderLayer(LlamaDecoderLayer): def __init__(self, config: LlamaConfig, layer_idx: int): - super(GaudiLlamaDecoderLayer, self).__init__() + super(GaudiLlamaDecoderLayer, self).__init__(config, layer_idx) self.hidden_size = config.hidden_size self.self_attn = GaudiLlamaAttention(config=config, layer_idx=layer_idx) @@ -956,7 +960,8 @@ def forward( if self.config.pretraining_tp > 1: lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) - logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] + logits = [F.linear(hidden_states, lm_head_slices[i]) \ + for i in range(self.config.pretraining_tp)] # pylint: disable=E1102 logits = torch.cat(logits, dim=-1) else: logits = self.lm_head(hidden_states) @@ -1030,7 +1035,7 @@ def prepare_inputs_for_generation( and attention_mask is not None and cache_length + input_ids.shape[1] > max_cache_length ): - attention_mask = attention_mask[:, 
-max_cache_length:] + attention_mask = attention_mask[:, -max_cache_length:] # pylint: disable=E1130 elif reuse_cache and token_idx is not None: # With reuse_cache, KV cache is pre allocated hence for the 1st token # we can slice the inputs till token idx for the fwd pass diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py index ede2c57768b..9870306c975 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py @@ -136,13 +136,16 @@ def gaudi_llama_pos_shift_pre_attn_forward( key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] + query_states = [F.linear(hidden_states, query_slices[i]) \ + for i in range(self.config.pretraining_tp)] # pylint: disable=E1102 query_states = torch.cat(query_states, dim=-1) - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] + key_states = [F.linear(hidden_states, key_slices[i]) \ + for i in range(self.config.pretraining_tp)] # pylint: disable=E1102 key_states = torch.cat(key_states, dim=-1) - value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] + value_states = [F.linear(hidden_states, value_slices[i]) \ + for i in range(self.config.pretraining_tp)] # pylint: disable=E1102 value_states = torch.cat(value_states, dim=-1) else: diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py index 4ba9e6c2f7d..52f5ac1fa6f 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mistral/modeling_mistral.py @@ -22,7 +22,7 @@ import math from typing import List, Optional, Tuple, Union -import habana_frameworks.torch.core as htcore +import habana_frameworks.torch.core as htcore # pylint: disable=E0401 import torch from torch import nn from torch.nn import CrossEntropyLoss @@ -657,7 +657,7 @@ def prepare_inputs_for_generation( and attention_mask is not None and cache_length + input_ids.shape[1] > max_cache_length ): - attention_mask = attention_mask[:, -max_cache_length:] + attention_mask = attention_mask[:, -max_cache_length:] # pylint: disable=E1130 else: input_ids = torch.index_select(input_ids, 1, token_idx - 1) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py index 1b1abec9ebc..9044de727aa 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py @@ -23,7 +23,7 @@ import warnings from typing import List, Optional, Tuple, Union -import habana_frameworks.torch.core as htcore +import habana_frameworks.torch.core as htcore # pylint: 
disable=E0401 import torch import torch.nn.functional as F from torch import nn @@ -682,7 +682,7 @@ def prepare_inputs_for_generation( and attention_mask is not None and cache_length + input_ids.shape[1] > max_cache_length ): - attention_mask = attention_mask[:, -max_cache_length:] + attention_mask = attention_mask[:, -max_cache_length:] # pylint: disable=E1130 else: input_ids = torch.index_select(input_ids, 1, token_idx - 1) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/modeling_phi.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/modeling_phi.py index 77101cb5b9b..b4c910f0898 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/modeling_phi.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/phi/modeling_phi.py @@ -446,7 +446,7 @@ def prepare_inputs_for_generation( and attention_mask is not None and cache_length + input_ids.shape[1] > max_cache_length ): - attention_mask = attention_mask[:, -max_cache_length:] + attention_mask = attention_mask[:, -max_cache_length:] # pylint: disable=E1130 else: input_ids = torch.index_select(input_ids, 1, token_idx - 1) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py index d0f63ea3c51..47f9592edeb 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/speecht5/modeling_speecht5.py @@ -433,7 +433,7 @@ def gaudi_generate_speech( https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors """ ) - from habana_frameworks.torch.hpu import wrap_in_hpu_graph + from habana_frameworks.torch.hpu import wrap_in_hpu_graph # pylint: disable=E0401 if not hasattr(model.speecht5.encoder, "clear_cache"): model.speecht5.encoder = wrap_in_hpu_graph(model.speecht5.encoder) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py index 42c09baa9eb..b12b49c8c2e 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/t5/modeling_t5.py @@ -15,7 +15,7 @@ import warnings from typing import Optional, Tuple, Union -import habana_frameworks.torch.core as htcore +import habana_frameworks.torch.core as htcore # pylint: disable=E0401 import torch import torch.nn as nn from torch.nn import CrossEntropyLoss From 72f313f5e7c05a9e2505f8caabeab038fe7f3c7a Mon Sep 17 00:00:00 2001 From: Clark Chin Date: Thu, 23 May 2024 11:15:24 +0800 Subject: [PATCH 18/25] add comment to avoid pylint check Signed-off-by: Clark Chin --- .../modeling/modeling_gaudi/models/llama/modeling_llama.py | 2 +- .../modeling/modeling_gaudi/models/llama/pos_shift_llama.py | 5 +++-- .../modeling_gaudi/models/mixtral/modeling_mixtral.py | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py index b10429ca673..94da9f563ea 100755 --- 
a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py @@ -423,7 +423,7 @@ def pre_attn_forward( past_key_value = None if use_flash_attention and FusedSDPA: - import habana_frameworks.torch.hpu as ht + import habana_frameworks.torch.hpu as ht # pylint: disable=E0401 if q_len == 1: # next token diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py index 9870306c975..592af1e1033 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/pos_shift_llama.py @@ -57,7 +57,8 @@ def gaudi_apply_rotary_pos_emb_single(x, cos, sin, position_ids): # TODO shape dimension check if x.device.type == "hpu" and has_fused_rope: - from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 as FusedRoPE + from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 \ + as FusedRoPE # pylint: disable=E0401 return FusedRoPE.apply( x, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids ) @@ -206,7 +207,7 @@ def gaudi_llama_pos_shift_pre_attn_forward( key_states = gaudi_apply_rotary_pos_emb_single(key_states, cos, sin, key_position_ids) if use_flash_attention and FusedSDPA: - import habana_frameworks.torch.hpu as ht + import habana_frameworks.torch.hpu as ht # pylint: disable=E0401 if q_len == 1: # next token diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py index 9044de727aa..75d5236da8e 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py @@ -67,7 +67,7 @@ def update(prev, cur, dim, idx, inp_seq_len): orig_cur = cur if prev.dtype == torch.float8_e4m3fn: - from habana_frameworks.torch.hpex.kernels.Fp8Ops import cast_to_fp8_v2 + from habana_frameworks.torch.hpex.kernels.Fp8Ops import cast_to_fp8_v2 # pylint: disable=E0401 cur = cast_to_fp8_v2(cur, None, False, False, prev.dtype)[0] if cur.shape[2] > 1 and cur.shape[2] <= prev.shape[2]: @@ -238,7 +238,7 @@ def gaudi_mixtral_attention_forward( key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) if FusedSDPA: - import habana_frameworks.torch.hpu as ht + import habana_frameworks.torch.hpu as ht # pylint: disable=E0401 if q_len == 1: # next token From 4642ee01ab84a26f476a608ad5642c41756c810b Mon Sep 17 00:00:00 2001 From: Clark Chin Date: Thu, 23 May 2024 15:17:10 +0800 Subject: [PATCH 19/25] ignore modeling_gaudi pylint Signed-off-by: Clark Chin --- .github/CODEOWNERS | 1 + .github/workflows/script/formatScan/pylint.sh | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 7ff7a1dccda..958a03eddb9 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -35,6 +35,7 @@ /intel_extension_for_transformers/neural_chat liang1.lv@intel.com /workflows/chatbot liang1.lv@intel.com 
/intel_extension_for_transformers/transformers/llm/quantization penghui.cheng@intel.com +/intel_extension_for_transformers/transformers/modeling/modeling_gaudi xi2.chen@intel.com /intel_extension_for_transformers/transformers penghui.cheng@intel.com /intel_extension_for_transformers/utils penghui.cheng@intel.com /docs wenxin.zhang@intel.com diff --git a/.github/workflows/script/formatScan/pylint.sh b/.github/workflows/script/formatScan/pylint.sh index ab41681b8df..e9263493051 100644 --- a/.github/workflows/script/formatScan/pylint.sh +++ b/.github/workflows/script/formatScan/pylint.sh @@ -43,7 +43,7 @@ python -m pylint -f json --disable=R,C,W,E1129 \ --max-line-length=120 \ --extension-pkg-whitelist=numpy,nltk \ --ignored-classes=TensorProto,NodeProto \ - --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,neural_compressor,neural_compressor.benchmark,intel_extension_for_transformers.neural_engine_py,intel_extension_for_transformers.qbits,cv2,PIL.Image \ + --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,neural_compressor,neural_compressor.benchmark,intel_extension_for_transformers.neural_engine_py,intel_extension_for_transformers.qbits,intel_extension_for_transformers.transformers.modeling.modeling_gaudi,cv2,PIL.Image \ /intel-extension-for-transformers/intel_extension_for_transformers >${log_dir}/pylint.json exit_code1=$? @@ -53,7 +53,7 @@ python -m pylint -f json --disable=R,C,W,E1129 \ --disable=no-name-in-module,import-error,no-member,undefined-variable,no-value-for-parameter,unexpected-keyword-arg,not-callable,no-self-argument,too-many-format-args,invalid-unary-operand-type,too-many-function-args \ --extension-pkg-whitelist=numpy,nltk \ --ignored-classes=TensorProto,NodeProto \ - --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,neural_compressor,neural_compressor.benchmark,intel_extension_for_transformers.neural_engine_py,intel_extension_for_transformers.qbits,cv2,PIL.Image \ + --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,neural_compressor,neural_compressor.benchmark,intel_extension_for_transformers.neural_engine_py,intel_extension_for_transformers.qbits,intel_extension_for_transformers.transformers.modeling.modeling_gaudi,cv2,PIL.Image \ /intel-extension-for-transformers/intel_extension_for_transformers >> ${log_dir}/pylint.json exit_code2=$? 
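The suppressions added in patches 14 through 19 all follow one pattern: habana_frameworks only resolves on Gaudi (HPU) hosts, so pylint reports E0401 (import-error) when the scan runs on an ordinary CI machine, and a trailing "# pylint: disable=E0401" comment quiets the check for that single line. A minimal sketch of the pattern, assuming only that torch is installed; the guarded import fallback and the helper name mark_step_if_hpu are illustrative and not part of these patches:

    import torch

    try:
        # Present only on Gaudi machines; the trailing comment keeps pylint's
        # import-error check (E0401) quiet on hosts without the Habana stack.
        import habana_frameworks.torch.core as htcore  # pylint: disable=E0401
    except ImportError:
        htcore = None

    def mark_step_if_hpu(tensor: torch.Tensor) -> None:
        # Flush queued HPU ops at a graph boundary, mirroring the
        # htcore.mark_step() calls these patches place around embedding lookups.
        if htcore is not None and tensor.device.type == "hpu":
            htcore.mark_step()

Called as mark_step_if_hpu(hidden_states), the helper is a no-op everywhere except on an HPU device, which keeps the same file importable on the CPU-only runner that executes the pylint scan.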
From 0f714b50ae54dfc46c064a5603fa6320c49c216d Mon Sep 17 00:00:00 2001 From: Clark Chin Date: Thu, 23 May 2024 16:59:39 +0800 Subject: [PATCH 20/25] manual fix the pylint Signed-off-by: Clark Chin --- .../modeling/modeling_gaudi/generation/utils.py | 15 +++++++++------ .../modeling_gaudi/models/llama/modeling_llama.py | 4 ++-- .../models/mixtral/modeling_mixtral.py | 2 +- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py index c1f68dd05a8..e78d099cd86 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py @@ -24,7 +24,7 @@ import torch.distributed as dist from transformers.generation.beam_constraints import DisjunctiveConstraint, PhrasalConstraint from transformers.generation.beam_search import BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer -from transformers.generation.candidate_generator import CandidateGenerator +from transformers.generation.candidate_generator import CandidateGenerator # pylint: disable=E0611 from transformers.generation.logits_process import LogitsProcessorList from transformers.generation.stopping_criteria import ( StoppingCriteriaList, @@ -43,13 +43,13 @@ GenerationMode, _split_model_inputs, stack_model_outputs, -) +) # pylint: disable=E0611 from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled from transformers.utils import ModelOutput from optimum.utils import logging -from optimum.habana.utils import HabanaProfile +from optimum.habana.utils import HabanaProfile # pylint: disable=E0611 from optimum.habana.transformers.integrations.deepspeed import unwrap_deepspeed_model from .configuration_utils import GaudiGenerationConfig @@ -1755,7 +1755,8 @@ def sample( eos_token_id = [eos_token_id] eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_logits = output_logits if output_logits is not None else self.generation_config.output_logits + output_logits = output_logits if output_logits is not None else \ + self.generation_config.output_logits # pylint: disable=E1101 output_attentions = ( output_attentions if output_attentions is not None else self.generation_config.output_attentions ) @@ -2088,7 +2089,8 @@ def beam_search( if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_logits = output_logits if output_logits is not None else self.generation_config.output_logits + output_logits = output_logits if output_logits is not None else \ + self.generation_config.output_logits# pylint: disable=E1101 output_attentions = ( output_attentions if output_attentions is not None else self.generation_config.output_attentions ) @@ -2979,7 +2981,8 @@ def constrained_beam_search( if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_logits = output_logits if output_logits is not None else self.generation_config.output_logits + output_logits = output_logits if output_logits is not None else \ + 
self.generation_config.output_logits # pylint: disable=E1101 output_attentions = ( output_attentions if output_attentions is not None else self.generation_config.output_attentions ) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py index 94da9f563ea..11dcf4e6d34 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/llama/modeling_llama.py @@ -18,7 +18,7 @@ import torch import torch.nn.functional as F -from transformers.cache_utils import Cache, DynamicCache, StaticCache +from transformers.cache_utils import Cache, DynamicCache, StaticCache # pylint: disable=E0611 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast from transformers.models.llama.configuration_llama import LlamaConfig from transformers.models.llama.modeling_llama import ( @@ -1158,4 +1158,4 @@ def apply_customized_rope(q, k, cos, sin, position_ids): ) else: - return apply_rotary_pos_emb(q, k, cos[position_ids], sin[position_ids]) + return apply_rotary_pos_emb(q, k, cos[position_ids], sin[position_ids]) # pylint: disable=E1120 diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py index 75d5236da8e..f6411ef17ce 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py @@ -625,7 +625,7 @@ def forward( self.num_experts, self.num_experts_per_tok, attention_mask, - ) + ) # pylint: disable=E1121 if labels is not None: loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device From 22557de051efb3a11f409764197a61b8fcae2b3e Mon Sep 17 00:00:00 2001 From: VincyZhang Date: Thu, 23 May 2024 18:04:17 +0800 Subject: [PATCH 21/25] Update pylint.sh Signed-off-by: VincyZhang --- .github/workflows/script/formatScan/pylint.sh | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/.github/workflows/script/formatScan/pylint.sh b/.github/workflows/script/formatScan/pylint.sh index e9263493051..eeb71beb604 100644 --- a/.github/workflows/script/formatScan/pylint.sh +++ b/.github/workflows/script/formatScan/pylint.sh @@ -43,25 +43,15 @@ python -m pylint -f json --disable=R,C,W,E1129 \ --max-line-length=120 \ --extension-pkg-whitelist=numpy,nltk \ --ignored-classes=TensorProto,NodeProto \ - --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,neural_compressor,neural_compressor.benchmark,intel_extension_for_transformers.neural_engine_py,intel_extension_for_transformers.qbits,intel_extension_for_transformers.transformers.modeling.modeling_gaudi,cv2,PIL.Image \ + 
--ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,neural_compressor,neural_compressor.benchmark,intel_extension_for_transformers.neural_engine_py,intel_extension_for_transformers.qbits,intel_extension_for_transformers.transformers.modeling.modeling_gaudi.generation.utils,intel_extension_for_transformers.transformers.modeling.modeling_gaudi.models.mixtral.modeling_mixtral,cv2,PIL.Image \ /intel-extension-for-transformers/intel_extension_for_transformers >${log_dir}/pylint.json -exit_code1=$? - -python -m pylint -f json --disable=R,C,W,E1129 \ - --enable=line-too-long \ - --max-line-length=120 \ - --disable=no-name-in-module,import-error,no-member,undefined-variable,no-value-for-parameter,unexpected-keyword-arg,not-callable,no-self-argument,too-many-format-args,invalid-unary-operand-type,too-many-function-args \ - --extension-pkg-whitelist=numpy,nltk \ - --ignored-classes=TensorProto,NodeProto \ - --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,neural_compressor,neural_compressor.benchmark,intel_extension_for_transformers.neural_engine_py,intel_extension_for_transformers.qbits,intel_extension_for_transformers.transformers.modeling.modeling_gaudi,cv2,PIL.Image \ - /intel-extension-for-transformers/intel_extension_for_transformers >> ${log_dir}/pylint.json -exit_code2=$? +exit_code=$? $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" && $RESET cat ${log_dir}/pylint.json $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET -if [ ${exit_code1} -ne 0 ] || [ ${exit_code2} -ne 0 ]; then +if [ ${exit_code} -ne 0 ]; then $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view Pylint error details." 
&& $RESET exit 1 fi From 17614be382a241531d1151802fd62cfd517bc615 Mon Sep 17 00:00:00 2001 From: Clark Chin Date: Thu, 23 May 2024 19:18:41 +0800 Subject: [PATCH 22/25] fix line by line pylint Signed-off-by: Clark Chin --- .../modeling_gaudi/generation/utils.py | 20 +++++++++---------- .../models/mixtral/modeling_mixtral.py | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py index e78d099cd86..7fc383efe3b 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py @@ -24,13 +24,13 @@ import torch.distributed as dist from transformers.generation.beam_constraints import DisjunctiveConstraint, PhrasalConstraint from transformers.generation.beam_search import BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer -from transformers.generation.candidate_generator import CandidateGenerator # pylint: disable=E0611 +from transformers.generation.candidate_generator import CandidateGenerator # pylint: disable=E0401 from transformers.generation.logits_process import LogitsProcessorList from transformers.generation.stopping_criteria import ( StoppingCriteriaList, validate_stopping_criteria, ) -from transformers.generation.utils import ( +from transformers.generation.utils import ( # pylint: disable=E0611 NEED_SETUP_CACHE_CLASSES_MAPPING, GenerateBeamDecoderOnlyOutput, GenerateBeamEncoderDecoderOutput, @@ -43,14 +43,14 @@ GenerationMode, _split_model_inputs, stack_model_outputs, -) # pylint: disable=E0611 +) from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled from transformers.utils import ModelOutput from optimum.utils import logging from optimum.habana.utils import HabanaProfile # pylint: disable=E0611 -from optimum.habana.transformers.integrations.deepspeed import unwrap_deepspeed_model +from optimum.habana.transformers.integrations.deepspeed import unwrap_deepspeed_model # pylint: disable=E0611 from .configuration_utils import GaudiGenerationConfig @@ -1755,8 +1755,8 @@ def sample( eos_token_id = [eos_token_id] eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_logits = output_logits if output_logits is not None else \ - self.generation_config.output_logits # pylint: disable=E1101 + if output_logits is None: + output_logits = self.generation_config.output_logits # pylint: disable=E1101 output_attentions = ( output_attentions if output_attentions is not None else self.generation_config.output_attentions ) @@ -2089,8 +2089,8 @@ def beam_search( if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_logits = output_logits if output_logits is not None else \ - self.generation_config.output_logits# pylint: disable=E1101 + if output_logits is None: + output_logits = self.generation_config.output_logits # pylint: disable=E1101 output_attentions = ( output_attentions if output_attentions is not None else self.generation_config.output_attentions ) @@ -2981,8 +2981,8 @@ def constrained_beam_search( if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] 
output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_logits = output_logits if output_logits is not None else \ - self.generation_config.output_logits # pylint: disable=E1101 + if output_logits is None: + output_logits = self.generation_config.output_logits # pylint: disable=E1101 output_attentions = ( output_attentions if output_attentions is not None else self.generation_config.output_attentions ) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py index f6411ef17ce..021fde45c75 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py @@ -620,12 +620,12 @@ def forward( aux_loss = None if output_router_logits: - aux_loss = load_balancing_loss_func( + aux_loss = load_balancing_loss_func( # pylint: disable=E1121 outputs.router_logits if return_dict else outputs[-1], self.num_experts, self.num_experts_per_tok, attention_mask, - ) # pylint: disable=E1121 + ) if labels is not None: loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device From e02739f9870351315ef707223113c495e87e7546 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 23 May 2024 11:19:29 +0000 Subject: [PATCH 23/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py index 021fde45c75..590663c707c 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py @@ -625,7 +625,7 @@ def forward( self.num_experts, self.num_experts_per_tok, attention_mask, - ) + ) if labels is not None: loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device From 2661a1e2030310294b0223a49c933fd26d5191cf Mon Sep 17 00:00:00 2001 From: Clark Chin Date: Thu, 23 May 2024 19:24:30 +0800 Subject: [PATCH 24/25] disable before the line Signed-off-by: Clark Chin --- .../modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py index 590663c707c..cfda2b0bff1 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/models/mixtral/modeling_mixtral.py @@ -620,7 +620,8 @@ def forward( aux_loss = None if output_router_logits: - aux_loss = load_balancing_loss_func( # pylint: disable=E1121 + # pylint: disable=E1121 + aux_loss = load_balancing_loss_func( 
outputs.router_logits if return_dict else outputs[-1], self.num_experts, self.num_experts_per_tok, From cd82c90cb1865d8be33b6d1b7b3fe725de5d9edb Mon Sep 17 00:00:00 2001 From: Clark Chin Date: Fri, 24 May 2024 09:16:52 +0800 Subject: [PATCH 25/25] pylint check Signed-off-by: Clark Chin --- .../modeling/modeling_gaudi/generation/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py index 7fc383efe3b..770c53e9456 100755 --- a/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_gaudi/generation/utils.py @@ -24,7 +24,8 @@ import torch.distributed as dist from transformers.generation.beam_constraints import DisjunctiveConstraint, PhrasalConstraint from transformers.generation.beam_search import BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer -from transformers.generation.candidate_generator import CandidateGenerator # pylint: disable=E0401 +# pylint: disable=E0401,E0611 +from transformers.generation.candidate_generator import CandidateGenerator from transformers.generation.logits_process import LogitsProcessorList from transformers.generation.stopping_criteria import ( StoppingCriteriaList, @@ -50,7 +51,8 @@ from optimum.utils import logging from optimum.habana.utils import HabanaProfile # pylint: disable=E0611 -from optimum.habana.transformers.integrations.deepspeed import unwrap_deepspeed_model # pylint: disable=E0611 +# pylint: disable=E0401,E0611 +from optimum.habana.transformers.integrations.deepspeed import unwrap_deepspeed_model from .configuration_utils import GaudiGenerationConfig
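Patches 20 through 25 mostly shuffle where the suppression comment sits. A short sketch of the two placements the series ends with; the imported names are the ones touched in generation/utils.py, but the snippet is illustrative rather than a copy of that file, and the scope notes describe general pylint behaviour rather than anything specific to these patches:

    # Placed on its own line ("disable before the line", patches 24 and 25),
    # the comment covers the code that follows it in the same scope, so one
    # line can quiet several Gaudi-only imports at once.
    # pylint: disable=E0401,E0611
    from optimum.habana.utils import HabanaProfile
    from optimum.habana.transformers.integrations.deepspeed import unwrap_deepspeed_model

    # Placed at the end of a statement, the disable applies to that line only,
    # which is the form kept for single imports such as this one.
    from transformers.generation.candidate_generator import CandidateGenerator  # pylint: disable=E0611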