This repository was archived by the owner on Oct 25, 2024. It is now read-only.

add gaudi modeling support in itrex #1438

Merged: 28 commits merged into main from gaudi-support on May 24, 2024.
Commits (28)
56a2893  add gaudi modeling support in itrex (ClarkChin08, Mar 29, 2024)
e0613ad  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Mar 29, 2024)
69e81b3  Add test example to itrex (ClarkChin08, Apr 9, 2024)
6454315  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Apr 9, 2024)
72a4910  add fp8 support and fix bugs (ClarkChin08, Apr 15, 2024)
4ad3b04  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Apr 15, 2024)
63f0fc0  Create requirements.txt (airMeng, Apr 24, 2024)
1da9dfb  add ppl measurement in gaudi (ClarkChin08, Apr 25, 2024)
fb2f7cc  fix the ppl acc issue (ClarkChin08, Apr 26, 2024)
360e32f  [Gaudi] Add LLAMA Streaming LLM in Gaudi (#1558) (zhentaoyu, May 22, 2024)
3a934c5  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], May 22, 2024)
fb2966e  fix the pylint issue (ClarkChin08, May 22, 2024)
de64700  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], May 22, 2024)
455e5c3  fix the pylint issue (ClarkChin08, May 22, 2024)
83a42b2  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], May 22, 2024)
80476f1  add optimum-habana when pylint (ClarkChin08, May 22, 2024)
ad793d4  Merge branch 'main' into gaudi-support (VincyZhang, May 22, 2024)
5aedccb  add pylint comment (ClarkChin08, May 22, 2024)
092ffb6  Merge branch 'main' into gaudi-support (VincyZhang, May 22, 2024)
72f313f  add comment to avoid pylint check (ClarkChin08, May 23, 2024)
4642ee0  ignore modeling_gaudi pylint (ClarkChin08, May 23, 2024)
0f714b5  manual fix the pylint (ClarkChin08, May 23, 2024)
22557de  Update pylint.sh (VincyZhang, May 23, 2024)
b8b7de5  Merge branch 'main' into gaudi-support (VincyZhang, May 23, 2024)
17614be  fix line by line pylint (ClarkChin08, May 23, 2024)
e02739f  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], May 23, 2024)
2661a1e  disable before the line (ClarkChin08, May 23, 2024)
cd82c90  pylint check (ClarkChin08, May 24, 2024)
Diff view (changes from 2 of the 28 commits)
@@ -17,7 +17,7 @@
 from optimum.habana.transformers.generation.utils import MODELS_OPTIMIZED_WITH_STATIC_SHAPES
 if "llava" not in MODELS_OPTIMIZED_WITH_STATIC_SHAPES:
     MODELS_OPTIMIZED_WITH_STATIC_SHAPES.append("llava")
-from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+from intel_extension_for_transformers.transformers.modeling.modeling_gaudi import adapt_transformers_to_gaudi
 adapt_transformers_to_gaudi()

 import torch
@@ -458,7 +458,7 @@ def load_model(
 # Tweak generation so that it runs faster on Gaudi
 # pylint: disable=E0401
 # pylint: disable=E0611
-from optimum.habana.transformers.modeling_utils import (
+from intel_extension_for_transformers.transformers.modeling.modeling_gaudi import (
     adapt_transformers_to_gaudi,
 )

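The two hunks above only change where adapt_transformers_to_gaudi is imported from: it now lives in the new modeling_gaudi package inside ITREX instead of optimum.habana. A minimal sketch of the intended call order is below; the checkpoint name and the .to("hpu") step are illustrative assumptions, not code from this PR.

# Sketch only: assumes a Habana/HPU PyTorch runtime (habana_frameworks.torch) and
# optimum-habana are installed alongside intel-extension-for-transformers.
from optimum.habana.transformers.generation.utils import MODELS_OPTIMIZED_WITH_STATIC_SHAPES
from intel_extension_for_transformers.transformers.modeling.modeling_gaudi import (
    adapt_transformers_to_gaudi,
)

# Register llava for static-shape generation, as the first hunk above does.
if "llava" not in MODELS_OPTIMIZED_WITH_STATIC_SHAPES:
    MODELS_OPTIMIZED_WITH_STATIC_SHAPES.append("llava")

# Patch transformers with the Gaudi-optimized code paths before any model is built.
adapt_transformers_to_gaudi()

from transformers import AutoModelForCausalLM

model_name = "meta-llama/Llama-2-7b-hf"  # placeholder checkpoint, not taken from the PR
model = AutoModelForCausalLM.from_pretrained(model_name).eval().to("hpu")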
@@ -0,0 +1,15 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .modeling_utils import adapt_transformers_to_gaudi
@@ -0,0 +1,20 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .configuration_utils import GaudiGenerationConfig
from .stopping_criteria import (
    gaudi_MaxLengthCriteria_call,
    gaudi_MaxNewTokensCriteria_call,
)
from .utils import MODELS_OPTIMIZED_WITH_STATIC_SHAPES, GaudiGenerationMixin
@@ -0,0 +1,68 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from transformers.generation import GenerationConfig


class GaudiGenerationConfig(GenerationConfig):
    """
    This class extends [`transformers.generation.GenerationConfig`](https://github.com/huggingface/transformers/blob/main/src/transformers/generation/configuration_utils.py)
    to add HPU-specific arguments for generation.

    Args:
        trim_logits (`bool`, *optional*):
            Calculate logits only for the last token to save memory in the first step.
        static_shapes (`bool`, *optional*):
            Whether to use static shapes for generation or not. It will run faster on HPUs with static shapes,
            but not all models support it. If not specified, it will automatically be set to `True` if the given
            model supports it.
        ignore_eos (`bool`, *optional*):
            Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode).
            If not specified, it will automatically be set to `True` if lazy mode is on.
        attn_softmax_bf16 (`bool`, *optional*):
            Whether to run the attention softmax layer in lower precision, provided that the model supports it
            and is also running in lower precision.
        limit_hpu_graphs (`bool`, *optional*):
            Skip HPU graph usage for the first token to save memory.
        reuse_cache (`bool`, *optional*):
            Whether to reuse the key/value cache for decoding. It should save memory.
        bucket_size (`int`, *optional*):
            If negative (default=-1), pad to the maximum length when `static_shapes` is set. Otherwise start with
            `shape = bucket_size * ceil(prompt_len/bucket_size)` and then grow the space by `bucket_size` when needed.
            Only active if `static_shapes` is used. Can't be used with `reuse_cache`.
        bucket_internal (`bool`, *optional*):
            Split the KV sequence into buckets in the decode phase. It improves throughput when `max_new_tokens` is large.
        kv_cache_fp8 (`bool`, *optional*):
            Store the KV cache in float8 when a KV cache is used.
        use_flash_attention (`bool`, *optional*):
            Whether to use the flash attention optimization.
        flash_attention_recompute (`bool`, *optional*):
            Whether to enable recompute when using Habana flash attention.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.trim_logits = kwargs.get("trim_logits", None)
        self.static_shapes = kwargs.get("static_shapes", None)
        self.ignore_eos = kwargs.get("ignore_eos", None)
        self.attn_softmax_bf16 = kwargs.get("attn_softmax_bf16", None)
        self.limit_hpu_graphs = kwargs.get("limit_hpu_graphs", None)
        self.reuse_cache = kwargs.get("reuse_cache", None)
        self.bucket_size = kwargs.get("bucket_size", -1)
        self.bucket_internal = kwargs.get("bucket_internal", None)
        self.reduce_recompile = kwargs.get("reduce_recompile", None)
        self.kv_cache_fp8 = kwargs.get("kv_cache_fp8", None)
        self.use_flash_attention = kwargs.get("use_flash_attention", None)
        self.flash_attention_recompute = kwargs.get("flash_attention_recompute", None)
        self.use_fused_rope = kwargs.get("use_fused_rope", None)
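
For reference, a hedged usage sketch of the configuration class above. The import path is assumed from the package layout this PR introduces (the diff view does not show the new file names), and the flag values are examples rather than recommended settings.

from intel_extension_for_transformers.transformers.modeling.modeling_gaudi.generation import (
    GaudiGenerationConfig,  # assumed module path; adjust to the actual package layout
)

generation_config = GaudiGenerationConfig(
    max_new_tokens=128,        # standard GenerationConfig argument
    do_sample=False,
    static_shapes=True,        # faster on HPU when the model supports static shapes
    ignore_eos=True,           # typically paired with lazy mode / HPU graphs
    attn_softmax_bf16=True,    # lower-precision softmax if the model also runs in bf16
    reuse_cache=True,          # reuse the key/value cache during decoding
    bucket_size=128,           # grow the padded length in bucket_size steps instead of padding to max
)

# The HPU-specific flags are picked up from kwargs in __init__ above; the object would
# then be passed as generation_config=... to generate() on a Gaudi-adapted model.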
@@ -0,0 +1,46 @@
# coding=utf-8
# Copyright 2022 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch

from optimum.utils import logging


logger = logging.get_logger(__name__)


def gaudi_MaxLengthCriteria_call(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
    # With static shapes on HPU, input_ids is pre-padded, so the caller passes token_idx
    # to indicate the current generation position instead of relying on the tensor length.
    token_idx = kwargs.get("token_idx", None)
    if token_idx is not None:
        return token_idx >= self.max_length
    else:
        cur_len = input_ids.shape[-1]
        is_done = cur_len >= self.max_length
        if self.max_position_embeddings is not None and not is_done and cur_len >= self.max_position_embeddings:
            logger.warning_once(
                "This is a friendly reminder - the current text generation call will exceed the model's predefined "
                f"maximum length ({self.max_position_embeddings}). Depending on the model, you may observe "
                "exceptions, performance degradation, or nothing at all."
            )
        return is_done


def gaudi_MaxNewTokensCriteria_call(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
    token_idx = kwargs.get("token_idx", None)
    if token_idx is not None:
        return token_idx >= self.max_length
    else:
        return input_ids.shape[-1] >= self.max_length
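
The two gaudi_*_call functions replace the __call__ methods of the corresponding transformers stopping criteria so that, when token_idx is supplied, completion is judged by the token index rather than by the padded sequence length. One plausible way to attach them, shown only as a sketch since the actual patching happens inside adapt_transformers_to_gaudi, whose body is not part of this diff:

# Sketch under the assumption that patching __call__ on the criteria classes is how the
# overrides are applied; both classes existed in transformers at the time of this PR
# (MaxNewTokensCriteria has since been deprecated upstream).
from transformers.generation.stopping_criteria import MaxLengthCriteria, MaxNewTokensCriteria

MaxLengthCriteria.__call__ = gaudi_MaxLengthCriteria_call
MaxNewTokensCriteria.__call__ = gaudi_MaxNewTokensCriteria_call

# A Gaudi-aware generation loop can then forward the current position:
#     done = stopping_criteria(input_ids, scores, token_idx=token_idx)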