
Commit 58bfc00

initial code for GPU Shape Recommender
1 parent dc1f21b commit 58bfc00

File tree

9 files changed (+938, -5 lines)


ads/aqua/cli.py

Lines changed: 9 additions & 5 deletions
@@ -14,6 +14,7 @@
 from ads.aqua.finetuning import AquaFineTuningApp
 from ads.aqua.model import AquaModelApp
 from ads.aqua.modeldeployment import AquaDeploymentApp
+from ads.aqua.shaperecommend.recommend import AquaRecommendApp
 from ads.common.utils import LOG_LEVELS
 
 
@@ -29,6 +30,7 @@ class AquaCommand:
     fine_tuning = AquaFineTuningApp
     deployment = AquaDeploymentApp
     evaluation = AquaEvaluationApp
+    recommend = AquaRecommendApp
 
     def __init__(
         self,
@@ -94,18 +96,20 @@ def _validate_value(flag, value):
                 "If you intend to chain a function call to the result, please separate the "
                 "flag and the subsequent function call with separator `-`."
             )
-
+
     @staticmethod
     def install():
         """Install ADS Aqua Extension from wheel file. Set enviroment variable `AQUA_EXTENSTION_PATH` to change the wheel file path.
 
-        Return
+        Return
         ------
         int:
             Installatation status.
         """
         import subprocess
 
-        wheel_file_path = os.environ.get("AQUA_EXTENSTION_PATH", "/ads/extension/adsjupyterlab_aqua_extension*.whl")
-        status = subprocess.run(f"pip install {wheel_file_path}",shell=True)
-        return status.check_returncode
+        wheel_file_path = os.environ.get(
+            "AQUA_EXTENSTION_PATH", "/ads/extension/adsjupyterlab_aqua_extension*.whl"
+        )
+        status = subprocess.run(f"pip install {wheel_file_path}", shell=True)
+        return status.check_returncode
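
With this change, AquaRecommendApp is exposed on the Aqua CLI as the `recommend` command. A minimal sketch of driving the same app from Python, assuming which_gpu() accepts its inputs as keyword arguments the way the REST handler forwards its JSON body (the model_id parameter name is an assumption, not confirmed by this commit):

# Hypothetical usage sketch; parameter names are assumptions.
from ads.aqua.shaperecommend.recommend import AquaRecommendApp

app = AquaRecommendApp()
shapes = app.which_gpu(model_id="ocid1.datasciencemodel.oc1..<unique_id>")
print(shapes)  # expected: list of eligible GPU compute shapes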

ads/aqua/extension/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,7 @@
 )
 from ads.aqua.extension.evaluation_handler import __handlers__ as __eval_handlers__
 from ads.aqua.extension.finetune_handler import __handlers__ as __finetune_handlers__
+from ads.aqua.extension.gpu_recommend_handler import __handlers__ as __gpu_handlers__
 from ads.aqua.extension.model_handler import __handlers__ as __model_handlers__
 from ads.aqua.extension.ui_handler import __handlers__ as __ui_handlers__
 from ads.aqua.extension.ui_websocket_handler import __handlers__ as __ws_handlers__
@@ -24,6 +25,7 @@
     + __ui_handlers__
     + __eval_handlers__
     + __ws_handlers__
+    + __gpu_handlers__
 )

ads/aqua/extension/gpu_recommend_handler.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
from tornado.web import HTTPError

from ads.aqua.common.decorator import handle_exceptions
from ads.aqua.extension.base_handler import AquaAPIhandler
from ads.aqua.extension.errors import Errors
from ads.aqua.shaperecommend.recommend import AquaRecommendApp
from ads.config import COMPARTMENT_OCID


class AquaRecommendHandler(AquaAPIhandler):
    """
    Handler for Aqua GPU Recommendation REST APIs.

    Methods
    -------
    get(self, id: Union[str, List[str]])
        Retrieves a list of AQUA deployments or model info or logs by ID.
    post(self, *args, **kwargs)
        Obtains the eligible compute shapes that would fit the specified model, context length, model weights, and quantization level.

    Raises
    ------
    HTTPError: For various failure scenarios such as invalid input format, missing data, etc.
    """

    @handle_exceptions
    def post(self, *args, **kwargs):  # noqa: ARG002
        """
        Lists the eligible GPU compute shapes for the specified model.

        Returns
        -------
        List[ComputeShapeSummary]:
            The list of the model deployment shapes.
        """
        try:
            input_data = self.get_json_body()
            # input_data["compartment_id"] = self.get_argument("compartment_id", default=COMPARTMENT_OCID)
        except Exception as ex:
            raise HTTPError(400, Errors.INVALID_INPUT_DATA_FORMAT) from ex

        if not input_data:
            raise HTTPError(400, Errors.NO_INPUT_DATA)

        self.finish(AquaRecommendApp().which_gpu(**input_data))


__handlers__ = [
    ("gpu-shape-recommendation/?([^/]*)", AquaRecommendHandler),
]
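
A minimal sketch of how a client might exercise the new route once the extension is installed; the server URL, route prefix, and payload keys are assumptions, not part of this commit:

# Hypothetical request against the new handler; adjust the URL/prefix and
# payload keys to match the deployed extension and AquaRecommendApp.which_gpu().
import json
from urllib.request import Request, urlopen

payload = {"model_id": "ocid1.datasciencemodel.oc1..<unique_id>"}
req = Request(
    "http://localhost:8888/aqua/gpu-shape-recommendation",  # assumed base URL/prefix
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urlopen(req) as resp:
    print(resp.read().decode())  # expected: eligible compute shape summaries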

ads/aqua/shaperecommend/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
#!/usr/bin/env python
# Copyright (c) 2025 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
from ads.aqua.shaperecommend.recommend import AquaGPURecommendApp

__all__ = ["AquaGPURecommendApp"]

ads/aqua/shaperecommend/constants.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
#!/usr/bin/env python
# Copyright (c) 2024, 2025 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

"""
aqua.shaperecommend.constants
~~~~~~~~~~~~~~

This module contains constants used in Aqua GPU Recommendation for Models.

LLAMA_REQUIRED_FIELDS refer to fields necessary for calculating model memory for GQA Architecture Models

MOE_REQUIRED_FIELDS refer to fields necessary for Mixture of Experts (MoE) Architecture Models

NEXT_QUANT suggests the next quantization level based on the current quantization (if applied) or the model weights (if no quantization yet)
"""
LLAMA_REQUIRED_FIELDS = [
    "num_hidden_layers", "hidden_size", "num_attention_heads",
    "num_key_value_heads", "head_dim", "intermediate_size", "vocab_size"
]

MOE_REQUIRED_FIELDS = LLAMA_REQUIRED_FIELDS + [
    "num_local_experts", "intermediate_size"
]

NEXT_QUANT = {
    "float32": ["bfloat16", "float16", "int8"],
    "bfloat16": ["float16", "int8"],
    "float16": ["int8"],
    "int8": ["8bit", "4bit (Not Recommended)"],
    "8bit": ["4bit (Not Recommended)"],
    "4bit": ["No smaller quantization available"]
}
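
An illustrative lookup against the NEXT_QUANT table above, showing how a caller could suggest smaller quantization options for a model currently served in bfloat16:

# Suggest the next quantization levels for a bfloat16 model.
from ads.aqua.shaperecommend.constants import NEXT_QUANT

current_weight_dtype = "bfloat16"
print(NEXT_QUANT.get(current_weight_dtype, []))  # ['float16', 'int8']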

ads/aqua/shaperecommend/estimator.py

Lines changed: 213 additions & 0 deletions
@@ -0,0 +1,213 @@
#!/usr/bin/env python
# Copyright (c) 2025 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
from typing import Optional

from pydantic import BaseModel, Field

from ads.aqua.app import logger
from ads.aqua.shaperecommend.constants import LLAMA_REQUIRED_FIELDS, MOE_REQUIRED_FIELDS
from ads.aqua.shaperecommend.llm_config import LLMConfig


class MemoryEstimator(BaseModel):
    """
    The generic estimator for Transformer Architecture models (OPT/Bloom).
    Used as a fallback estimator if the identified model is not a MoE or GQA Architecture Model.
    Has properties to estimate the KV Cache size, Model size, and total footprint (KV Cache + Model size).
    """

    llm_config: LLMConfig = Field(
        ...,
        description="The model's config.json file with the necessary parameters for model size and KV cache estimation.",
    )
    batch_size: int = (
        1  # we assume that estimation for batch sizes is not supported yet
    )
    seq_len: Optional[int] = Field(
        4096, description="The max-seq-len to estimate the size of the KV cache."
    )

    @property
    def kv_cache_memory(self) -> float:
        """
        Estimates the KV cache size (in GB) using the LLM config.json parameters.

        Uses num_attention_heads (assumes no GQA; each attention head has its own query, key, value) for estimation.
        """
        seq_len = self.seq_len or self.llm_config.max_seq_len
        c = self.llm_config
        kv_cache_dtype_bytes = (
            c.bytes_per_parameter
        )  # vLLM uses model's weight/quantization applied to KV cache

        total_bytes = (
            self.batch_size
            * c.num_hidden_layers
            * 2
            * c.num_attention_heads
            * seq_len
            * c.head_dim
            * kv_cache_dtype_bytes
        )
        return total_bytes / 1e9

    @property
    def model_memory(self) -> float:
        """
        Estimates the model size (in GB) based on the estimated model parameter count and model weights.

        Model Parameter estimation: Standard decoder-only, untied/tied embeddings possible.
        """
        c = self.llm_config
        embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
        embedding_params = (
            embedding_count * c.vocab_size * c.hidden_size
        )  # input and output untied
        layer_params = 12 * c.num_hidden_layers * (c.hidden_size**2)  # GPT-style
        num_params = layer_params + embedding_params

        return num_params * c.bytes_per_parameter / 1e9

    # @property
    # def model_overhead(self) -> float:
    #     overhead = max(1, math.ceil(0.0 * self.model_memory))
    #     return overhead

    @property
    def total_memory(self) -> float:
        """
        Computes the total memory footprint of the model (KV cache & model size from estimated parameters).
        """
        return self.model_memory + self.kv_cache_memory


# Specialized estimators:
class LlamaMemoryEstimator(MemoryEstimator):
    """
    Estimator for GQA-type architectures. Handles tied (memory savings) and untied embeddings,
    and uses grouped attention (GQA) for more efficient KV cache memory estimation.

    KV cache: uses num_key_value_heads (assumes GQA)
    Model Parameter estimation: Standard decoder-only, untied/tied embeddings possible
    """

    @property
    def model_memory(self) -> float:
        """
        Returns estimated model parameter memory (in GB), accurately accounting
        for Llama-style attention and MLP, and tied or untied embeddings.
        """
        c = self.llm_config

        embedding_params, attn_params = self._calc_attn_embed_params()

        # MLP params
        gate_proj = c.hidden_size * c.intermediate_size
        up_proj = c.hidden_size * c.intermediate_size
        down_proj = c.intermediate_size * c.hidden_size
        mlp_params = gate_proj + up_proj + down_proj

        # Total per-layer
        layer_params = attn_params + mlp_params
        # Total params
        num_params = c.num_hidden_layers * layer_params + embedding_params
        return num_params * c.bytes_per_parameter / 1e9

    @property
    def kv_cache_memory(self) -> float:
        """
        Returns estimated KV cache memory in GB for GQA models.

        Grouped Query Attention uses num_key_value_heads, where groups of Q heads share a K and V projection.
        num_key_value_heads < num_attention_heads, which reduces the KV Cache size.
        """
        c = self.llm_config
        seq_len = self.seq_len or getattr(c, "max_seq_len", 2048)
        kv_cache_dtype_bytes = c.bytes_per_parameter
        kv_heads = c.num_key_value_heads

        total_bytes = (
            self.batch_size
            * c.num_hidden_layers
            * 2
            * kv_heads
            * seq_len
            * c.head_dim
            * kv_cache_dtype_bytes
        )
        return total_bytes / 1e9

    def _calc_attn_embed_params(self) -> tuple:
        """
        Returns the embedding parameter count and attention parameter count for Llama-family (GQA) models.
        """
        c = self.llm_config

        # Embedding parameters
        # assume tied embeddings unless tie_word_embeddings = False
        embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
        embedding_params = embedding_count * c.vocab_size * c.hidden_size

        q_proj = c.hidden_size * c.hidden_size
        k_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
        v_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
        o_proj = c.hidden_size * c.hidden_size
        attn_params = q_proj + k_proj + v_proj + o_proj

        return embedding_params, attn_params


class MixtureMemoryEstimator(LlamaMemoryEstimator):
    """
    Estimator for Mixture-of-Experts (MoE) architectures (e.g., Mixtral, MoE Llama).
    Adds the extra expert MLP block parameter count to the LlamaMemoryEstimator logic.
    """

    @property
    def model_memory(self) -> float:
        """
        Accounts for the increase in model parameters due to additional expert MLP blocks in MoE Models.

        Returns the estimated memory size of the MoE Model (in GB).
        """
        c = self.llm_config
        # Attention parameter count (Llama-style)
        embedding_params, attn_params = self._calc_attn_embed_params()

        # MoE MLP params per layer
        moe_params_per_layer = (
            c.num_local_experts * 3 * c.hidden_size * c.intermediate_size
        )
        total_params = (
            c.num_hidden_layers * (attn_params + moe_params_per_layer)
            + embedding_params
        )

        # Convert to GB
        return total_params * c.bytes_per_parameter / 1e9


def get_estimator(llm_config, **kwargs) -> MemoryEstimator:
    """
    Selects the correct estimator based on the parameters defined in the config.json.
    See constants.py for the LLMConfig parameters required by specific estimators.
    Uses MemoryEstimator as a fallback if parameters needed for GQA and MoE Architectures are missing.

    Returns the appropriate MemoryEstimator based on the fields defined by the model's config.json (as represented by LLMConfig).
    """
    if all(
        hasattr(llm_config, f) and getattr(llm_config, f) is not None
        for f in MOE_REQUIRED_FIELDS
    ):
        return MixtureMemoryEstimator(llm_config=llm_config, **kwargs)
    elif all(
        hasattr(llm_config, f) and getattr(llm_config, f) is not None
        for f in LLAMA_REQUIRED_FIELDS
    ):
        return LlamaMemoryEstimator(llm_config=llm_config, **kwargs)
    else:
        logger.warning(
            "Falling back to generic GPT estimator: required fields missing from config.json file in model."
        )
        return MemoryEstimator(llm_config=llm_config, **kwargs)
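
For intuition, a standalone sketch of the KV-cache arithmetic these estimators use, with Llama-2-7B-style values (32 layers, 32 KV heads, head_dim 128, 4k context, fp16 weights); the numbers are illustrative and not taken from this commit:

# batch * layers * 2 (K and V) * kv_heads * seq_len * head_dim * bytes_per_param
batch_size = 1
num_hidden_layers = 32
num_key_value_heads = 32      # no GQA: KV heads == attention heads
seq_len = 4096
head_dim = 128
bytes_per_parameter = 2       # fp16 / bf16

total_bytes = (
    batch_size * num_hidden_layers * 2 * num_key_value_heads
    * seq_len * head_dim * bytes_per_parameter
)
print(f"{total_bytes / 1e9:.2f} GB")  # ~2.15 GB of KV cache at 4k context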
