Skip to content

Commit 02a3ce5

Browse files
authored
ODSC-68333: Adds Config Validation for Multi-Model Deployment (#1068)
2 parents 6a893b6 + 9958da0 commit 02a3ce5

File tree

6 files changed

+666
-99
lines changed

6 files changed

+666
-99
lines changed

README-development.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ All the unit tests can be found [here](https://github.com/oracle/accelerated-dat
248248
The following commands detail how the unit tests can be run.
249249
```
250250
# Run all tests in AQUA project
251-
python -m pytest -q tests/unitary/with_extras/aqua/test_deployment.py
251+
python -m pytest -q tests/unitary/with_extras/aqua/*
252252
253253
# Run all tests specific to a module within the AQUA project (e.g. test_deployment.py, test_model.py, etc.)
254254
python -m pytest -q tests/unitary/with_extras/aqua/test_deployment.py

ads/aqua/modeldeployment/deployment.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
AquaDeploymentConfig,
4242
AquaDeploymentDetail,
4343
ConfigurationItem,
44+
ConfigValidationError,
4445
CreateModelDeploymentDetails,
4546
ModelDeploymentConfigSummary,
4647
)
@@ -156,6 +157,22 @@ def create(
156157
defined_tags=defined_tags,
157158
)
158159
else:
160+
model_ids = [model.model_id for model in create_deployment_details.models]
161+
162+
try:
163+
model_config_summary = self.get_multimodel_deployment_config(
164+
model_ids=model_ids
165+
)
166+
167+
if not model_config_summary.gpu_allocation:
168+
raise AquaValueError(model_config_summary.error_message)
169+
170+
create_deployment_details.validate_multimodel_deployment_feasibility(
171+
models_config_summary=model_config_summary
172+
)
173+
except ConfigValidationError as err:
174+
raise AquaValueError(f"{err}") from err
175+
159176
aqua_model = model_app.create_multi(
160177
models=create_deployment_details.models,
161178
compartment_id=compartment_id,

ads/aqua/modeldeployment/entities.py

Lines changed: 198 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from oci.data_science.models import ModelDeployment, ModelDeploymentSummary
88
from pydantic import BaseModel, Field, model_validator
99

10+
from ads.aqua import logger
1011
from ads.aqua.common.entities import AquaMultiModelRef, ShapeInfo
1112
from ads.aqua.common.enums import Tags
1213
from ads.aqua.config.utils.serializer import Serializable
@@ -142,101 +143,6 @@ class Config:
142143
extra = "ignore"
143144

144145

145-
class CreateModelDeploymentDetails(BaseModel):
146-
"""Class for creating Aqua model deployments."""
147-
148-
instance_shape: str = Field(
149-
..., description="The instance shape used for deployment."
150-
)
151-
display_name: str = Field(..., description="The name of the model deployment.")
152-
compartment_id: Optional[str] = Field(None, description="The compartment OCID.")
153-
project_id: Optional[str] = Field(None, description="The project OCID.")
154-
description: Optional[str] = Field(
155-
None, description="The description of the deployment."
156-
)
157-
model_id: Optional[str] = Field(None, description="The model OCID to deploy.")
158-
models: Optional[List[AquaMultiModelRef]] = Field(
159-
None, description="List of models for multimodel deployment."
160-
)
161-
instance_count: int = Field(
162-
None, description="Number of instances used for deployment."
163-
)
164-
log_group_id: Optional[str] = Field(
165-
None, description="OCI logging group ID for logs."
166-
)
167-
access_log_id: Optional[str] = Field(
168-
None,
169-
description="OCID for access logs. "
170-
"https://docs.oracle.com/en-us/iaas/data-science/using/model_dep_using_logging.htm",
171-
)
172-
predict_log_id: Optional[str] = Field(
173-
None,
174-
description="OCID for prediction logs."
175-
"https://docs.oracle.com/en-us/iaas/data-science/using/model_dep_using_logging.htm",
176-
)
177-
bandwidth_mbps: Optional[int] = Field(
178-
None, description="Bandwidth limit on the load balancer in Mbps."
179-
)
180-
web_concurrency: Optional[int] = Field(
181-
None, description="Number of worker processes/threads for handling requests."
182-
)
183-
server_port: Optional[int] = Field(
184-
None, description="Server port for the Docker container image."
185-
)
186-
health_check_port: Optional[int] = Field(
187-
None, description="Health check port for the Docker container image."
188-
)
189-
env_var: Optional[Dict[str, str]] = Field(
190-
default_factory=dict, description="Environment variables for deployment."
191-
)
192-
container_family: Optional[str] = Field(
193-
None, description="Image family of the model deployment container runtime."
194-
)
195-
memory_in_gbs: Optional[float] = Field(
196-
None, description="Memory (in GB) for the selected shape."
197-
)
198-
ocpus: Optional[float] = Field(
199-
None, description="OCPU count for the selected shape."
200-
)
201-
model_file: Optional[str] = Field(
202-
None, description="File used for model deployment."
203-
)
204-
private_endpoint_id: Optional[str] = Field(
205-
None, description="Private endpoint ID for model deployment."
206-
)
207-
container_image_uri: Optional[str] = Field(
208-
None,
209-
description="Image URI for model deployment container runtime "
210-
"(ignored for service-managed containers). "
211-
"Required parameter for BYOC based deployments if this parameter was not set during "
212-
"model registration.",
213-
)
214-
cmd_var: Optional[List[str]] = Field(
215-
None, description="Command variables for the container runtime."
216-
)
217-
freeform_tags: Optional[Dict] = Field(
218-
None, description="Freeform tags for model deployment."
219-
)
220-
defined_tags: Optional[Dict] = Field(
221-
None, description="Defined tags for model deployment."
222-
)
223-
224-
@model_validator(mode="before")
225-
@classmethod
226-
def validate(cls, values: Any) -> Any:
227-
"""Ensures exactly one of `model_id` or `models` is provided."""
228-
model_id = values.get("model_id")
229-
models = values.get("models")
230-
if bool(model_id) == bool(models): # Both set or both unset
231-
raise ValueError(
232-
"Exactly one of `model_id` or `models` must be provided to create a model deployment."
233-
)
234-
return values
235-
236-
class Config:
237-
extra = "ignore"
238-
239-
240146
class ShapeInfoConfig(Serializable):
241147
"""Describes how many memory and cpu to this model for specific shape.
242148
@@ -382,6 +288,17 @@ class GPUShapeAllocation(Serializable):
382288
class Config:
383289
extra = "allow"
384290

291+
class ConfigValidationError(Exception):
    """Error signalling that a deployment configuration failed validation.

    Instantiated without arguments, the exception carries a generic two-line
    message stating that the model group configuration is incompatible with
    the selected instance shape. Callers may pass their own message instead.
    """

    def __init__(
        self,
        message: str = (
            "Validation failed: The provided model group configuration is incompatible with the selected instance shape."
            "\n"
            "Please verify the GPU count per model and ensure multi-model deployment is supported for the chosen instance shape."
        ),
    ):
        # The default text is two sentences separated by a single newline.
        super().__init__(message)
385302

386303
class ModelDeploymentConfigSummary(Serializable):
387304
"""Top-level configuration model for OCI-based deployments.
@@ -413,3 +330,189 @@ class ModelDeploymentConfigSummary(Serializable):
413330

414331
class Config:
415332
extra = "allow"
333+
334+
335+
class CreateModelDeploymentDetails(BaseModel):
    """Request payload for creating an Aqua model deployment.

    Exactly one of ``model_id`` (single-model deployment) or ``models``
    (multi-model deployment) must be supplied; the ``validate`` model
    validator enforces this before field validation runs. Fields that are
    not declared here are ignored (``Config.extra = "ignore"``).
    """

    instance_shape: str = Field(
        ..., description="The instance shape used for deployment."
    )
    display_name: str = Field(..., description="The name of the model deployment.")
    compartment_id: Optional[str] = Field(None, description="The compartment OCID.")
    project_id: Optional[str] = Field(None, description="The project OCID.")
    description: Optional[str] = Field(
        None, description="The description of the deployment."
    )
    model_id: Optional[str] = Field(None, description="The model OCID to deploy.")
    models: Optional[List[AquaMultiModelRef]] = Field(
        None, description="List of models for multimodel deployment."
    )
    # The default is None, so the annotation must admit None as well.
    instance_count: Optional[int] = Field(
        None, description="Number of instances used for deployment."
    )
    log_group_id: Optional[str] = Field(
        None, description="OCI logging group ID for logs."
    )
    access_log_id: Optional[str] = Field(
        None,
        description="OCID for access logs. "
        "https://docs.oracle.com/en-us/iaas/data-science/using/model_dep_using_logging.htm",
    )
    predict_log_id: Optional[str] = Field(
        None,
        # Trailing space keeps the sentence and the URL from running together.
        description="OCID for prediction logs. "
        "https://docs.oracle.com/en-us/iaas/data-science/using/model_dep_using_logging.htm",
    )
    bandwidth_mbps: Optional[int] = Field(
        None, description="Bandwidth limit on the load balancer in Mbps."
    )
    web_concurrency: Optional[int] = Field(
        None, description="Number of worker processes/threads for handling requests."
    )
    server_port: Optional[int] = Field(
        None, description="Server port for the Docker container image."
    )
    health_check_port: Optional[int] = Field(
        None, description="Health check port for the Docker container image."
    )
    env_var: Optional[Dict[str, str]] = Field(
        default_factory=dict, description="Environment variables for deployment."
    )
    container_family: Optional[str] = Field(
        None, description="Image family of the model deployment container runtime."
    )
    memory_in_gbs: Optional[float] = Field(
        None, description="Memory (in GB) for the selected shape."
    )
    ocpus: Optional[float] = Field(
        None, description="OCPU count for the selected shape."
    )
    model_file: Optional[str] = Field(
        None, description="File used for model deployment."
    )
    private_endpoint_id: Optional[str] = Field(
        None, description="Private endpoint ID for model deployment."
    )
    container_image_uri: Optional[str] = Field(
        None,
        description="Image URI for model deployment container runtime "
        "(ignored for service-managed containers). "
        "Required parameter for BYOC based deployments if this parameter was not set during "
        "model registration.",
    )
    cmd_var: Optional[List[str]] = Field(
        None, description="Command variables for the container runtime."
    )
    freeform_tags: Optional[Dict] = Field(
        None, description="Freeform tags for model deployment."
    )
    defined_tags: Optional[Dict] = Field(
        None, description="Defined tags for model deployment."
    )

    @model_validator(mode="before")
    @classmethod
    def validate(cls, values: Any) -> Any:
        """Ensures exactly one of `model_id` or `models` is provided.

        Raises
        ------
        ValueError
            If both `model_id` and `models` are set, or neither is.
        """
        model_id = values.get("model_id")
        models = values.get("models")
        if bool(model_id) == bool(models):  # Both set or both unset
            raise ValueError(
                "Exactly one of `model_id` or `models` must be provided to create a model deployment."
            )
        return values

    def validate_multimodel_deployment_feasibility(
        self, models_config_summary: ModelDeploymentConfigSummary
    ) -> None:
        """Check that the model group can be deployed on ``instance_shape``.

        The checks run in this order and stop at the first failure:

        1. ``self.models`` is non-empty.
        2. ``instance_shape`` is a key of ``models_config_summary.gpu_allocation``.
        3. Every model id in the group has an entry in
           ``models_config_summary.deployment_config``.
        4. For each model: its deployment configuration lists the selected
           shape, and the user-supplied ``gpu_count`` is one of the GPU counts
           in that shape's ``multi_model_deployment`` configurations.
        5. The sum of all ``gpu_count`` values does not exceed
           ``total_gpus_available`` for the selected shape.

        Parameters
        ----------
        models_config_summary : ModelDeploymentConfigSummary
            Summary holding the GPU allocation per shape and the deployment
            configuration per model id.

        Raises
        ------
        ConfigValidationError
            If any of the checks above fails. Each failure is also logged at
            error level with the specific details.
        """
        if not self.models:
            logger.error(
                "User defined model group (List[AquaMultiModelRef]) is None."
            )
            raise ConfigValidationError(
                "Multi-model deployment requires at least one model, but none were provided. Please add one or more models to the model group to proceed."
            )

        selected_shape = self.instance_shape

        if selected_shape not in models_config_summary.gpu_allocation:
            logger.error(
                f"The model group is not compatible with the selected instance shape {selected_shape}"
            )
            raise ConfigValidationError(
                f"The model group is not compatible with the selected instance shape '{selected_shape}'. Select a different instance shape."
            )

        total_available_gpus = models_config_summary.gpu_allocation[
            selected_shape
        ].total_gpus_available

        model_deployment_config = models_config_summary.deployment_config

        # Explicit set difference rather than relying on `list - dict_keys`.
        missing_model_keys = {model.model_id for model in self.models} - set(
            model_deployment_config.keys()
        )

        if missing_model_keys:
            logger.error(
                f"Missing the following model entry with key {missing_model_keys} in ModelDeploymentConfigSummary"
            )
            raise ConfigValidationError(
                "One or more selected models are missing from the configuration, preventing validation for deployment on the given shape."
            )

        sum_model_gpus = 0

        for model in self.models:
            aqua_deployment_config = model_deployment_config[model.model_id]

            if selected_shape not in aqua_deployment_config.shape:
                logger.error(
                    f"Model with OCID {model.model_id} in the model group is not compatible with the selected instance shape: {selected_shape}"
                )
                raise ConfigValidationError(
                    "Select a different instance shape. One or more models in the group are incompatible with the selected instance shape."
                )

            # NOTE(review): guards against `multi_model_deployment` being None;
            # confirm against the ConfigurationItem definition.
            multi_model_configs = (
                aqua_deployment_config.configuration.get(
                    selected_shape, ConfigurationItem()
                ).multi_model_deployment
                or []
            )

            valid_gpu_configurations = [
                gpu_shape_config.gpu_count for gpu_shape_config in multi_model_configs
            ]
            if model.gpu_count not in valid_gpu_configurations:
                valid_gpu_str = ", ".join(map(str, valid_gpu_configurations))
                logger.error(
                    f"Model {model.model_id} allocated {model.gpu_count} GPUs by user, but its deployment configuration requires either {valid_gpu_str} GPUs."
                )
                raise ConfigValidationError(
                    "Change the GPU count for one or more models in the model group. Adjust GPU allocations per model or choose a larger instance shape."
                )

            # Accumulate only after the count has been validated, so an unset
            # or otherwise invalid gpu_count surfaces as ConfigValidationError
            # above instead of failing in the addition.
            sum_model_gpus += model.gpu_count

        if sum_model_gpus > total_available_gpus:
            logger.error(
                f"Selected shape {selected_shape} has {total_available_gpus} GPUs while model group has {sum_model_gpus} GPUs."
            )
            raise ConfigValidationError(
                "Total requested GPU count exceeds the available GPU capacity for the selected instance shape. Adjust GPU allocations per model or choose a larger instance shape."
            )

    class Config:
        extra = "ignore"

0 commit comments

Comments
 (0)