# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# Cluster-wide instance metadata service (IMDS) support level.
# Per the ParallelCluster reference, v1.0 accepts both IMDSv1 and IMDSv2.
Imds:
  ImdsSupport: v1.0
# Operating system used for the cluster nodes.
Image:
  Os: ubuntu2204
# Head node: instance, networking, storage, IAM and bootstrap scripts.
# ${...} placeholders must be substituted before the file is handed to
# ParallelCluster (NOTE(review): presumably via envsubst - confirm).
HeadNode:
  InstanceType: m5.8xlarge
  Networking:
    SubnetId: ${PUBLIC_SUBNET_ID}
    AdditionalSecurityGroups:
      - ${SECURITY_GROUP}
  Ssh:
    KeyName: ${KEY_PAIR_NAME}
  LocalStorage:
    RootVolume:
      Size: 500
      # Root volume of the head node, removed together with the instance.
      # /home is not stored here: it is mounted from the HomeDirs entry in
      # SharedStorage.
      DeleteOnTermination: true
  Iam:
    # Grant SSM management, S3 read-only, ECR read-only and Amazon Managed
    # Service for Prometheus remote-write access.
    AdditionalIamPolicies:
      - Policy: arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
      - Policy: arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess
      - Policy: arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
      - Policy: arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess
  CustomActions:
    # Scripts are listed in the order given under Sequence. URLs must not
    # carry leading whitespace inside the quotes, or the download location
    # is invalid.
    OnNodeConfigured:
      Sequence:
        - Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/docker/postinstall.sh'
        - Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/nccl/postinstall.sh'
          Args:
            - v2.23.4-1  # NCCL version
            - v1.11.0-aws  # AWS OFI NCCL version
        - Script: 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_slurm_exporter.sh'
        - Script: 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_head_node_exporter.sh'
        - Script: 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/geniac/1.architectures/2.aws-parallelcluster/deployment-guides/post-install-scripts/configure_pyxis_enroot_headnode.sh'
    OnNodeStart:
      Sequence:
        # Prometheus installer receives the remote-write URL and the region.
        - Script: 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/geniac/1.architectures/2.aws-parallelcluster/deployment-guides/post-install-scripts/install_prometheus.sh'
          Args:
            - ${AMPREMOTEWRITEURL}
            - ${AWS_REGION}
        - Script: 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/geniac/1.architectures/2.aws-parallelcluster/deployment-guides/post-install-scripts/install_grafana.sh'
  Imds:
    # NOTE(review): per the ParallelCluster reference, false leaves head-node
    # IMDS access unrestricted for all users - confirm this is intended.
    Secured: false
# Slurm scheduler settings and the single GPU compute queue.
# ${...} placeholders must be substituted before the file is handed to
# ParallelCluster (NOTE(review): presumably via envsubst - confirm).
Scheduling:
  Scheduler: slurm
  SlurmSettings:
    ScaledownIdletime: 60
    QueueUpdateStrategy: DRAIN
    CustomSlurmSettings:
      # Simple accounting to text file /home/slurm/slurm-job-completions.txt.
      - JobCompType: jobcomp/filetxt
      - JobCompLoc: /home/slurm/slurm-job-completions.txt
      - JobAcctGatherType: jobacct_gather/linux
  SlurmQueues:
    - Name: compute-gpu
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
          - ${PRIVATE_SUBNET_ID}
        PlacementGroup:
          Enabled: true  # set this to false if using a targeted ODCR
        AdditionalSecurityGroups:
          - ${SECURITY_GROUP}
      ComputeSettings:
        LocalStorage:
          EphemeralVolume:
            MountDir: /scratch  # each instance has a local scratch on NVMe
          RootVolume:
            Size: 200
      ComputeResources:
        - Name: distributed-ml
          InstanceType: ${INSTANCE}
          # MinCount and MaxCount stay unquoted so they render as integers.
          # If min = max then capacity is maintained and will not scale down.
          MinCount: ${NUM_INSTANCES}
          MaxCount: ${NUM_INSTANCES}
          Efa:
            Enabled: true
          CapacityReservationTarget:
            CapacityReservationId: ${CAPACITY_RESERVATION_ID}
      CustomActions:
        # URLs must not carry leading whitespace inside the quotes, or the
        # download location is invalid.
        OnNodeConfigured:
          Sequence:
            - Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/docker/postinstall.sh'
            - Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/nccl/postinstall.sh'
              Args:
                - v2.23.4-1  # NCCL version
                - v1.11.0-aws  # AWS OFI NCCL version
            - Script: 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_dcgm_exporter.sh'
            - Script: 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_efa_node_exporter.sh'
        OnNodeStart:
          Sequence:
            - Script: 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/geniac/1.architectures/2.aws-parallelcluster/deployment-guides/post-install-scripts/configure_pyxis_enroot_computenode.sh'
# Existing file systems attached to the cluster; both are referenced by ID
# rather than created by this configuration.
SharedStorage:
  # Home directories on an FSx for OpenZFS volume.
  - Name: HomeDirs
    MountDir: /home
    StorageType: FsxOpenZfs
    FsxOpenZfsSettings:
      VolumeId: ${FSXO_ID}
  # FSx for Lustre file system mounted at /fsx.
  - Name: fsx
    MountDir: /fsx
    StorageType: FsxLustre
    FsxLustreSettings:
      FileSystemId: ${FSX_ID}
# CloudWatch-based monitoring: detailed metrics, logs and dashboards.
Monitoring:
  DetailedMonitoring: true
  Logs:
    CloudWatch:
      Enabled: true  # good for debug
  Dashboards:
    CloudWatch:
      Enabled: true  # provide basic dashboards
# Tags applied to cluster resources. The value stays quoted so it is the
# string 'true' rather than a YAML boolean; neither key nor value may carry
# leading whitespace, or a lookup on Grafana=true would not match.
Tags:
  - Key: 'Grafana'
    Value: 'true'