Skip to content

Commit 0e68c54

Browse files
committed
add MetadataConfiguration for FSxL filesystem
remove metadata config
1 parent 2a20863 commit 0e68c54

File tree

3 files changed

+119
-0
lines changed

3 files changed

+119
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
# SPDX-License-Identifier: MIT-0
3+
4+
Imds:
5+
ImdsSupport: v1.0
6+
Image:
7+
Os: ubuntu2204
8+
HeadNode:
9+
InstanceType: m5.8xlarge
10+
Networking:
11+
SubnetId: ${PUBLIC_SUBNET_ID}
12+
AdditionalSecurityGroups:
13+
- ${SECURITY_GROUP}
14+
Ssh:
15+
KeyName: ${KEY_PAIR_NAME}
16+
LocalStorage:
17+
RootVolume:
18+
Size: 500
19+
DeleteOnTermination: true # head node root volume; /home is mounted from FSx for OpenZFS (see SharedStorage)
20+
Iam:
21+
AdditionalIamPolicies: # grant SSM, S3 read-only, ECR read-only and Prometheus remote-write access
22+
- Policy: arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
23+
- Policy: arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess
24+
- Policy: arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
25+
- Policy: arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess
26+
CustomActions:
27+
OnNodeConfigured:
28+
Sequence:
29+
- Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/docker/postinstall.sh'
30+
- Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/nccl/postinstall.sh'
31+
Args:
32+
- v2.23.4-1 # NCCL version
33+
- v1.11.0-aws # AWS OFI NCCL version
34+
- Script: 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_slurm_exporter.sh'
35+
- Script: 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_head_node_exporter.sh'
36+
- Script: 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/geniac/1.architectures/2.aws-parallelcluster/deployment-guides/post-install-scripts/configure_pyxis_enroot_headnode.sh'
37+
OnNodeStart:
38+
Sequence:
39+
- Script: 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/geniac/1.architectures/2.aws-parallelcluster/deployment-guides/post-install-scripts/install_prometheus.sh'
40+
Args:
41+
- ${AMPREMOTEWRITEURL}
42+
- ${AWS_REGION}
43+
- Script: 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/geniac/1.architectures/2.aws-parallelcluster/deployment-guides/post-install-scripts/install_grafana.sh'
44+
Imds:
45+
Secured: false
46+
Scheduling:
47+
Scheduler: slurm
48+
SlurmSettings:
49+
ScaledownIdletime: 60
50+
QueueUpdateStrategy: DRAIN
51+
CustomSlurmSettings:
52+
# Simple accounting to text file /home/slurm/slurm-job-completions.txt.
53+
- JobCompType: jobcomp/filetxt
54+
- JobCompLoc: /home/slurm/slurm-job-completions.txt
55+
- JobAcctGatherType: jobacct_gather/linux
56+
SlurmQueues:
57+
- Name: compute-gpu
58+
CapacityType: ONDEMAND
59+
Networking:
60+
SubnetIds:
61+
- ${PRIVATE_SUBNET_ID}
62+
PlacementGroup:
63+
Enabled: true # set this to false if using a targeted ODCR (see CapacityReservationTarget under ComputeResources)
64+
AdditionalSecurityGroups:
65+
- ${SECURITY_GROUP}
66+
ComputeSettings:
67+
LocalStorage:
68+
EphemeralVolume:
69+
MountDir: /scratch # each instance has a local scratch on NVMe
70+
RootVolume:
71+
Size: 200
72+
ComputeResources:
73+
- Name: distributed-ml
74+
InstanceType: ${INSTANCE}
75+
MinCount: ${NUM_INSTANCES} # if min = max then capacity is maintained and will
76+
MaxCount: ${NUM_INSTANCES} # not scale down
77+
Efa:
78+
Enabled: true
79+
CapacityReservationTarget:
80+
CapacityReservationId: ${CAPACITY_RESERVATION_ID}
81+
CustomActions:
82+
OnNodeConfigured:
83+
Sequence:
84+
- Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/docker/postinstall.sh'
85+
- Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/nccl/postinstall.sh'
86+
Args:
87+
- v2.23.4-1 # NCCL version
88+
- v1.11.0-aws # AWS OFI NCCL version
89+
- Script: 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_dcgm_exporter.sh'
90+
- Script: 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_efa_node_exporter.sh'
91+
OnNodeStart:
92+
Sequence:
93+
- Script: 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/geniac/1.architectures/2.aws-parallelcluster/deployment-guides/post-install-scripts/configure_pyxis_enroot_computenode.sh'
94+
SharedStorage:
95+
- Name: HomeDirs
96+
MountDir: /home
97+
StorageType: FsxOpenZfs
98+
FsxOpenZfsSettings:
99+
VolumeId: ${FSXO_ID}
100+
- MountDir: /fsx
101+
Name: fsx
102+
StorageType: FsxLustre
103+
FsxLustreSettings:
104+
FileSystemId: ${FSX_ID}
105+
Monitoring:
106+
DetailedMonitoring: true
107+
Logs:
108+
CloudWatch:
109+
Enabled: true # good for debug
110+
Dashboards:
111+
CloudWatch:
112+
Enabled: true # provide basic dashboards
113+
Tags:
114+
- Key: 'Grafana'
115+
Value: 'true'

1.architectures/2.aws-parallelcluster/templates/parallelcluster-prerequisites-p1.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,7 @@ Resources:
307307
DeploymentType: PERSISTENT_1
308308
PerUnitStorageThroughput: !Ref PerUnitStorageThroughput
309309

310+
310311
OpenZFSFileSystem:
311312
Type: AWS::FSx::FileSystem
312313
Properties:

1.architectures/2.aws-parallelcluster/templates/parallelcluster-prerequisites.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,9 @@ Resources:
307307
DataCompressionType: !Ref Compression
308308
DeploymentType: PERSISTENT_2
309309
PerUnitStorageThroughput: !Ref PerUnitStorageThroughput
310+
MetadataConfiguration:
311+
Mode: AUTOMATIC
312+
310313

311314
OpenZFSFileSystem:
312315
Type: AWS::FSx::FileSystem

0 commit comments

Comments
 (0)