Skip to content

Commit 8b0e3da

Browse files
KeitaW and nghtm committed
add self-managed grafana option
Co-authored-by: Matthew Nightingale <[email protected]>
1 parent 0e68c54 commit 8b0e3da

File tree

1 file changed

+127
-0
lines changed

1 file changed

+127
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
# Template format version understood by CloudFormation.
AWSTemplateFormatVersion: '2010-09-09'

# Human-readable summary of the stack. The folded scalar below is joined
# back into one single-line string when the template is parsed.
Description: >-
  CloudFormation template to monitor SageMaker Hyperpod - launches a t2.medium
  instance with 30GB of storage, security group, IAM role for Prometheus
  access, Grafana setup, and a Prometheus workspace.
Parameters:
  # AMI for the Grafana host. The parameter type makes CloudFormation look up
  # the value stored under the given SSM parameter path at deploy time, so the
  # default always resolves to whatever image that path currently points to.
  LatestAmiId:
    Description: 'The latest Amazon Linux 2 AMI ID.'
    Type: "AWS::SSM::Parameter::Value<AWS::EC2::Image::Id>"
    Default: "/aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-x86_64-gp2"
Resources:
  # Security group that exposes the Grafana web UI (TCP 3000).
  # NOTE(review): the rule below is open to the whole internet while Grafana
  # boots with its default admin/admin login (see the UserData echo further
  # down). Consider narrowing CidrIp to a trusted range before deploying.
  MySecurityGroup:
    Type: "AWS::EC2::SecurityGroup"
    Properties:
      GroupDescription: "Allow ingress on port 3000 for Grafana access"
      SecurityGroupIngress:
        - IpProtocol: "tcp"
          FromPort: 3000
          ToPort: 3000
          CidrIp: "0.0.0.0/0"

  # Role assumed by the Grafana EC2 instance. Grants read-only query access to
  # Amazon Managed Service for Prometheus plus the SSM managed-instance policy.
  GrafanaEC2Role:
    Type: "AWS::IAM::Role"
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: ec2.amazonaws.com
            Action: "sts:AssumeRole"
      Policies:
        - PolicyName: "PrometheusAccessPolicy"
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Effect: Allow
                Action:
                  - aps:ListWorkspaces
                  - aps:DescribeWorkspace
                  - aps:QueryMetrics
                  - aps:GetLabels
                  - aps:GetSeries
                  - aps:GetMetricMetadata
                Resource: "*"
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore

  # Instance profile that attaches GrafanaEC2Role to the EC2 instance.
  MyInstanceProfile:
    Type: "AWS::IAM::InstanceProfile"
    Properties:
      Roles:
        - !Ref GrafanaEC2Role

  # Amazon Managed Service for Prometheus workspace that stores the metrics.
  APSWorkspace:
    Type: "AWS::APS::Workspace"
    Properties:
      Alias: !Sub "${AWS::StackName}-Hyperpod-WorkSpace"
      Tags:
        - Key: "Name"
          Value: "SageMaker Hyperpod PrometheusMetrics"

  # EC2 host that runs self-managed Grafana in Docker.
  MyInstance:
    Type: "AWS::EC2::Instance"
    Properties:
      InstanceType: "t2.medium"
      ImageId: !Ref LatestAmiId
      IamInstanceProfile: !Ref MyInstanceProfile
      SecurityGroupIds:
        - !Ref MySecurityGroup
      BlockDeviceMappings:
        - DeviceName: "/dev/xvda"
          Ebs:
            VolumeSize: 30
      # The script goes through !Sub, so any "${...}" added to it later will be
      # treated as a CloudFormation substitution; plain "$(...)" is untouched.
      UserData:
        Fn::Base64: !Sub |
          #!/bin/bash

          # Update system packages
          sudo yum update -y

          # Install Docker
          echo "Installing Docker..."
          sudo amazon-linux-extras install docker -y

          # Start Docker service
          echo "Starting Docker service..."
          sudo systemctl start docker

          # Enable Docker to start on boot
          sudo systemctl enable docker

          # Add the current user (ec2-user) to the Docker group to run Docker commands without sudo
          echo "Adding ec2-user to Docker group..."
          sudo usermod -aG docker ec2-user

          # Pull the latest Grafana image
          echo "Pulling the latest Grafana Docker image..."
          docker pull grafana/grafana:latest

          # Run Grafana container with automatic restart.
          # SigV4 auth must be enabled so the Prometheus data source can sign
          # requests to Amazon Managed Service for Prometheus with the
          # instance role credentials; without it queries are rejected.
          echo "Starting Grafana container with restart policy..."
          docker run -d -p 3000:3000 --name=grafana --restart always \
            -e "GF_AUTH_SIGV4_AUTH_ENABLED=true" \
            -e "AWS_SDK_LOAD_CONFIG=true" \
            grafana/grafana:latest

          # Print Grafana access info
          echo "Docker and Grafana setup complete."
          echo "Grafana is running at http://$(curl -s http://169.254.169.254/latest/meta-data/public-ipv4):3000"
          echo "Default Grafana login credentials are admin/admin. Please change the password after the first login."

          # Note: Log out and log back in for Docker permissions to take effect
          echo "Please log out and back in for Docker group permissions to apply."
      Tags:
        - Key: "Name"
          Value: "OS-Grafana"
Outputs:
  InstanceId:
    Description: "Instance ID of the EC2 instance"
    Value: !Ref MyInstance
  PrometheusWorkspaceId:
    Description: "ID of the Amazon Managed Prometheus Workspace"
    # Ref on an AWS::APS::Workspace returns the workspace ARN, not its ID;
    # the WorkspaceId attribute is what this output's name promises.
    Value: !GetAtt APSWorkspace.WorkspaceId
  AMPRemoteWriteURL:
    Description: "Remote write URL of the Amazon Managed Prometheus Workspace"
    # The path is appended with no separator, so this relies on
    # PrometheusEndpoint already ending in "/" (TODO confirm against the
    # resource reference).
    Value: !Join ["", [!GetAtt APSWorkspace.PrometheusEndpoint, "api/v1/remote_write"]]
  GrafanaInstanceAddress:
    Description: "Grafana address with port 3000 for the EC2 instance"
    Value: !Sub "http://${MyInstance.PublicIp}:3000"

0 commit comments

Comments
 (0)