|
# Observability stack for SageMaker Hyperpod.
#
# Creates:
#   * an Amazon Managed Service for Prometheus (AMP) workspace,
#   * an EC2 instance (Amazon Linux 2) that runs open-source Grafana in Docker,
#   * a security group exposing Grafana on TCP 3000,
#   * an IAM role / instance profile granting read-only AMP query access and
#     SSM Session Manager access to the instance.
AWSTemplateFormatVersion: "2010-09-09"
Description: CloudFormation template to monitor SageMaker Hyperpod - launches a t2.medium instance with 30GB of storage, security group, IAM role for Prometheus access, Grafana setup, and a Prometheus workspace.

Parameters:
  # Resolved at deploy time from the public SSM parameter, so the stack always
  # launches from the current Amazon Linux 2 image.
  LatestAmiId:
    Type: 'AWS::SSM::Parameter::Value<AWS::EC2::Image::Id>'
    Default: '/aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-x86_64-gp2'
    Description: "The latest Amazon Linux 2 AMI ID."

  # Source range allowed to reach Grafana. The default keeps the template's
  # original behaviour (open to everyone); because Grafana starts with the
  # default admin/admin credentials, restricting this is strongly recommended.
  GrafanaAccessCidr:
    Type: String
    Default: "0.0.0.0/0"
    AllowedPattern: '^(\d{1,3}\.){3}\d{1,3}/(\d|[12]\d|3[0-2])$'
    ConstraintDescription: "Must be a valid IPv4 CIDR range, e.g. 203.0.113.0/24."
    Description: "IPv4 CIDR range allowed to reach Grafana on port 3000. Restrict this to your own address range where possible."

Resources:
  # Ingress rule for the Grafana UI only; no SSH port is opened (shell access
  # is available through SSM, see the managed policy on GrafanaEC2Role).
  MySecurityGroup:
    Type: "AWS::EC2::SecurityGroup"
    Properties:
      GroupDescription: "Allow ingress on port 3000 for Grafana access"
      SecurityGroupIngress:
        - IpProtocol: "tcp"
          FromPort: 3000
          ToPort: 3000
          CidrIp: !Ref GrafanaAccessCidr

  # Role assumed by the EC2 instance. The inline policy is read-only against
  # AMP (list/describe workspaces and query metrics, labels, series, metadata).
  GrafanaEC2Role:
    Type: "AWS::IAM::Role"
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: ec2.amazonaws.com
            Action: "sts:AssumeRole"
      Policies:
        - PolicyName: "PrometheusAccessPolicy"
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Effect: Allow
                Action:
                  - aps:ListWorkspaces
                  - aps:DescribeWorkspace
                  - aps:QueryMetrics
                  - aps:GetLabels
                  - aps:GetSeries
                  - aps:GetMetricMetadata
                Resource: "*"
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore

  # Wraps GrafanaEC2Role so it can be attached to the EC2 instance.
  MyInstanceProfile:
    Type: "AWS::IAM::InstanceProfile"
    Properties:
      Roles:
        - !Ref GrafanaEC2Role

  # AMP workspace that receives the cluster metrics (see AMPRemoteWriteURL).
  APSWorkspace:
    Type: "AWS::APS::Workspace"
    Properties:
      Alias: !Sub "${AWS::StackName}-Hyperpod-WorkSpace"
      Tags:
        - Key: "Name"
          Value: "SageMaker Hyperpod PrometheusMetrics"

  # Grafana host. The user-data script installs Docker and starts the Grafana
  # container with a restart policy so it survives reboots.
  MyInstance:
    Type: "AWS::EC2::Instance"
    Properties:
      InstanceType: "t2.medium"
      ImageId: !Ref LatestAmiId
      IamInstanceProfile: !Ref MyInstanceProfile
      SecurityGroupIds:
        - !Ref MySecurityGroup
      BlockDeviceMappings:
        - DeviceName: "/dev/xvda"
          Ebs:
            VolumeSize: 30
      UserData:
        Fn::Base64: !Sub |
          #!/bin/bash

          # Update system packages
          sudo yum update -y

          # Install Docker
          echo "Installing Docker..."
          sudo amazon-linux-extras install docker -y

          # Start Docker service
          echo "Starting Docker service..."
          sudo systemctl start docker

          # Enable Docker to start on boot
          sudo systemctl enable docker

          # Add the current user (ec2-user) to the Docker group to run Docker commands without sudo
          echo "Adding ec2-user to Docker group..."
          sudo usermod -aG docker ec2-user

          # Pull the latest Grafana image
          echo "Pulling the latest Grafana Docker image..."
          docker pull grafana/grafana:latest

          # Run Grafana container with automatic restart.
          # SigV4 auth is enabled so the Prometheus data source can sign
          # requests to Amazon Managed Service for Prometheus using the
          # credentials of the instance role.
          echo "Starting Grafana container with restart policy..."
          docker run -d -p 3000:3000 --name=grafana --restart always \
            -e "GF_AUTH_SIGV4_AUTH_ENABLED=true" \
            -e "AWS_SDK_LOAD_CONFIG=true" \
            grafana/grafana:latest

          # Print Grafana access info
          echo "Docker and Grafana setup complete."
          echo "Grafana is running at http://$(curl -s http://169.254.169.254/latest/meta-data/public-ipv4):3000"
          echo "Default Grafana login credentials are admin/admin. Please change the password after the first login."

          # Note: Log out and log back in for Docker permissions to take effect
          echo "Please log out and back in for Docker group permissions to apply."
      Tags:
        - Key: "Name"
          Value: "OS-Grafana"

Outputs:
  InstanceId:
    Description: "Instance ID of the EC2 instance"
    Value: !Ref MyInstance
  # Ref on AWS::APS::Workspace yields the workspace ARN, so the ID is read
  # from the dedicated WorkspaceId attribute instead.
  PrometheusWorkspaceId:
    Description: "ID of the Amazon Managed Prometheus Workspace"
    Value: !GetAtt APSWorkspace.WorkspaceId
  # PrometheusEndpoint already ends with a slash, so the path is appended
  # without a leading one.
  AMPRemoteWriteURL:
    Description: "Remote-write URL of the Amazon Managed Prometheus Workspace"
    Value: !Join ["", [!GetAtt APSWorkspace.PrometheusEndpoint, "api/v1/remote_write"]]
  GrafanaInstanceAddress:
    Description: "Grafana address with port 3000 for the EC2 instance"
    Value: !Sub "http://${MyInstance.PublicIp}:3000"
0 commit comments