一、CloudWatch服务安装
Amazon Linux 2系统安装Agent。
Bash
#!/bin/bash
# Install the Amazon CloudWatch agent on Amazon Linux 2, write its
# configuration file, load that configuration, and enable the service.

# Install the agent RPM. Run with sudo like every other privileged step below.
sudo rpm -ivh https://s3.amazonaws.com/amazoncloudwatch-agent/amazon_linux/amd64/latest/amazon-cloudwatch-agent.rpm

# Write the agent configuration.
#  - Plain `tee` (not `tee -a`): appending to an already existing file would
#    leave two JSON documents in it and make the configuration unparsable
#    when this script is run a second time.
#  - The quoted 'EOF' delimiter stops the shell from expanding the
#    ${aws:...} placeholders; they are written to the file literally.
# NOTE(review): "diskio" lists "/" under "resources"; the agent documentation
# describes that field as device names, so "/" may match nothing - confirm.
sudo tee /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json <<'EOF'
{
  "logs": {
    "logs_collected": {
      "files": {
        "collect_list": [
          {
            "file_path": "/logArchive/hcaextension/info*.log",
            "log_group_name": "RGC-Prod-3in1oven",
            "log_stream_name": "info.logs"
          },
          {
            "file_path": "/logArchive/hcaextension/http*.log",
            "log_group_name": "RGC-Prod-3in1oven",
            "log_stream_name": "http.logs"
          }
        ]
      }
    }
  },
  "metrics": {
    "aggregation_dimensions": [
      [
        "InstanceId"
      ]
    ],
    "append_dimensions": {
      "AutoScalingGroupName": "${aws:AutoScalingGroupName}",
      "ImageId": "${aws:ImageId}",
      "InstanceId": "${aws:InstanceId}",
      "InstanceType": "${aws:InstanceType}"
    },
    "metrics_collected": {
      "cpu": {
        "measurement": [
          "cpu_usage_idle",
          "cpu_usage_iowait",
          "cpu_usage_user",
          "cpu_usage_system"
        ],
        "metrics_collection_interval": 180,
        "resources": [
          "*"
        ],
        "totalcpu": false
      },
      "disk": {
        "measurement": [
          "used_percent"
        ],
        "metrics_collection_interval": 180,
        "resources": [
          "/"
        ]
      },
      "diskio": {
        "measurement": [
          "io_time",
          "write_bytes",
          "read_bytes",
          "writes",
          "reads"
        ],
        "metrics_collection_interval": 180,
        "resources": [
          "/"
        ]
      },
      "mem": {
        "measurement": [
          "mem_used_percent"
        ],
        "metrics_collection_interval": 180
      },
      "netstat": {
        "measurement": [
          "tcp_established",
          "tcp_time_wait"
        ],
        "metrics_collection_interval": 180
      },
      "statsd": {
        "metrics_aggregation_interval": 60,
        "metrics_collection_interval": 180,
        "service_address": ":8125"
      },
      "swap": {
        "measurement": [
          "swap_used_percent"
        ],
        "metrics_collection_interval": 180
      }
    }
  }
}
EOF

# Load the configuration file into the agent in ec2 mode.
sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json

# Restart the service so it runs with the new configuration, and enable it
# so it is started again on boot.
sudo systemctl restart amazon-cloudwatch-agent.service
sudo systemctl enable amazon-cloudwatch-agent.service
二、AWS-CLI批量下发监控
前提条件:本机安装awscli工具。
使用前需要修改脚本中的区域信息(AWSREGION)、ip_list(要监控的实例ID列表)以及sns_arn。
脚本会调用AWS CLI,自动在CloudWatch上为这些EC2实例批量创建监控告警。
Python
#!/usr/bin/python
# -*- coding: utf-8-*-
import os
import json
import subprocess
# 1. Configure the AWS CLI invocation and the regions to process.
#    AWSCLI    - full command prefix; the executable path is quoted because it
#                contains a space, and JSON output is forced for json.loads().
#    AWSREGION - list of region codes; eu-central-1 is Europe (Frankfurt).
Contants = dict(
    AWSCLI='"C:\\Program Files\\Amazon\\AWSCLI\\bin\\aws.exe" --output json',
    AWSREGION=['eu-central-1'],
)
# Auto-vivifying dictionary.
class CreateDict(dict):
    """Dictionary that creates nested ``CreateDict`` values on first access.

    Looking up a missing key with ``d[key]`` stores a new, empty instance of
    the same class under that key and returns it, so nested assignments such
    as ``d['a']['b'] = 1`` work without creating the intermediate levels by
    hand.  ``in`` and ``get()`` do not create keys.
    """

    def __missing__(self, item):
        # dict.__getitem__ calls __missing__ only when the key is absent,
        # which is exactly the KeyError branch of the original override.
        value = self[item] = type(self)()
        return value
#########################################################################################################
# Alarm definitions
# CPUUtilization: alarm when the average is >= 80 in 3 out of 3 one-minute periods.
def getCPUUtilizationComm(name, action, instance_id):
    """Build the ``aws cloudwatch put-metric-alarm`` command for CPUUtilization.

    Args:
        name: Instance display name; becomes part of the alarm name.
        action: ARN used for both ``--alarm-actions`` and ``--ok-actions``.
        instance_id: Value of the ``InstanceId`` dimension.

    Returns:
        The complete command as a single-line string, prefixed with
        ``Contants['AWSCLI']``.
    """
    metric = 'CPUUtilization'
    print("#####开始配置 %s#####" % metric)
    # One option per entry; joined with single spaces so the result is one
    # shell line (the original used backslash-newline continuations).
    options = [
        '{cli} cloudwatch put-metric-alarm',
        '--alarm-name "AWS_EC2_{name}_{metric}"',
        '--alarm-description "aws ec2 {metric}"',
        '--metric-name {metric}',
        '--namespace AWS/EC2',
        '--statistic Average',
        '--period 60',
        '--threshold 80',
        '--evaluation-periods 3',
        '--datapoints-to-alarm 3',
        '--comparison-operator GreaterThanOrEqualToThreshold',
        '--treat-missing-data notBreaching',
        '--alarm-actions "{action}"',
        '--ok-actions "{action}"',
        '--unit Percent',
        '--dimensions "Name=InstanceId,Value={id}"',
    ]
    # The keyword must be ``action``: the template uses {action}, and the
    # previously garbled keyword name made format() raise KeyError.
    return ' '.join(options).format(
        cli=Contants['AWSCLI'],
        name=name,
        action=action,
        id=instance_id,
        metric=metric,
    )
# mem_used_percent: alarm when the average is >= 80 in 3 out of 3 one-minute periods.
def getmem_used_percentComm(name, action, instance_id, instancetype, imageid):
    """Build the ``put-metric-alarm`` command for the CWAgent memory metric.

    Args:
        name: Instance display name; becomes part of the alarm name.
        action: ARN used for both ``--alarm-actions`` and ``--ok-actions``.
        instance_id: Value of the ``InstanceId`` dimension.
        instancetype: Value of the ``InstanceType`` dimension.
        imageid: Value of the ``ImageId`` dimension.

    Returns:
        The complete command as a single-line string, prefixed with
        ``Contants['AWSCLI']``.
    """
    metric = 'mem_used_percent'
    print("#####开始配置 %s#####" % metric)
    # NOTE(review): the alarm dimensions presumably have to match the set the
    # agent publishes for this metric - confirm against the metric in CloudWatch.
    options = [
        '{cli} cloudwatch put-metric-alarm',
        '--alarm-name "AWS_EC2_{name}_{metric}"',
        '--alarm-description "aws ec2 {metric}"',
        '--metric-name {metric}',
        '--namespace CWAgent',
        '--statistic Average',
        '--period 60',
        '--threshold 80',
        '--evaluation-periods 3',
        '--datapoints-to-alarm 3',
        '--comparison-operator GreaterThanOrEqualToThreshold',
        '--treat-missing-data missing',
        '--alarm-actions "{action}"',
        '--ok-actions "{action}"',
        '--dimensions Name=InstanceId,Value={id} Name=ImageId,Value={imageid}'
        ' Name=InstanceType,Value={instancetype}',
    ]
    # The keyword must be ``action``: the template uses {action}, and the
    # previously garbled keyword name made format() raise KeyError.
    return ' '.join(options).format(
        cli=Contants['AWSCLI'],
        name=name,
        action=action,
        id=instance_id,
        metric=metric,
        instancetype=instancetype,
        imageid=imageid,
    )
# disk_used_percent: alarm when the average is >= 80 in 3 out of 3 one-minute periods.
def getdisk_used_percentComm(name, action, instance_id, instancetype, imageid,
                             device='nvme0n1p1', fstype='ext4'):
    """Build the ``put-metric-alarm`` command for the CWAgent disk metric on ``/``.

    The device and filesystem type cannot be discovered by this script; look
    them up on the metric in the CloudWatch console and pass them in when
    they differ from the defaults.

    Args:
        name: Instance display name; becomes part of the alarm name.
        action: ARN used for both ``--alarm-actions`` and ``--ok-actions``.
        instance_id: Value of the ``InstanceId`` dimension.
        instancetype: Value of the ``InstanceType`` dimension.
        imageid: Value of the ``ImageId`` dimension.
        device: Value of the ``device`` dimension (previously hard-coded).
        fstype: Value of the ``fstype`` dimension (previously hard-coded).

    Returns:
        The complete command as a single-line string, prefixed with
        ``Contants['AWSCLI']``.
    """
    metric = 'disk_used_percent'
    print("#####开始配置 %s#####" % metric)
    # --dimensions is given once, with the full set. The original passed the
    # option twice, the first time with only the path dimension.
    options = [
        '{cli} cloudwatch put-metric-alarm',
        '--alarm-name "AWS_EC2_{name}_{metric}"',
        '--alarm-description "aws ec2 {metric}"',
        '--metric-name {metric}',
        '--namespace CWAgent',
        '--statistic Average',
        '--period 60',
        '--threshold 80',
        '--evaluation-periods 3',
        '--datapoints-to-alarm 3',
        '--comparison-operator GreaterThanOrEqualToThreshold',
        '--treat-missing-data missing',
        '--alarm-actions "{action}"',
        '--ok-actions "{action}"',
        '--dimensions Name=InstanceId,Value={id} Name=ImageId,Value={imageid}'
        ' Name=InstanceType,Value={instancetype} Name=device,Value={device}'
        ' Name=fstype,Value={fstype} "Name=path,Value=/"',
    ]
    # The keyword must be ``action``: the template uses {action}, and the
    # previously garbled keyword name made format() raise KeyError.
    return ' '.join(options).format(
        cli=Contants['AWSCLI'],
        name=name,
        action=action,
        id=instance_id,
        metric=metric,
        instancetype=instancetype,
        imageid=imageid,
        device=device,
        fstype=fstype,
    )
# NetworkIn: alarm when the average is >= 5000000 in 3 out of 3 one-minute periods.
def getNetworkInComm(name, action, instance_id):
    """Build the ``aws cloudwatch put-metric-alarm`` command for NetworkIn.

    Args:
        name: Instance display name; becomes part of the alarm name.
        action: ARN used for both ``--alarm-actions`` and ``--ok-actions``.
        instance_id: Value of the ``InstanceId`` dimension.

    Returns:
        The complete command as a single-line string, prefixed with
        ``Contants['AWSCLI']``.
    """
    metric = 'NetworkIn'
    print("#####开始配置 %s#####" % metric)
    options = [
        '{cli} cloudwatch put-metric-alarm',
        '--alarm-name "AWS_EC2_{name}_{metric}"',
        '--alarm-description "aws ec2 {metric}"',
        '--metric-name {metric}',
        '--namespace AWS/EC2',
        '--statistic Average',
        '--period 60',
        '--threshold 5000000',
        '--evaluation-periods 3',
        '--datapoints-to-alarm 3',
        '--comparison-operator GreaterThanOrEqualToThreshold',
        '--treat-missing-data notBreaching',
        '--alarm-actions "{action}"',
        '--ok-actions "{action}"',
        # {id}, not a literal "%s": str.format() never fills %-placeholders,
        # so the original sent "Value=%s" as the instance id.
        '--dimensions "Name=InstanceId,Value={id}"',
    ]
    # The keyword must be ``action``: the template uses {action}, and the
    # previously garbled keyword name made format() raise KeyError.
    return ' '.join(options).format(
        cli=Contants['AWSCLI'],
        name=name,
        action=action,
        id=instance_id,
        metric=metric,
    )
# NetworkOut: alarm when the average is >= 5000000 in 3 out of 3 one-minute periods.
def getNetworkOutComm(name, action, instance_id):
    """Build the ``aws cloudwatch put-metric-alarm`` command for NetworkOut.

    Args:
        name: Instance display name; becomes part of the alarm name.
        action: ARN used for both ``--alarm-actions`` and ``--ok-actions``.
        instance_id: Value of the ``InstanceId`` dimension.

    Returns:
        The complete command as a single-line string, prefixed with
        ``Contants['AWSCLI']``.
    """
    metric = 'NetworkOut'
    print("#####开始配置 %s#####" % metric)
    options = [
        '{cli} cloudwatch put-metric-alarm',
        '--alarm-name "AWS_EC2_{name}_{metric}"',
        '--alarm-description "aws ec2 {metric}"',
        '--metric-name {metric}',
        '--namespace AWS/EC2',
        '--statistic Average',
        '--period 60',
        '--threshold 5000000',
        '--evaluation-periods 3',
        '--datapoints-to-alarm 3',
        '--comparison-operator GreaterThanOrEqualToThreshold',
        '--treat-missing-data notBreaching',
        '--alarm-actions "{action}"',
        '--ok-actions "{action}"',
        '--dimensions "Name=InstanceId,Value={id}"',
    ]
    # The keyword must be ``action``: the template uses {action}, and the
    # previously garbled keyword name made format() raise KeyError.
    return ' '.join(options).format(
        cli=Contants['AWSCLI'],
        name=name,
        action=action,
        id=instance_id,
        metric=metric,
    )
# Command runner
def execCommand(comm):
    """Run *comm* through the shell and return its output.

    The command and its exit status are printed.  Uses
    ``subprocess.getstatusoutput``, so the returned text is whatever that
    call captures, with the trailing newline removed.

    Args:
        comm: Complete command line as one string.

    Returns:
        The captured output, or ``None`` if starting the command raised an
        exception (the exception is printed, not re-raised).
    """
    # NOTE(review): the command string is handed to a shell; values placed in
    # it (for example instance names) are not escaped by the callers.
    try:
        print(comm)
        status, stdout = subprocess.getstatusoutput(comm)
    except Exception as e:
        # Best-effort, as in the original: report and signal failure with None.
        print(e)
        return None
    print(status)
    return stdout
# Collect basic information about the EC2 instances returned by describe-instances.
def getAll(get_server_id_list):
    """Describe all instances and return the ones listed in *get_server_id_list*.

    Runs ``ec2 describe-instances`` through ``execCommand`` using the CLI
    prefix in ``Contants['AWSCLI']`` and parses the JSON reply.

    Args:
        get_server_id_list: Collection of instance ids to keep.

    Returns:
        A list of dicts with the keys ``id``, ``imageid``, ``instancetype``
        and ``name``.  ``name`` is the value of the ``Name`` tag, or the
        instance id when that tag is missing or empty.
    """
    comm1 = "%s ec2 describe-instances" % Contants['AWSCLI']
    all_data = json.loads(execCommand(comm1))
    instance_list_modify = []
    for reservation in all_data['Reservations']:
        for instance in reservation['Instances']:
            # A fresh dict per instance: the original created one dict per
            # reservation, so instances sharing a reservation overwrote each
            # other's entry.
            data = {
                'id': instance['InstanceId'],
                'imageid': instance['ImageId'],
                'instancetype': instance['InstanceType'],
            }
            # Tags may be absent and a Name tag may not exist; the original
            # raised KeyError in both cases.
            name = ''
            for tag in instance.get('Tags', []):
                if tag['Key'] == 'Name':
                    name = tag['Value']
            data['name'] = name or instance['InstanceId']
            print(data)
            if data['id'] in get_server_id_list:
                instance_list_modify.append(data)
    print(instance_list_modify)
    return instance_list_modify
# Create the alarms
def add_alert(data, action):
    """Create the CloudWatch alarms for every server described in *data*.

    Args:
        data: Iterable of dicts with the keys ``id``, ``imageid`` and
            ``instancetype`` and, optionally, ``name``.
        action: ARN passed to the alarm command builders as the alarm/OK action.

    Returns:
        None.  Each alarm command is executed through ``execCommand``.
    """
    for server in data:
        instance_id = server['id']
        # Fall back to the instance id so a server without a name still gets
        # a usable alarm name instead of raising KeyError.
        name = server.get('name') or instance_id
        imageid = server['imageid']
        instancetype = server['instancetype']
        print(instance_id, name, imageid, instancetype)
        execCommand(getCPUUtilizationComm(name, action, instance_id))
        # The remaining alarm builders are left disabled, as in the original
        # script; uncomment the ones that should be created.
        #execCommand(getNetworkInComm(name, action, instance_id))
        #execCommand(getNetworkOutComm(name, action, instance_id))
        #execCommand(getmem_used_percentComm(name, action, instance_id, instancetype, imageid))
        #execCommand(getdisk_used_percentComm(name, action, instance_id, instancetype, imageid))
def get_server_info(instance_list):
    """Look up each instance id and return the data needed to create alarms.

    Each id is queried on its own with ``ec2 describe-instances
    --instance-ids``.  The command uses the prefix in ``Contants['AWSCLI']``,
    so it runs the same executable, output format and ``--region`` as the
    alarm commands (the original called a bare ``aws`` and ignored them).

    Args:
        instance_list: Iterable of EC2 instance ids.

    Returns:
        A list of dicts with the keys ``id``, ``imageid``, ``instancetype``
        and ``name``.  ``name`` is the value of the ``Name`` tag, or the
        instance id when the instance has no such tag.

    Raises:
        subprocess.CalledProcessError: if the CLI exits with a non-zero status.
        ValueError: if the CLI output is not valid JSON.
    """
    server_info = []
    for instance_id in instance_list:
        comm = "{0} ec2 describe-instances --instance-ids {1}".format(
            Contants['AWSCLI'], instance_id)
        # Show the command being executed.
        print(comm)
        # Capture stdout only, so that anything the CLI writes to stderr
        # cannot corrupt the JSON document.
        data = subprocess.check_output(comm, shell=True, universal_newlines=True)
        reply = json.loads(data)
        for reservation in reply["Reservations"]:
            for instance in reservation["Instances"]:
                if instance["InstanceId"] != instance_id:
                    continue
                server_dict = {
                    'id': instance["InstanceId"],
                    'imageid': instance["ImageId"],
                    'instancetype': instance["InstanceType"],
                    # Default used when there is no Name tag; without it the
                    # caller's lookup of 'name' raised KeyError.
                    'name': instance["InstanceId"],
                }
                for tag_item in instance.get("Tags", []):
                    if tag_item["Key"] == "Name":
                        server_dict['name'] = tag_item["Value"] or instance["InstanceId"]
                        break
                print(server_dict)
                server_info.append(server_dict)
    return server_info
if __name__ == '__main__':
    # 2. Configure the SNS topic ARN that receives the alarm and OK notifications.
    sns_arn = "arn:aws:sns:eu-central-1:643xxxxx:xxxx-CloudWatch-Lambda-DingTalk"
    # Despite its name, this list holds EC2 instance ids, not IP addresses.
    ip_list = ["i-010bxxxx", "i-00xxxxx"]
    # Keep the unmodified prefix so that each pass appends exactly one
    # --region option instead of accumulating them.
    cli = Contants['AWSCLI']
    for region in Contants['AWSREGION']:
        print('[Region] ', region)
        Contants['AWSCLI'] = cli + ' --region ' + region
        add_alert(get_server_info(ip_list), sns_arn)
三、Amazon SNS创建主题
创建SNS主题,并将其关联(订阅)到Lambda钉钉通知函数。
四、Lambda钉钉函数通知脚本
上传如下脚本,然后在CloudWatch中调试为EC2设定的告警规则,触发告警以进行测试。
Python
# _*_coding:utf-8_*_
# python 3.8
# Creation time:2021/11/18
import time
import hmac
import hashlib
import base64
import urllib.parse
import json
import os
import requests
import datetime
def lambda_handler(event, context):
    """Forward a CloudWatch alarm delivered through SNS to a DingTalk robot.

    Args:
        event: Lambda event; the alarm is read from
            ``event['Records'][0]['Sns']``, whose ``Message`` field is a JSON
            string describing the alarm state change.
        context: Lambda context object (unused).

    Returns:
        The body of the DingTalk HTTP response, decoded as UTF-8.
    """
    headers = {'Content-Type': 'application/json;charset=utf-8'}
    # SECURITY: credentials were hard-coded in the source. They can now be
    # supplied through environment variables; the literals remain only as
    # backward-compatible defaults and should be removed.
    token = os.environ.get('DINGTALK_TOKEN', 'ca5533c8cb976c21')
    secret = os.environ.get('DINGTALK_SECRET', 'SEC8d1a31ec5e8e91')
    timestamp = str(round(time.time() * 1000))
    api_url = _dingtalk_signed_url(token, secret, timestamp)

    # msg setting
    message = event['Records'][0]['Sns']
    # Subject can be absent or null; treat that as "no state keyword".
    subject = message.get('Subject') or ''
    sns_message = json.loads(message['Message'])
    trigger = sns_message['Trigger']

    # Explicit UTC+8 clock; the original added 8 hours to the local time,
    # which is only right when the runtime clock is UTC.
    utc8 = datetime.timezone(datetime.timedelta(hours=8))
    current_time = datetime.datetime.now(utc8).strftime('%Y-%m-%d %H:%M:%S')

    # NOTE(review): all three title literals were empty in the source
    # (probably lost in extraction); the wording below is a replacement -
    # confirm it.
    if "ALARM" in subject:
        title = 'AWS告警触发'
    elif "OK" in subject:
        title = 'AWS告警恢复'
    else:
        title = 'AWS告警状态变更'

    instance_id = _alarm_instance_id(trigger.get('Dimensions') or [])

    content = "### {title}".format(title=title) + \
        "\n> #### **时间**: " + current_time + \
        "\n> #### **状态**: " + sns_message['OldStateValue'] + " => " + sns_message['NewStateValue'] + \
        "\n> #### **告警名称**: " + sns_message['AlarmName'] + \
        "\n> #### **账户ID**: " + sns_message['AWSAccountId'] + \
        "\n> #### **AWS区域**: " + sns_message['Region'] + \
        "\n> #### **描述**: " + (sns_message.get('AlarmDescription') or '') + \
        "\n> #### **产品资源**: " + trigger['Namespace'] + \
        "\n> #### **实例ID**: " + instance_id + \
        "\n> #### **指标名称**: " + trigger['MetricName'] + \
        "\n> #### **报警详情**: " + sns_message['NewStateReason']
    msg = {
        "msgtype": "markdown",
        "markdown": {
            "title": title,
            "text": content
        },
        "at": {
            # JSON boolean rather than the string "true".
            "isAtAll": True
        }
    }
    # request; the timeout keeps a stalled connection from consuming the
    # whole Lambda execution time.
    response = requests.post(url=api_url, data=json.dumps(msg), headers=headers, timeout=10)
    return response.content.decode("utf8")


def _dingtalk_signed_url(token, secret, timestamp):
    """Return the robot webhook URL carrying the HMAC-SHA256 signature."""
    string_to_sign = '{}\n{}'.format(timestamp, secret)
    hmac_code = hmac.new(secret.encode('utf-8'), string_to_sign.encode('utf-8'),
                         digestmod=hashlib.sha256).digest()
    sign = urllib.parse.quote_plus(base64.b64encode(hmac_code))
    # "&timestamp=" had been garbled into "×tamp=" in the source, which
    # dropped the timestamp parameter from the URL.
    return "https://oapi.dingtalk.com/robot/send?access_token={}&timestamp={}&sign={}".format(
        token, timestamp, sign)


def _alarm_instance_id(dimensions):
    """Return the instance id found in the alarm's dimension list, or ''."""
    for dimension in dimensions:
        if dimension.get('name') == 'InstanceId':
            return dimension.get('value') or ''
    # No dimension is named InstanceId: keep the original heuristic of using
    # the first value unless it looks like a path, then the second one.
    if not dimensions:
        return ''
    value = dimensions[0]['value']
    if value.startswith('/') and len(dimensions) > 1:
        value = dimensions[1]['value']
    return value
AWS子账户权限调试工具(IAM Policy Simulator):
https://policysim.aws.amazon.com/
文章来源网络,作者:运维,如若转载,请注明出处:https://shuyeidc.com/wp/151578.html