Forum Discussion
AKS Pod resource utilization (CPU/Memory) alert
Hi All,
I am trying to set up an alert on AKS pod CPU/memory utilization that fires when the maximum utilization exceeds a certain threshold (let's say >95%). Here is a sample query for CPU utilization:
// cpuusage: peak CPU usage per container per 15-minute bin, converted to
// millicores and enriched with pod metadata from KubePodInventory.
let cpuusage = materialize(Perf
    | where ObjectName == 'K8SContainer'
    // Exact match, consistent with the cpuRequestNanoCores / cpuLimitNanoCores filters.
    | where CounterName == 'cpuUsageNanoCores'
    // Keep only the last two '/'-separated segments of InstanceName so the key
    // has the same "<pod UID>/<container name>" shape that is joined against
    // KubePodInventory.ContainerName below.
    | extend ContainerNameParts = split(InstanceName, '/')
    | extend ContainerNamePartCount = array_length(ContainerNameParts)
    | extend
        PodUIDIndex = ContainerNamePartCount - 2,
        ContainerNameIndex = ContainerNamePartCount - 1
    | extend ContainerName = strcat(ContainerNameParts[PodUIDIndex], '/', ContainerNameParts[ContainerNameIndex])
    // summarize already emits exactly TimeGenerated, ContainerName and
    // AggregatedValue, so no extra project is needed before the join.
    | summarize AggregatedValue = max(CounterValue) by bin(TimeGenerated, 15m), ContainerName
    | join kind = inner (
        KubePodInventory
        // Latest inventory record per container, so each usage row matches once.
        | summarize arg_max(TimeGenerated, *) by ContainerName
        | project Name, ContainerName, Namespace, ServiceName
        )
        on ContainerName
    | project
        TimeGenerated,
        Name,
        ServiceName,
        ContainerName,
        Namespace,
        // 1 millicore = 1,000,000 nanocores.
        CPU_mCores_Usage = AggregatedValue / 1000000);
// cpurequest: most recent CPU request per container, in millicores, with the
// pod name and namespace from KubePodInventory.
let cpurequest = materialize(Perf
    | where ObjectName == 'K8SContainer'
    | where CounterName == 'cpuRequestNanoCores'
    // Keep only the last two '/'-separated segments of InstanceName so the key
    // matches KubePodInventory.ContainerName.
    | extend ContainerNameParts = split(InstanceName, '/')
    | extend ContainerNamePartCount = array_length(ContainerNameParts)
    | extend
        PodUIDIndex = ContainerNamePartCount - 2,
        ContainerNameIndex = ContainerNamePartCount - 1
    | extend ContainerName = strcat(ContainerNameParts[PodUIDIndex], '/', ContainerNameParts[ContainerNameIndex])
    // Reduce to ONE row per container (the latest sample). Without this every
    // Perf sample is joined to every inventory record for the same container,
    // multiplying the row count.
    | summarize arg_max(TimeGenerated, CounterValue) by ContainerName
    // Drop TimeGenerated so it cannot shadow the usage timestamp downstream.
    | project ContainerName, CounterValue
    | join kind = inner (
        KubePodInventory
        // Latest inventory record per container, for the same reason.
        | summarize arg_max(TimeGenerated, *) by ContainerName
        | project Name, ContainerName, Namespace
        )
        on ContainerName
    // 1 millicore = 1,000,000 nanocores.
    | project Name, Namespace, ContainerName, CpuReq_in_mcores = CounterValue / 1000000);
// cpulimits: most recent CPU limit per container, in millicores, enriched
// with pod metadata from KubePodInventory.
let cpulimits = materialize(Perf
    | where ObjectName == 'K8SContainer'
    | where CounterName == 'cpuLimitNanoCores'
    // Keep only the last two '/'-separated segments of InstanceName so the key
    // matches KubePodInventory.ContainerName.
    | extend ContainerNameParts = split(InstanceName, '/')
    | extend ContainerNamePartCount = array_length(ContainerNameParts)
    | extend
        PodUIDIndex = ContainerNamePartCount - 2,
        ContainerNameIndex = ContainerNamePartCount - 1
    | extend ContainerName = strcat(ContainerNameParts[PodUIDIndex], '/', ContainerNameParts[ContainerNameIndex])
    // Reduce to ONE row per container (the latest sample); any repeated limit
    // samples in the query time range would otherwise each produce a joined row.
    | summarize arg_max(TimeGenerated, CounterValue) by ContainerName
    // Drop TimeGenerated so it cannot shadow the usage timestamp downstream.
    | project ContainerName, CpuNanoCoreLimit = CounterValue
    | join kind = inner (
        KubePodInventory
        // Latest inventory record per container.
        | summarize arg_max(TimeGenerated, *) by ContainerName
        | project Name, ContainerName, Namespace, ServiceName
        )
        on ContainerName
    | project
        Name,
        ServiceName,
        Namespace,
        ContainerName,
        // 1 millicore = 1,000,000 nanocores.
        CPU_mCores_Limit = CpuNanoCoreLimit / 1000000);
// Combine limit, request and usage per container and compute utilization as a
// percentage of the CPU limit, newest rows first.
cpulimits
// innerunique is the join operator's default kind; it is spelled out here so
// the de-duplication of the left side by ContainerName is visible to readers.
| join kind = innerunique cpurequest on ContainerName
| join kind = innerunique cpuusage on ContainerName
// Usage and limit are both in millicores, so the ratio is unit-free.
| extend Cpu_Perct_utilization = round((CPU_mCores_Usage / CPU_mCores_Limit) * 100, 2)
// The unused CName split and the intermediate "order by" were dropped: the
// former never reached the output and the latter was overridden by the final sort.
| project
    TimeGenerated,
    Namespace,
    ServiceName,
    PodName = Name,
    CPU_mCores_Usage,
    CPU_mCores_Limit,
    CpuReq_in_mcores,
    Cpu_Perct_utilization
| sort by TimeGenerated desc
However, I would like to modify the query a little: I want to get an alert only when utilization exceeds the threshold 3 consecutive times within 30 minutes (keeping the evaluation frequency at 10 minutes). Please advise.
3 Replies
- loadedlouie270Copper Contributor
Use the "violations" setting at the bottom of the alert rule.
It should have you covered.
- loadedlouie270Copper Contributor
Try using this
It specifies how many "violations" of the threshold must occur within the time window for the alert to trigger. Hope it helps.
Keep in mind that for this to work you need at least as many data points in the results as the number of violations you configure.
Example: 3 violations means at least three 5-minute time spans, each with values aggregated over 5 minutes.
Hopefully this will make sense. How about this:
// Returns containers whose peak CPU utilization exceeded 95% of their CPU
// limit in at least three 10-minute bins of the same 30-minute window.

// cpuusage: peak CPU usage (nanocores) per container per 10-minute bin.
let cpuusage = materialize(
    Perf
    | where ObjectName == 'K8SContainer'
    // Exact match, consistent with the cpuLimitNanoCores filter below.
    | where CounterName == 'cpuUsageNanoCores'
    // Keep only the last two '/'-separated segments of InstanceName as the key.
    | extend ContainerNameParts = split(InstanceName, '/')
    | extend ContainerNamePartCount = array_length(ContainerNameParts)
    | extend
        PodUIDIndex = ContainerNamePartCount - 2,
        ContainerNameIndex = ContainerNamePartCount - 1
    | extend ContainerName = strcat(ContainerNameParts[PodUIDIndex], '/', ContainerNameParts[ContainerNameIndex])
    | summarize AggregatedValue = max(CounterValue) by bin(TimeGenerated, 10m), ContainerName
    );
// cpulimits: most recent CPU limit (nanocores), exactly one row per container.
let cpulimits = materialize(
    Perf
    | where ObjectName == 'K8SContainer'
    | where CounterName == 'cpuLimitNanoCores'
    | extend ContainerNameParts = split(InstanceName, '/')
    | extend ContainerNamePartCount = array_length(ContainerNameParts)
    | extend
        PodUIDIndex = ContainerNamePartCount - 2,
        ContainerNameIndex = ContainerNamePartCount - 1
    | extend ContainerName = strcat(ContainerNameParts[PodUIDIndex], '/', ContainerNameParts[ContainerNameIndex])
    // One row per container is required: with kind=inner, every extra limit
    // sample would duplicate each usage bin and inflate count() below, letting
    // a single violating bin satisfy "count_ >= 3".
    | summarize arg_max(TimeGenerated, CounterValue) by ContainerName
    // Drop TimeGenerated so the usage bin timestamp is the only one after the join.
    | project ContainerName, CpuNanoCoreLimit = CounterValue
    );
let cpu_threshold_exceeded =
    cpuusage
    | join kind = inner cpulimits on ContainerName
    // 1 millicore = 1,000,000 nanocores.
    | extend CPU_mCores_Usage = AggregatedValue / 1000000
    | extend CPU_mCores_Limit = CpuNanoCoreLimit / 1000000
    | extend Cpu_Perct_utilization = round((CPU_mCores_Usage / CPU_mCores_Limit) * 100, 2)
    | where Cpu_Perct_utilization > 95
    // Each remaining row is one violating 10-minute bin, so count_ is the
    // number of violating bins inside the 30-minute window (at most 3).
    | summarize count() by ContainerName, bin(TimeGenerated, 30m)
    | where count_ >= 3
    | project ContainerName, TimeGenerated, count_;
cpu_threshold_exceeded