AKS Pod resource utilization (CPU/Memory) alert

Question

Hi All, I am trying to set up an alert for AKS pod CPU/Memory utilization that fires when the maximum utilization hits a certain threshold (let's say >95%). Sample query for CPU utilization:

let cpuusage = materialize(Perf
    | where ObjectName == 'K8SContainer'
    | where CounterName has 'cpuUsageNanoCores'
    | extend ContainerNameParts = split(InstanceName, '/')
    | extend ContainerNamePartCount = array_length(ContainerNameParts)
    | extend
        PodUIDIndex = ContainerNamePartCount - 2,
        ContainerNameIndex = ContainerNamePartCount - 1
    | extend ContainerName = strcat(ContainerNameParts[PodUIDIndex], '/', ContainerNameParts[ContainerNameIndex])
    | summarize AggregatedValue=max(CounterValue) by bin(TimeGenerated, 15m), ContainerName
    | project TimeGenerated, ContainerName, AggregatedValue
    | join kind = inner (
        KubePodInventory
        | summarize arg_max(TimeGenerated, *) by ContainerName
        | project Name, ContainerName, Namespace, ServiceName
        )
        on ContainerName
    | project
        TimeGenerated,
        Name,
        ServiceName,
        ContainerName,
        Namespace,
        CPU_mCores_Usage=AggregatedValue / 1000000);
let cpurequest=materialize(Perf
    | where ObjectName == 'K8SContainer'
    | where CounterName == 'cpuRequestNanoCores'
    | extend ContainerNameParts = split(InstanceName, '/')
    | extend ContainerNamePartCount = array_length(ContainerNameParts)
    | extend
        PodUIDIndex = ContainerNamePartCount - 2,
        ContainerNameIndex = ContainerNamePartCount - 1
    | extend ContainerName = strcat(ContainerNameParts[PodUIDIndex], '/', ContainerNameParts[ContainerNameIndex])
    | project ContainerName, CounterValue
    | join kind = inner (KubePodInventory
        //| summarize arg_max(TimeGenerated, 24h) by ContainerName, Name, Namespace
        | project Name, ContainerName, Namespace
        )
        on ContainerName
    | project Name, Namespace, ContainerName, CpuReq_in_mcores=(CounterValue / 1000000));
let cpulimits = materialize(Perf
    | where ObjectName == 'K8SContainer'
    | where CounterName == 'cpuLimitNanoCores'
    | extend ContainerNameParts = split(InstanceName, '/')
    | extend ContainerNamePartCount = array_length(ContainerNameParts)
    | extend
        PodUIDIndex = ContainerNamePartCount - 2,
        ContainerNameIndex = ContainerNamePartCount - 1
    | extend ContainerName = strcat(ContainerNameParts[PodUIDIndex], '/', ContainerNameParts[ContainerNameIndex])
    | extend CpuNanoCoreLimit= CounterValue
    | project ContainerName, CpuNanoCoreLimit
    | join kind = inner (
        KubePodInventory
        | summarize arg_max(TimeGenerated, *) by ContainerName
        | project Name, ContainerName, Namespace, ServiceName
        )
        on ContainerName
    | project
        Name,
        ServiceName,
        Namespace,
        ContainerName,
        CPU_mCores_Limit=CpuNanoCoreLimit / 1000000);
cpulimits
| join cpurequest on ContainerName
| join cpuusage on ContainerName
| order by Namespace asc, ContainerName asc
| extend CName = split(ContainerName, '/')
| extend PodName= Name
| extend Cpu_Perct_utilization=round((CPU_mCores_Usage / CPU_mCores_Limit) * 100, 2)
| project
    TimeGenerated,
    Namespace,
    ServiceName,
    PodName,
    CPU_mCores_Usage,
    CPU_mCores_Limit,
    CpuReq_in_mcores,
    Cpu_Perct_utilization
| sort by TimeGenerated desc

I just want to modify the query a little: I want to get an alert only when utilization hits the maximum 3 times in a row within 30 minutes (keeping the evaluation frequency at 10 minutes). Please advise.

kidd_ip · Answer

How about this:
&nbsp;
// Peak CPU usage per container for every 10-minute bin.
// Output columns: TimeGenerated (bin start), ContainerName, AggregatedValue (raw counter value).
let cpuusage = materialize(
    Perf
    | where ObjectName == 'K8SContainer' and CounterName has 'cpuUsageNanoCores'
    // InstanceName is '/'-separated; the container key is built from its
    // second-to-last and last segments.
    | extend Segments = split(InstanceName, '/')
    | extend SegmentCount = array_length(Segments)
    | extend SecondLastIdx = SegmentCount - 2, LastIdx = SegmentCount - 1
    | extend ContainerName = strcat(Segments[SecondLastIdx], '/', Segments[LastIdx])
    // Keep only the highest sample seen in each bin.
    | summarize AggregatedValue = max(CounterValue) by bin(TimeGenerated, 10m), ContainerName
    | project TimeGenerated, ContainerName, AggregatedValue
);

// Most recent CPU limit per container: exactly one row per ContainerName.
// Output columns: ContainerName, CpuNanoCoreLimit (raw 'cpuLimitNanoCores' counter value).
let cpulimits = materialize(
    Perf
    | where ObjectName == 'K8SContainer'
    | where CounterName == 'cpuLimitNanoCores'
    // InstanceName is '/'-separated; the container key is built from its
    // second-to-last and last segments.
    | extend ContainerNameParts = split(InstanceName, '/')
    | extend ContainerNamePartCount = array_length(ContainerNameParts)
    | extend PodUIDIndex = ContainerNamePartCount - 2, ContainerNameIndex = ContainerNamePartCount - 1
    | extend ContainerName = strcat(ContainerNameParts[PodUIDIndex], '/', ContainerNameParts[ContainerNameIndex])
    // Perf may hold many limit samples per container. If they are not collapsed,
    // an inner join on ContainerName pairs every usage row with every limit
    // sample, and any count() taken after the join counts limit samples rather
    // than usage bins. Keep only the latest sample for each container.
    | summarize arg_max(TimeGenerated, CounterValue) by ContainerName
    | project ContainerName, CpuNanoCoreLimit = CounterValue
);

// Containers whose peak CPU utilization exceeded 95% of their limit in at least
// three distinct 10-minute bins that fall in the same 30-minute window.
// Output columns: ContainerName, TimeGenerated (30-minute window start),
// count_ (number of violating 10-minute bins in that window).
let cpu_threshold_exceeded = cpuusage
| join kind=inner cpulimits on ContainerName
// A zero or missing limit would make the percentage meaningless; skip such rows.
| where CpuNanoCoreLimit > 0
// Both counters are in nanocores; dividing by 1,000,000 gives millicores.
| extend CPU_mCores_Usage = AggregatedValue / 1000000
| extend CPU_mCores_Limit = CpuNanoCoreLimit / 1000000
| extend Cpu_Perct_utilization = round((CPU_mCores_Usage / CPU_mCores_Limit) * 100, 2)
| where Cpu_Perct_utilization > 95
// The inner join emits one row per matching (usage, limit) pair. Reduce to one
// row per container and 10-minute bin so count() below counts violating bins,
// not joined rows.
| distinct ContainerName, TimeGenerated
| summarize count() by ContainerName, bin(TimeGenerated, 30m)
| where count_ >= 3
| project ContainerName, TimeGenerated, count_;

cpu_threshold_exceeded
&nbsp;

loadedlouie270 · Answer

Try using this. It defines how many "violations" of the threshold must occur within the time window for the alert to trigger. Hope it helps. Keep in mind that for this to work, the results must contain at least as many data points as the number of violations you configure. Example: 3 violations requires at least three 5-minute time spans, each with values aggregated over 5 minutes. Hopefully this makes sense.

loadedlouie270 · Answer

Use the "violations" option at the bottom of the alert. It should get you covered.

Forum Discussion

AKS Pod resource utilization (CPU/Memory) alert

3 Replies

Resources