From b6469d0210b0d1d71ddf527656702e6050616a1e Mon Sep 17 00:00:00 2001 From: kusumachalasani Date: Wed, 24 Jul 2024 00:00:37 +0530 Subject: [PATCH 1/3] update queries Signed-off-by: kusumachalasani --- .../dataSourceQueries/DataSourceQueries.java | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/main/java/com/autotune/common/data/dataSourceQueries/DataSourceQueries.java b/src/main/java/com/autotune/common/data/dataSourceQueries/DataSourceQueries.java index be8b758ab..24f09a289 100644 --- a/src/main/java/com/autotune/common/data/dataSourceQueries/DataSourceQueries.java +++ b/src/main/java/com/autotune/common/data/dataSourceQueries/DataSourceQueries.java @@ -10,14 +10,15 @@ public enum PromQLQuery { NAMESPACE_QUERY("sum by (namespace) (kube_namespace_status_phase{phase=\"Active\"})"), WORKLOAD_INFO_QUERY("sum by (namespace, workload, workload_type) (namespace_workload_pod:kube_pod_owner:relabel)"), CONTAINER_INFO_QUERY("sum by (container, image, workload) (kube_pod_container_info * on(pod) group_left(workload, workload_type) (namespace_workload_pod:kube_pod_owner:relabel))"), - CPU_USAGE("%s by(container, namespace)(%s_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!='', container!='POD', pod!='',namespace=\"%s\",container=\"%s\" }[%sm]))"), - CPU_THROTTLE("%s by(container,namespace) (rate(container_cpu_cfs_throttled_seconds_total{container!='', container!='POD', pod!='',namespace=\"%s\",container=\"%s\"}[%sm]))"), - CPU_LIMIT("%s by(container,namespace) (kube_pod_container_resource_limits{container!='', container!='POD', pod!='', resource='cpu', unit='core',namespace=\"%s\",container=\"%s\"} * on(pod, namespace) group_left max by (container,pod, namespace) (kube_pod_status_phase{phase='Running'}))"), - CPU_REQUEST("%s by(container, namespace) (kube_pod_container_resource_requests{container!='', container!='POD', pod!='', resource='cpu', unit='core' ,namespace=\"%s\",container=\"%s\"} 
* on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))"), - MEMORY_USAGE("%s by(container, namespace) (%s_over_time(container_memory_working_set_bytes{container!='', container!='POD', pod!='',namespace=\"%s\",container=\"%s\" }[%sm]))"), - MEMORY_RSS("%s by(container, namespace) (%s_over_time(container_memory_rss{container!='', container!='POD', pod!='',namespace=\"%s\",container=\"%s\"}[%sm]))"), - MEMORY_LIMIT("%s by(container,namespace) (kube_pod_container_resource_limits{container!='', container!='POD', pod!='', resource='memory', unit='byte', namespace=\"%s\",container=\"%s\" } * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))"), - MEMORY_REQUEST("%s by(container,namespace) (kube_pod_container_resource_requests{container!='', container!='POD', pod!='', resource='memory', unit='byte',namespace=\"%s\",container=\"%s\"} * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))"), + + CPU_USAGE("%s by(namespace,container,workload,workload_type,owner_kind)(%s_over_time(( node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!='', container!='POD', pod!='',namespace=\"%s\",container=\"%s\"} * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!='',workload=\"%s\",workload_type=\"%s\"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=''}[15m])))[%sm:]))"), + CPU_THROTTLE("%s by(container,namespace, workload, workload_type, owner_kind)(%s_over_time((rate(container_cpu_cfs_throttled_seconds_total{container!='', container!='POD', pod!='',namespace=\"%s\",container=\"%s\"}[15m]) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) 
(max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!='',workload=\"%s\",workload_type=\"%s\"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=''}[15m])))[%sm:]))"), + CPU_LIMIT("%s by(container,namespace,workload,workload_type,owner_kind) (kube_pod_container_resource_limits{container!='', container!='POD', pod!='', resource='cpu', unit='core',namespace=\"%s\",container=\"%s\"} * on(pod, namespace) group_left max by (container,pod, namespace) (kube_pod_status_phase{phase='Running'}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!='',workload=\"%s\",workload_type=\"%s\"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=''}[15m])))"), + CPU_REQUEST("%s by(container,namespace,workload,workload_type,owner_kind) (kube_pod_container_resource_requests{container!='', container!='POD', pod!='', resource='cpu', unit='core',namespace=\"%s\",container=\"%s\"} * on(pod, namespace) group_left max by (container,pod, namespace) (kube_pod_status_phase{phase='Running'}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!='',workload=\"%s\",workload_type=\"%s\"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=''}[15m])))"), + MEMORY_USAGE("%s by(namespace,container,workload,workload_type,owner_kind) (min_over_time(( container_memory_working_set_bytes{container!='', container!='POD', pod!='',namespace=\"%s\",container=\"%s\"} * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!='',workload=\"%s\",workload_type=\"%s\"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) 
(max_over_time(kube_pod_owner{pod!=''}[15m])))[%sm:]))"), + MEMORY_RSS("%s by(namespace,container,workload,workload_type,owner_kind) (min_over_time((container_memory_rss{container!='', container!='POD', pod!='',namespace=\"%s\",container=\"%s\"} * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload=\"%s\",workload_type=\"%s\"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[%sm:]))"), + MEMORY_LIMIT("%s by(container,namespace,workload,workload_type,owner_kind) (kube_pod_container_resource_limits{container!='', container!='POD', pod!='', resource='memory', unit='core',namespace=\"%s\",container=\"%s\"} * on(pod, namespace) group_left max by (container,pod, namespace) (kube_pod_status_phase{phase='Running'}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload=\"%s\",workload_type=\"%s\"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))"), + MEMORY_REQUEST("%s by(container,namespace,workload,workload_type,owner_kind) (kube_pod_container_resource_requests{container!='', container!='POD', pod!='', resource='memory', unit='core',namespace=\"%s\",container=\"%s\"} * on(pod, namespace) group_left max by (container,pod, namespace) (kube_pod_status_phase{phase='Running'}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload=\"%s\",workload_type=\"%s\"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))"), MAX_DATE("max(container_cpu_usage_seconds_total{container=\"%s\",namespace=\"%s\"} > 0)"); private final String query; From 
b31c944e44900534eb9f69b2be47b067e5a38e0a Mon Sep 17 00:00:00 2001 From: kusumachalasani Date: Wed, 24 Jul 2024 10:52:36 +0530 Subject: [PATCH 2/3] update queries Signed-off-by: kusumachalasani --- ...esource_optimization_local_monitoring.yaml | 59 ++++++++++--------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/manifests/autotune/performance-profiles/resource_optimization_local_monitoring.yaml b/manifests/autotune/performance-profiles/resource_optimization_local_monitoring.yaml index 1e233720b..0de16a4bd 100644 --- a/manifests/autotune/performance-profiles/resource_optimization_local_monitoring.yaml +++ b/manifests/autotune/performance-profiles/resource_optimization_local_monitoring.yaml @@ -23,11 +23,11 @@ slo: aggregation_functions: - function: 'avg' - query: 'avg(kube_pod_container_resource_requests{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container="$CONTAINER_NAME$", namespace="$NAMESPACE", resource="cpu", unit="core"})' + query: 'avg by(container,namespace,workload,workload_type,owner_kind) ((kube_pod_container_resource_requests{container!="", container!="POD", pod!="", resource="cpu", unit="core", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"} * on(pod, namespace) group_left max by (container,pod, namespace) (kube_pod_status_phase{phase="Running"}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m]))))' # Show sum of cpu requests in bytes for a container in a deployment - function: 'sum' - query: 'sum(kube_pod_container_resource_requests{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container="$CONTAINER_NAME$", namespace="$NAMESPACE", resource="cpu", unit="core"})' + query: 'sum by(container,namespace,workload,workload_type,owner_kind) 
((kube_pod_container_resource_requests{container!="", container!="POD", pod!="", resource="cpu", unit="core", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"} * on(pod, namespace) group_left max by (container,pod, namespace) (kube_pod_status_phase{phase="Running"}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m]))))' # CPU Limit @@ -39,11 +39,11 @@ slo: aggregation_functions: - function: avg - query: 'avg(kube_pod_container_resource_limits{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container="$CONTAINER_NAME$", namespace="$NAMESPACE", resource="cpu", unit="core"})' + query: 'avg by(container,namespace,workload,workload_type,owner_kind) ((kube_pod_container_resource_limits{container!="", container!="POD", pod!="", resource="cpu", unit="core", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"} * on(pod, namespace) group_left max by (container,pod, namespace) (kube_pod_status_phase{phase="Running"}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m]))))' # Show sum of cpu limits in bytes for a container in a deployment - function: sum - query: 'sum(kube_pod_container_resource_limits{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container="$CONTAINER_NAME$", namespace="$NAMESPACE$", resource="cpu", unit="core"})' + query: 'sum by(container,namespace,workload,workload_type,owner_kind) ((kube_pod_container_resource_limits{container!="", container!="POD", pod!="", resource="cpu", unit="core", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"} 
* on(pod, namespace) group_left max by (container,pod, namespace) (kube_pod_status_phase{phase="Running"}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m]))))' # CPU Usage @@ -65,45 +65,45 @@ slo: # For openshift versions <=4.8 aggregation_functions: - function: avg - query: 'avg(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container=”$CONTAINER_NAME$”}[15m]))' + query: 'avg by(namespace,container,workload,workload_type,owner_kind) (avg_over_time(((irate(container_cpu_usage_seconds_total{container!="", container!="POD", pod!="", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"}[5m])) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="", workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' versions: "<=4.8" # For openshift versions >=4.9 - function: avg - query: 'avg(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container=”$CONTAINER_NAME$”}[15m]))' + query: 'avg by(namespace,container,workload,workload_type,owner_kind) (avg_over_time(((node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!="", container!="POD", pod!="",container="$CONTAINER_NAME$", namespace="$NAMESPACE$"}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) 
(max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' versions: ">4.9" # Approx minimum CPU per container in a deployment # For openshift versions <=4.8 - function: min - query: 'min(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))' + query: 'min by(namespace,container,workload,workload_type,owner_kind) (min_over_time(((irate(container_cpu_usage_seconds_total{container!="", container!="POD", pod!="", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"}[5m])) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="", workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' versions: "<=4.8" # For openshift versions >=4.9 - function: min - query: 'min(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))' + query: 'min by(namespace,container,workload,workload_type,owner_kind) (min_over_time(((node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!="", container!="POD", pod!="",container="$CONTAINER_NAME$", namespace="$NAMESPACE$"}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' versions: ">4.9" 
# Approx maximum CPU per container in a deployment # For openshift versions <=4.8 - function: max - query: 'max(max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))' + query: 'max by(namespace,container,workload,workload_type,owner_kind) (max_over_time(((irate(container_cpu_usage_seconds_total{container!="", container!="POD", pod!="", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"}[5m])) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="", workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' versions: "<=4.8" # For openshift versions >=4.9 - function: max - query: 'max(max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))' + query: 'max by(namespace,container,workload,workload_type,owner_kind) (max_over_time(((node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!="", container!="POD", pod!="",container="$CONTAINER_NAME$", namespace="$NAMESPACE$"}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' versions: ">4.9" # Sum of CPU usage for a container in all pods of a deployment # For openshift versions <=4.8 - function: sum - query: 'sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", 
namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))' + query: 'sum by(namespace,container,workload,workload_type,owner_kind) (avg_over_time(((irate(container_cpu_usage_seconds_total{container!="", container!="POD", pod!="", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"}[5m])) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="", workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' versions: "<=4.8" # For openshift versions >=4.9 - function: sum - query: 'sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))' + query: 'sum by(namespace,container,workload,workload_type,owner_kind) (avg_over_time(((node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!="", container!="POD", pod!="",container="$CONTAINER_NAME$", namespace="$NAMESPACE$"}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' versions: ">4.9" @@ -116,15 +116,20 @@ slo: aggregation_functions: # Average CPU throttling per container in a deployment - function: avg - query: 'avg(rate(container_cpu_cfs_throttled_seconds_total{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container=”$CONTAINER_NAME$”}[15m]))' + query: 'avg by(namespace,container,workload,workload_type,owner_kind) (avg_over_time((rate(container_cpu_cfs_throttled_seconds_total{container!="", container!="POD", pod!="",container="$CONTAINER_NAME$", 
namespace="$NAMESPACE$"}[15m]) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' # Maximum CPU throttling per container in a deployment - function: max - query: 'max(rate(container_cpu_cfs_throttled_seconds_total{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container=”$CONTAINER_NAME$”}[15m]))' + query: 'max by(namespace,container,workload,workload_type,owner_kind) (max_over_time((rate(container_cpu_cfs_throttled_seconds_total{container!="", container!="POD", pod!="",container="$CONTAINER_NAME$", namespace="$NAMESPACE$"}[15m]) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' + + # Min of CPU throttling for a container in all pods of a deployment + - function: min + query: 'min by(namespace,container,workload,workload_type,owner_kind) (min_over_time((rate(container_cpu_cfs_throttled_seconds_total{container!="", container!="POD", pod!="",container="$CONTAINER_NAME$", namespace="$NAMESPACE$"}[15m]) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' # Sum of CPU throttling for a container in all pods of a deployment - function: sum - query: 
'sum(rate(container_cpu_cfs_throttled_seconds_total{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container=”$CONTAINER_NAME$”}[15m]))' + query: 'sum by(namespace,container,workload,workload_type,owner_kind) (avg_over_time((rate(container_cpu_cfs_throttled_seconds_total{container!="", container!="POD", pod!="",container="$CONTAINER_NAME$", namespace="$NAMESPACE$"}[15m]) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' + @@ -139,11 +144,11 @@ slo: aggregation_functions: - function: avg - query: 'avg(kube_pod_container_resource_requests{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource="memory", unit="byte"})' + query: 'avg by(container,namespace,workload,workload_type,owner_kind) ((kube_pod_container_resource_requests{container!="", container!="POD", pod!="", resource="memory", unit="byte", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"} * on(pod, namespace) group_left max by (container,pod, namespace) (kube_pod_status_phase{phase="Running"}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m]))))' # Show sum of memory requests in bytes for a container in a deployment - function: sum - query: 'sum(kube_pod_container_resource_requests{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource="memory", unit="byte"})' + query: 'sum by(container,namespace,workload,workload_type,owner_kind) 
((kube_pod_container_resource_requests{container!="", container!="POD", pod!="", resource="memory", unit="byte", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"} * on(pod, namespace) group_left max by (container,pod, namespace) (kube_pod_status_phase{phase="Running"}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m]))))' # Memory Limit @@ -155,11 +160,11 @@ slo: aggregation_functions: - function: avg - query: 'avg(kube_pod_container_resource_limits{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container="$CONTAINER_NAME$", namespace="$NAMESPACE", resource="memory", unit="byte"})' + query: 'avg by(container,namespace,workload,workload_type,owner_kind) ((kube_pod_container_resource_limits{container!="", container!="POD", pod!="", resource="memory", unit="byte", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"} * on(pod, namespace) group_left max by (container,pod, namespace) (kube_pod_status_phase{phase="Running"}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m]))))' # Show sum of memory limits in bytes for a container in a deployment - function: sum - query: 'sum(kube_pod_container_resource_limits{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource="memory", unit="byte"})' + query: 'sum by(container,namespace,workload,workload_type,owner_kind) ((kube_pod_container_resource_limits{container!="", container!="POD", pod!="", resource="memory", unit="byte", container="$CONTAINER_NAME$", 
namespace="$NAMESPACE$"} * on(pod, namespace) group_left max by (container,pod, namespace) (kube_pod_status_phase{phase="Running"}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m]))))' # Memory Usage @@ -171,19 +176,19 @@ slo: aggregation_functions: - function: avg - query: 'avg(avg_over_time(container_memory_working_set_bytes{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))' + query: 'avg by(namespace,container,workload,workload_type,owner_kind) (avg_over_time((container_memory_working_set_bytes{container!="", container!="POD", pod!="", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"} * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' # Approx minimum memory per container in a deployment - function: min - query: 'min(min_over_time(container_memory_working_set_bytes{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container="$CONTAINER_NAME$"}[15m]))' + query: 'min by(namespace,container,workload,workload_type,owner_kind) (min_over_time((container_memory_working_set_bytes{container!="", container!="POD", pod!="", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"} * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) 
(max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' # Approx maximum memory per container in a deployment - function: max - query: 'max(max_over_time(container_memory_working_set_bytes{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container="$CONTAINER_NAME$"}[15m]))' + query: 'max by(namespace,container,workload,workload_type,owner_kind) (max_over_time((container_memory_working_set_bytes{container!="", container!="POD", pod!="", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"} * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' # Sum of memory usage for a contianer in all pods of a deployment - function: sum - query: 'sum(avg_over_time(container_memory_working_set_bytes{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container="$CONTAINER_NAME$"}[15m]))' + query: 'sum by(namespace,container,workload,workload_type,owner_kind) (avg_over_time((container_memory_working_set_bytes{container!="", container!="POD", pod!="", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"} * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' # 2.4 Memory RSS @@ -195,17 +200,17 @@ slo: aggregation_functions: # Average memory RSS per container in a deployment - function: avg - query: 'avg(avg_over_time(container_memory_rss{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))' + query: 'avg by(namespace,container,workload,workload_type,owner_kind) 
(avg_over_time((container_memory_rss{container!="", container!="POD", pod!="", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"} * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' # Approx minimum memory RSS per container in a deployment - function: min - query: 'min(min_over_time(container_memory_rss{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container="$CONTAINER_NAME$"}[15m]))' + query: 'min by(namespace,container,workload,workload_type,owner_kind) (min_over_time((container_memory_rss{container!="", container!="POD", pod!="", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"} * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' # Approx maximum memory RSS per container in a deployment - function: max - query: 'max(max_over_time(container_memory_rss{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container="$CONTAINER_NAME$"}[15m]))' + query: 'max by(namespace,container,workload,workload_type,owner_kind) (max_over_time((container_memory_rss{container!="", container!="POD", pod!="", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"} * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' # Sum of memory 
RSS for a contianer in all pods of a deployment - function: sum - query: 'sum(avg_over_time(container_memory_rss{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))' + query: 'sum by(namespace,container,workload,workload_type,owner_kind) (avg_over_time((container_memory_rss{container!="", container!="POD", pod!="", container="$CONTAINER_NAME$", namespace="$NAMESPACE$"} * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload="$WORKLOAD$", workload_type="$WORKLOAD_TYPE$"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[15m:]))' From 32e12a469c9fdf5e8f5702728bea2f875ad1bbc9 Mon Sep 17 00:00:00 2001 From: kusumachalasani Date: Fri, 26 Jul 2024 13:37:47 +0530 Subject: [PATCH 3/3] Update queries --- .../common/data/dataSourceQueries/DataSourceQueries.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/autotune/common/data/dataSourceQueries/DataSourceQueries.java b/src/main/java/com/autotune/common/data/dataSourceQueries/DataSourceQueries.java index 24f09a289..c6b428b8d 100644 --- a/src/main/java/com/autotune/common/data/dataSourceQueries/DataSourceQueries.java +++ b/src/main/java/com/autotune/common/data/dataSourceQueries/DataSourceQueries.java @@ -19,7 +19,7 @@ public enum PromQLQuery { MEMORY_RSS("%s by(namespace,container,workload,workload_type,owner_kind) (min_over_time((container_memory_rss{container!='', container!='POD', pod!='',namespace=\"%s\",container=\"%s\"} * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload=\"%s\",workload_type=\"%s\"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))[%sm:]))"), MEMORY_LIMIT("%s 
by(container,namespace,workload,workload_type,owner_kind) (kube_pod_container_resource_limits{container!='', container!='POD', pod!='', resource='memory', unit='core',namespace=\"%s\",container=\"%s\"} * on(pod, namespace) group_left max by (container,pod, namespace) (kube_pod_status_phase{phase='Running'}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload=\"%s\",workload_type=\"%s\"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))"), MEMORY_REQUEST("%s by(container,namespace,workload,workload_type,owner_kind) (kube_pod_container_resource_requests{container!='', container!='POD', pod!='', resource='memory', unit='core',namespace=\"%s\",container=\"%s\"} * on(pod, namespace) group_left max by (container,pod, namespace) (kube_pod_status_phase{phase='Running'}) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!="",workload=\"%s\",workload_type=\"%s\"}[15m])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=""}[15m])))"), - MAX_DATE("max(container_cpu_usage_seconds_total{container=\"%s\",namespace=\"%s\"} > 0)"); + MAX_DATE("last_over_time(container_cpu_usage_seconds_total{container=\"%s\",namespace=\"%s\"}[15d:]) * on(pod) group_left(workload, workload_type) max by (pod, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!='',workload=\"%s\",workload_type=\"%s\"}[15d])) * on(pod) group_left(owner_kind) max by (pod, owner_kind) (max_over_time(kube_pod_owner{pod!=''}[15d]))"); private final String query; PromQLQuery(String query) {