PageRenderTime 45ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/config/prometheus/common_metrics.yml

https://gitlab.com/innerwhisper/gitlab-ce
YAML | 214 lines | 212 code | 0 blank | 2 comment | 0 complexity | fad73b41a8591062860181c9a643cc64 MD5 | raw file
  # Top-level dashboard definition. `panel_groups` is a list; each entry is a
  # named group of chart panels, and each panel carries one or more Prometheus
  # queries under `metrics`.
  dashboard: 'Environment metrics'
  priority: 1
  panel_groups:
  # NGINX Ingress metrics for pre-0.16.0 versions.
  # All queries in this group read the nginx_upstream_* series and select
  # upstreams named "<kube_namespace>-<ci_environment_slug>-...".
  - group: Response metrics (NGINX Ingress VTS)
    priority: 10
    panels:
    - title: 'Throughput'
      type: 'area-chart'
      y_label: 'Requests / Sec'
      weight: 1
      metrics:
      - id: response_metrics_nginx_ingress_throughput_status_code
        # Per-second response rate over a 2m window, one series per status_code.
        query_range: 'sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) by (status_code)'
        label: Status Code
        unit: req / sec
    - title: 'Latency'
      type: 'area-chart'
      y_label: 'Latency (ms)'
      weight: 1
      metrics:
      - id: response_metrics_nginx_ingress_latency_pod_average
        # Average of the per-upstream nginx_upstream_response_msecs_avg series.
        query_range: 'avg(nginx_upstream_response_msecs_avg{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"})'
        label: Pod average (ms)
        unit: ms
    - title: 'HTTP Error Rate'
      type: 'area-chart'
      y_label: 'HTTP Errors (%)'
      weight: 1
      metrics:
      - id: response_metrics_nginx_ingress_http_error_rate
        # 5xx response rate divided by total response rate, scaled by 100 to
        # give a percentage.
        query_range: 'sum(rate(nginx_upstream_responses_total{status_code="5xx", upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) / sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) * 100'
        label: 5xx Errors (%)
        unit: '%'
  # NGINX Ingress metrics for post-0.16.0 versions.
  # All queries in this group read the nginx_ingress_controller_* series,
  # filtered by namespace and by ingress names containing the environment slug.
  - group: Response metrics (NGINX Ingress)
    priority: 10
    panels:
    - title: 'Throughput'
      type: 'area-chart'
      y_label: 'Requests / Sec'
      weight: 1
      metrics:
      - id: response_metrics_nginx_ingress_16_throughput_status_code
        # label_replace derives a "status_code" label of the form "Nxx" from
        # the first character of the three-character "status" label, then the
        # rates are summed per derived class.
        query_range: 'sum(label_replace(rate(nginx_ingress_controller_requests{namespace="%{kube_namespace}",ingress=~".*%{ci_environment_slug}.*"}[2m]), "status_code", "${1}xx", "status", "(.)..")) by (status_code)'
        label: Status Code
        unit: req / sec
    - title: 'Latency'
      type: 'area-chart'
      y_label: 'Latency (ms)'
      weight: 1
      metrics:
      - id: response_metrics_nginx_ingress_16_latency_pod_average
        # rate(_sum) / rate(_count) gives the mean upstream latency in seconds
        # over the 2m window; "* 1000" converts it to milliseconds.
        query_range: 'sum(rate(nginx_ingress_controller_ingress_upstream_latency_seconds_sum{namespace="%{kube_namespace}",ingress=~".*%{ci_environment_slug}.*"}[2m])) / sum(rate(nginx_ingress_controller_ingress_upstream_latency_seconds_count{namespace="%{kube_namespace}",ingress=~".*%{ci_environment_slug}.*"}[2m])) * 1000'
        label: Pod average (ms)
        unit: ms
    - title: 'HTTP Error Rate'
      type: 'area-chart'
      y_label: 'HTTP Errors (%)'
      weight: 1
      metrics:
      - id: response_metrics_nginx_ingress_16_http_error_rate
        # Requests whose status starts with "5" divided by all requests,
        # scaled by 100 to give a percentage.
        query_range: 'sum(rate(nginx_ingress_controller_requests{status=~"5.*",namespace="%{kube_namespace}",ingress=~".*%{ci_environment_slug}.*"}[2m])) / sum(rate(nginx_ingress_controller_requests{namespace="%{kube_namespace}",ingress=~".*%{ci_environment_slug}.*"}[2m])) * 100'
        label: 5xx Errors (%)
        unit: '%'
  # HAProxy frontend metrics, filtered by the %{environment_filter} label
  # matcher placeholder.
  - group: Response metrics (HA Proxy)
    priority: 10
    panels:
    - title: 'Throughput'
      type: 'area-chart'
      y_label: 'Requests / Sec'
      weight: 1
      metrics:
      - id: response_metrics_ha_proxy_throughput_status_code
        # Per-second request rate over a 2m window, grouped by "code".
        # NOTE(review): confirm haproxy_frontend_http_requests_total actually
        # carries a "code" label; if only the *_responses_total series does,
        # this grouping collapses to a single unlabeled series.
        query_range: 'sum(rate(haproxy_frontend_http_requests_total{%{environment_filter}}[2m])) by (code)'
        label: Status Code
        unit: req / sec
    - title: 'HTTP Error Rate'
      type: 'area-chart'
      y_label: 'Error Rate (%)'
      weight: 1
      metrics:
      - id: response_metrics_ha_proxy_http_error_rate
        # 5xx response rate divided by total response rate. The trailing
        # "* 100" converts the 0-1 ratio into the percentage that the title,
        # y_label, label and unit all advertise (same form as the NGINX
        # Ingress error-rate queries in this file).
        query_range: 'sum(rate(haproxy_frontend_http_responses_total{code="5xx",%{environment_filter}}[2m])) / sum(rate(haproxy_frontend_http_responses_total{%{environment_filter}}[2m])) * 100'
        label: HTTP Errors (%)
        unit: '%'
  # AWS ELB metrics (aws_elb_* series), filtered by the %{environment_filter}
  # label matcher placeholder.
  - group: Response metrics (AWS ELB)
    priority: 10
    panels:
    - title: 'Throughput'
      type: 'area-chart'
      y_label: 'Requests / Sec'
      weight: 1
      metrics:
      - id: response_metrics_aws_elb_throughput_requests
        # Request-count sum divided by 60 to express it per second.
        # NOTE(review): presumably assumes each sample covers a 60-second
        # period - confirm against the exporter configuration.
        query_range: 'sum(aws_elb_request_count_sum{%{environment_filter}}) / 60'
        label: Total (req/sec)
        unit: req / sec
    - title: 'Latency'
      type: 'area-chart'
      y_label: 'Latency (ms)'
      weight: 1
      metrics:
      - id: response_metrics_aws_elb_latency_average
        # "* 1000" scales the averaged latency to the milliseconds shown on
        # the axis (presumably the source series is in seconds - confirm).
        query_range: 'avg(aws_elb_latency_average{%{environment_filter}}) * 1000'
        label: Average (ms)
        unit: ms
    - title: 'HTTP Error Rate'
      type: 'area-chart'
      y_label: 'Error Rate (%)'
      weight: 1
      metrics:
      - id: response_metrics_aws_elb_http_error_rate
        # Backend 5xx count divided by total request count. The trailing
        # "* 100" converts the 0-1 ratio into the percentage that the
        # y_label, label and unit all advertise (same form as the NGINX
        # Ingress error-rate queries in this file).
        query_range: 'sum(aws_elb_httpcode_backend_5_xx_sum{%{environment_filter}}) / sum(aws_elb_request_count_sum{%{environment_filter}}) * 100'
        label: HTTP Errors (%)
        unit: '%'
  # Standalone NGINX metrics (nginx_server_* series), filtered by the
  # %{environment_filter} label matcher placeholder.
  - group: Response metrics (NGINX)
    priority: 10
    panels:
    - title: 'Throughput'
      type: 'area-chart'
      y_label: 'Requests / Sec'
      weight: 1
      metrics:
      - id: response_metrics_nginx_throughput_status_code
        # Per-second request rate over a 2m window per "code", skipping the
        # "*" and "_" server zones.
        query_range: 'sum(rate(nginx_server_requests{server_zone!="*", server_zone!="_", %{environment_filter}}[2m])) by (code)'
        label: Status Code
        unit: req / sec
    - title: 'Latency'
      type: 'area-chart'
      y_label: 'Latency (ms)'
      weight: 1
      metrics:
      - id: response_metrics_nginx_latency
        query_range: 'avg(nginx_server_requestMsec{%{environment_filter}})'
        label: Upstream (ms)
        unit: ms
    - title: 'HTTP Error Rate (Errors / Sec)'
      type: 'area-chart'
      y_label: 'HTTP 500 Errors / Sec'
      weight: 1
      metrics:
      - id: response_metrics_nginx_http_error_rate
        # Absolute 5xx rate (errors per second), not a percentage of traffic.
        query_range: 'sum(rate(nginx_server_requests{code="5xx", %{environment_filter}}[2m]))'
        label: HTTP Errors
        unit: 'errors / sec'
  # Container-level resource usage for pods named "<ci_environment_slug>-..."
  # in %{kube_namespace}, plus a Knative invocation panel.
  # Series with container_name="POD" are excluded from every container query.
  - group: System metrics (Kubernetes)
    priority: 5
    panels:
    - title: 'Memory Usage (Total)'
      type: 'area-chart'
      y_label: 'Total Memory Used (GB)'
      weight: 4
      metrics:
      - id: system_metrics_kubernetes_container_memory_total
        # Sum per job, then average across jobs; bytes are divided by 1024
        # three times for display.
        query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) /1024/1024/1024'
        label: Total (GB)
        unit: GB
    - title: 'Core Usage (Total)'
      type: 'area-chart'
      y_label: 'Total Cores'
      weight: 3
      metrics:
      - id: system_metrics_kubernetes_container_cores_total
        # CPU-seconds per second over a 15m window, summed per job and then
        # averaged across jobs.
        query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job)'
        label: Total (cores)
        unit: 'cores'
    - title: 'Memory Usage (Pod average)'
      type: 'line-chart'
      y_label: 'Memory Used per Pod (MB)'
      weight: 2
      metrics:
      - id: system_metrics_kubernetes_container_memory_average
        # The nested character-class alternation in pod_name is a hand-written
        # negative match: it rejects a name segment that begins with "canary",
        # so canary pods are left to the dedicated canary panel.
        # Total memory is divided by the number of matching series, then
        # scaled from bytes by 1024 twice.
        query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}) without (job)) /1024/1024'
        label: Pod average (MB)
        unit: MB
    - title: 'Canary: Memory Usage (Pod Average)'
      type: 'line-chart'
      y_label: 'Memory Used per Pod (MB)'
      weight: 2
      metrics:
      - id: system_metrics_kubernetes_container_memory_average_canary
        # Same calculation as the non-canary panel, restricted to pods named
        # "<ci_environment_slug>-canary-...".
        query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}) without (job)) /1024/1024'
        label: Pod average (MB)
        unit: MB
        track: canary
    - title: 'Core Usage (Pod Average)'
      type: 'line-chart'
      y_label: 'Cores per Pod'
      weight: 1
      metrics:
      - id: system_metrics_kubernetes_container_core_usage
        # Total CPU rate divided by the number of distinct pod_name values;
        # pod names with a segment starting with "canary" are rejected by the
        # same hand-written negative match used for memory.
        query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))'
        label: Pod average (cores)
        unit: 'cores'
    - title: 'Canary: Core Usage (Pod Average)'
      type: 'line-chart'
      y_label: 'Cores per Pod'
      weight: 1
      metrics:
      - id: system_metrics_kubernetes_container_core_usage_canary
        # Same calculation as the non-canary panel, restricted to pods named
        # "<ci_environment_slug>-canary-...".
        query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))'
        label: Pod average (cores)
        unit: 'cores'
        track: canary
    - title: 'Knative function invocations'
      type: 'area-chart'
      y_label: 'Invocations'
      weight: 1
      metrics:
      - id: system_metrics_knative_function_invocation_count
        # NOTE(review): rate(...[1m]) yields a per-second value, which is then
        # divided by 3 and floored, yet the label says "invocations / minute".
        # The intent of "/3" is not evident here - confirm the scaling
        # (a per-minute figure would need "* 60").
        query_range: 'floor(sum(rate(istio_revision_request_count{destination_configuration="%{function_name}", destination_namespace="%{kube_namespace}"}[1m])/3))'
        label: invocations / minute
        unit: requests