# Kubernetes APIserver

# This example shows a real service level used for Kubernetes Apiserver.
#
# The service level has 2 SLOs based on Apiserver requests/responses.
#
# We consider each request made to the server an SLI event. Let's review the SLOs:
#
# - `requests-availability`
#   - This SLO warns us when we are not correctly answering the requests of the clients (kubectl users, controllers...).
#   - SLI error: We consider a bad request (event) a request with a code >=500 or 429.
#   - SLO objective (99.9%): We are restrictive with this because we only allow 1 failed request out of every 1000.
#
# - `requests-latency`
#   - This SLO warns us when the apiserver responses are slow, as this will affect the clients (kubectl users, controllers...).
#   - SLI error: We consider a bad request (event) one whose response latency is >400ms.
#   - SLO objective (99%): We have a relaxed objective because Kubernetes has a lot of async and eventual consistency flows. In the
#                          future we could create another SLO that is less restrictive and uses the latency of the realtime requests (e.g: kubectl).
#
# `sloth generate -i ./examples/kubernetes-apiserver.yml`
#
# Spec version of this SLO document.
version: prometheus/v1
# Service that owns the SLOs declared below.
service: k8s-apiserver
# Labels attached to the service; the generated rules of both SLOs carry them.
labels:
  cluster: valhalla
  component: kubernetes
# One entry per SLO of the service.
slos:
  # Availability SLO: 99.9% of the apiserver requests must not fail.
  - name: "requests-availability"
    objective: 99.9
    description: "Warn when we are not correctly returning the requests to the clients (kubectl users, controllers...)."
    labels:
      category: availability
    sli:
      events:
        # `{{.window}}` is a template placeholder; the generated rules show it
        # replaced by each evaluation window (5m, 30m, 1h, ...).
        # Bad events: requests whose code matches 5.. or 429.
        error_query: sum(rate(apiserver_request_total{code=~"(5..|429)"}[{{.window}}]))
        # All events: every request counted by apiserver_request_total.
        total_query: sum(rate(apiserver_request_total[{{.window}}]))
    alerting:
      name: K8sApiserverAvailabilityAlert
      # Labels and annotations set here end up on both the page and the ticket alert.
      labels:
        category: availability
      annotations:
        runbook: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh"
      # Extra labels for the paging alert.
      page_alert:
        labels:
          severity: critical
      # Extra labels for the ticket alert.
      ticket_alert:
        labels:
          severity: warning

  # Latency SLO: 99% of the non-WATCH apiserver requests must fall in the
  # le="0.4" duration bucket.
  - name: "requests-latency"
    objective: 99
    description: "Warn that the apiserver responses are being slow and this will affect the clients (kubectl users, controllers...)."
    labels:
      category: latency
    sli:
      events:
        # Bad events: all non-WATCH requests minus the ones counted in the
        # le="0.4" bucket, i.e. the requests slower than 0.4 seconds.
        # `{{.window}}` is a template placeholder; the generated rules show it
        # replaced by each evaluation window (5m, 30m, 1h, ...).
        error_query: |
          (
            sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[{{.window}}]))
            -
            sum(rate(apiserver_request_duration_seconds_bucket{le="0.4",verb!="WATCH"}[{{.window}}]))
          )
        # All events: every non-WATCH request.
        total_query: sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[{{.window}}]))
    alerting:
      name: K8sApiserverLatencyAlert
      # Labels and annotations set here end up on both the page and the ticket alert.
      labels:
        category: latency
      annotations:
        runbook: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh"
      # Extra labels for the paging alert.
      page_alert:
        labels:
          severity: critical
      # Extra labels for the ticket alert.
      ticket_alert:
        labels:
          severity: warning
---
# Code generated by Sloth (dev): https://github.com/slok/sloth.
# DO NOT EDIT.

groups:
# SLI recording rules for the `requests-availability` SLO of `k8s-apiserver`.
# Every windowed rule records the same error ratio -- the rate of requests whose
# code matches "(5..|429)" divided by the rate of all requests -- and differs
# only in the range window, which is also exposed in the `sloth_window` label.
- name: sloth-slo-sli-recordings-k8s-apiserver-requests-availability
  rules:
  # Error ratio over the last 5 minutes.
  - record: slo:sli_error:ratio_rate5m
    expr: |
      (sum(rate(apiserver_request_total{code=~"(5..|429)"}[5m])))
      /
      (sum(rate(apiserver_request_total[5m])))
    labels:
      category: availability
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-availability
      sloth_service: k8s-apiserver
      sloth_slo: requests-availability
      sloth_window: 5m
  # Error ratio over the last 30 minutes.
  - record: slo:sli_error:ratio_rate30m
    expr: |
      (sum(rate(apiserver_request_total{code=~"(5..|429)"}[30m])))
      /
      (sum(rate(apiserver_request_total[30m])))
    labels:
      category: availability
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-availability
      sloth_service: k8s-apiserver
      sloth_slo: requests-availability
      sloth_window: 30m
  # Error ratio over the last hour.
  - record: slo:sli_error:ratio_rate1h
    expr: |
      (sum(rate(apiserver_request_total{code=~"(5..|429)"}[1h])))
      /
      (sum(rate(apiserver_request_total[1h])))
    labels:
      category: availability
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-availability
      sloth_service: k8s-apiserver
      sloth_slo: requests-availability
      sloth_window: 1h
  # Error ratio over the last 2 hours.
  - record: slo:sli_error:ratio_rate2h
    expr: |
      (sum(rate(apiserver_request_total{code=~"(5..|429)"}[2h])))
      /
      (sum(rate(apiserver_request_total[2h])))
    labels:
      category: availability
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-availability
      sloth_service: k8s-apiserver
      sloth_slo: requests-availability
      sloth_window: 2h
  # Error ratio over the last 6 hours.
  - record: slo:sli_error:ratio_rate6h
    expr: |
      (sum(rate(apiserver_request_total{code=~"(5..|429)"}[6h])))
      /
      (sum(rate(apiserver_request_total[6h])))
    labels:
      category: availability
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-availability
      sloth_service: k8s-apiserver
      sloth_slo: requests-availability
      sloth_window: 6h
  # Error ratio over the last day.
  - record: slo:sli_error:ratio_rate1d
    expr: |
      (sum(rate(apiserver_request_total{code=~"(5..|429)"}[1d])))
      /
      (sum(rate(apiserver_request_total[1d])))
    labels:
      category: availability
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-availability
      sloth_service: k8s-apiserver
      sloth_slo: requests-availability
      sloth_window: 1d
  # Error ratio over the last 3 days.
  - record: slo:sli_error:ratio_rate3d
    expr: |
      (sum(rate(apiserver_request_total{code=~"(5..|429)"}[3d])))
      /
      (sum(rate(apiserver_request_total[3d])))
    labels:
      category: availability
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-availability
      sloth_service: k8s-apiserver
      sloth_slo: requests-availability
      sloth_window: 3d
  # 30d ratio: mean of the recorded 5m ratio samples over 30 days
  # (sum_over_time / count_over_time) instead of a rate over the raw requests.
  # Only `sloth_window` is set explicitly on this rule.
  # NOTE(review): the remaining labels presumably carry over from the
  # ratio_rate5m series -- confirm.
  - record: slo:sli_error:ratio_rate30d
    expr: |
      sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="k8s-apiserver-requests-availability", sloth_service="k8s-apiserver", sloth_slo="requests-availability"}[30d])
      / ignoring (sloth_window)
      count_over_time(slo:sli_error:ratio_rate5m{sloth_id="k8s-apiserver-requests-availability", sloth_service="k8s-apiserver", sloth_slo="requests-availability"}[30d])
    labels:
      sloth_window: 30d
# Metadata recording rules for the `requests-availability` SLO: constants
# (objective, error budget, period) and burn rates derived from the SLI ratios.
- name: sloth-slo-meta-recordings-k8s-apiserver-requests-availability
  rules:
  # Objective as a ratio.
  # NOTE(review): 0.9990000000000001 looks like the 99.9 objective (see
  # `sloth_objective` below) divided by 100 with floating-point rounding -- confirm.
  - record: slo:objective:ratio
    expr: vector(0.9990000000000001)
    labels:
      category: availability
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-availability
      sloth_service: k8s-apiserver
      sloth_slo: requests-availability
  # Error budget: one minus the objective ratio recorded above.
  - record: slo:error_budget:ratio
    expr: vector(1-0.9990000000000001)
    labels:
      category: availability
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-availability
      sloth_service: k8s-apiserver
      sloth_slo: requests-availability
  # SLO period length in days (constant 30).
  - record: slo:time_period:days
    expr: vector(30)
    labels:
      category: availability
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-availability
      sloth_service: k8s-apiserver
      sloth_slo: requests-availability
  # Current burn rate: the 5m error ratio divided by the error budget.
  - record: slo:current_burn_rate:ratio
    expr: |
      slo:sli_error:ratio_rate5m{sloth_id="k8s-apiserver-requests-availability", sloth_service="k8s-apiserver", sloth_slo="requests-availability"}
      / on(sloth_id, sloth_slo, sloth_service) group_left
      slo:error_budget:ratio{sloth_id="k8s-apiserver-requests-availability", sloth_service="k8s-apiserver", sloth_slo="requests-availability"}
    labels:
      category: availability
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-availability
      sloth_service: k8s-apiserver
      sloth_slo: requests-availability
  # Period burn rate: the 30d error ratio divided by the error budget.
  - record: slo:period_burn_rate:ratio
    expr: |
      slo:sli_error:ratio_rate30d{sloth_id="k8s-apiserver-requests-availability", sloth_service="k8s-apiserver", sloth_slo="requests-availability"}
      / on(sloth_id, sloth_slo, sloth_service) group_left
      slo:error_budget:ratio{sloth_id="k8s-apiserver-requests-availability", sloth_service="k8s-apiserver", sloth_slo="requests-availability"}
    labels:
      category: availability
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-availability
      sloth_service: k8s-apiserver
      sloth_slo: requests-availability
  # Remaining error budget for the period: one minus the period burn rate.
  - record: slo:period_error_budget_remaining:ratio
    expr: 1 - slo:period_burn_rate:ratio{sloth_id="k8s-apiserver-requests-availability",
      sloth_service="k8s-apiserver", sloth_slo="requests-availability"}
    labels:
      category: availability
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-availability
      sloth_service: k8s-apiserver
      sloth_slo: requests-availability
  # Constant series (value 1) whose labels describe the SLO and the generator
  # (mode, objective, spec, version).
  - record: sloth_slo_info
    expr: vector(1)
    labels:
      category: availability
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-availability
      sloth_mode: cli-gen-prom
      sloth_objective: "99.9"
      sloth_service: k8s-apiserver
      sloth_slo: requests-availability
      sloth_spec: prometheus/v1
      sloth_version: dev
# Burn-rate alerts for the `requests-availability` SLO. Both rules share the
# alert name and differ in their `severity` / `sloth_severity` labels.
# Each condition requires a short-window AND a long-window error ratio to
# exceed the same threshold, written as `factor * 0.0009999999999999432`.
# NOTE(review): 0.0009999999999999432 looks like the 0.1% error budget of the
# 99.9 objective with floating-point rounding -- confirm against the generator.
- name: sloth-slo-alerts-k8s-apiserver-requests-availability
  rules:
  # Page alert: factor 14.4 on the 5m and 1h windows, or factor 6 on the 30m
  # and 6h windows.
  - alert: K8sApiserverAvailabilityAlert
    expr: |
      (
          (slo:sli_error:ratio_rate5m{sloth_id="k8s-apiserver-requests-availability", sloth_service="k8s-apiserver", sloth_slo="requests-availability"} > (14.4 * 0.0009999999999999432))
          and ignoring (sloth_window)
          (slo:sli_error:ratio_rate1h{sloth_id="k8s-apiserver-requests-availability", sloth_service="k8s-apiserver", sloth_slo="requests-availability"} > (14.4 * 0.0009999999999999432))
      )
      or ignoring (sloth_window)
      (
          (slo:sli_error:ratio_rate30m{sloth_id="k8s-apiserver-requests-availability", sloth_service="k8s-apiserver", sloth_slo="requests-availability"} > (6 * 0.0009999999999999432))
          and ignoring (sloth_window)
          (slo:sli_error:ratio_rate6h{sloth_id="k8s-apiserver-requests-availability", sloth_service="k8s-apiserver", sloth_slo="requests-availability"} > (6 * 0.0009999999999999432))
      )
    labels:
      category: availability
      severity: critical
      sloth_severity: page
    annotations:
      runbook: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
      summary: '{{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget burn
        rate is over expected.'
      title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget
        burn rate is too fast.
  # Ticket alert: factor 3 on the 2h and 1d windows, or factor 1 on the 6h
  # and 3d windows.
  - alert: K8sApiserverAvailabilityAlert
    expr: |
      (
          (slo:sli_error:ratio_rate2h{sloth_id="k8s-apiserver-requests-availability", sloth_service="k8s-apiserver", sloth_slo="requests-availability"} > (3 * 0.0009999999999999432))
          and ignoring (sloth_window)
          (slo:sli_error:ratio_rate1d{sloth_id="k8s-apiserver-requests-availability", sloth_service="k8s-apiserver", sloth_slo="requests-availability"} > (3 * 0.0009999999999999432))
      )
      or ignoring (sloth_window)
      (
          (slo:sli_error:ratio_rate6h{sloth_id="k8s-apiserver-requests-availability", sloth_service="k8s-apiserver", sloth_slo="requests-availability"} > (1 * 0.0009999999999999432))
          and ignoring (sloth_window)
          (slo:sli_error:ratio_rate3d{sloth_id="k8s-apiserver-requests-availability", sloth_service="k8s-apiserver", sloth_slo="requests-availability"} > (1 * 0.0009999999999999432))
      )
    labels:
      category: availability
      severity: warning
      sloth_severity: ticket
    annotations:
      runbook: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
      summary: '{{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget burn
        rate is over expected.'
      title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget
        burn rate is too fast.
# SLI recording rules for the `requests-latency` SLO of `k8s-apiserver`.
# Every windowed rule records the same error ratio: non-WATCH requests that are
# NOT counted in the le="0.4" duration bucket, divided by all non-WATCH
# requests. The rules differ only in the range window, which is also exposed in
# the `sloth_window` label.
- name: sloth-slo-sli-recordings-k8s-apiserver-requests-latency
  rules:
  # Error ratio over the last 5 minutes.
  - record: slo:sli_error:ratio_rate5m
    expr: |
      ((
        sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[5m]))
        -
        sum(rate(apiserver_request_duration_seconds_bucket{le="0.4",verb!="WATCH"}[5m]))
      )
      )
      /
      (sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[5m])))
    labels:
      category: latency
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-latency
      sloth_service: k8s-apiserver
      sloth_slo: requests-latency
      sloth_window: 5m
  # Error ratio over the last 30 minutes.
  - record: slo:sli_error:ratio_rate30m
    expr: |
      ((
        sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[30m]))
        -
        sum(rate(apiserver_request_duration_seconds_bucket{le="0.4",verb!="WATCH"}[30m]))
      )
      )
      /
      (sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[30m])))
    labels:
      category: latency
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-latency
      sloth_service: k8s-apiserver
      sloth_slo: requests-latency
      sloth_window: 30m
  # Error ratio over the last hour.
  - record: slo:sli_error:ratio_rate1h
    expr: |
      ((
        sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[1h]))
        -
        sum(rate(apiserver_request_duration_seconds_bucket{le="0.4",verb!="WATCH"}[1h]))
      )
      )
      /
      (sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[1h])))
    labels:
      category: latency
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-latency
      sloth_service: k8s-apiserver
      sloth_slo: requests-latency
      sloth_window: 1h
  # Error ratio over the last 2 hours.
  - record: slo:sli_error:ratio_rate2h
    expr: |
      ((
        sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[2h]))
        -
        sum(rate(apiserver_request_duration_seconds_bucket{le="0.4",verb!="WATCH"}[2h]))
      )
      )
      /
      (sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[2h])))
    labels:
      category: latency
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-latency
      sloth_service: k8s-apiserver
      sloth_slo: requests-latency
      sloth_window: 2h
  # Error ratio over the last 6 hours.
  - record: slo:sli_error:ratio_rate6h
    expr: |
      ((
        sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[6h]))
        -
        sum(rate(apiserver_request_duration_seconds_bucket{le="0.4",verb!="WATCH"}[6h]))
      )
      )
      /
      (sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[6h])))
    labels:
      category: latency
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-latency
      sloth_service: k8s-apiserver
      sloth_slo: requests-latency
      sloth_window: 6h
  # Error ratio over the last day.
  - record: slo:sli_error:ratio_rate1d
    expr: |
      ((
        sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[1d]))
        -
        sum(rate(apiserver_request_duration_seconds_bucket{le="0.4",verb!="WATCH"}[1d]))
      )
      )
      /
      (sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[1d])))
    labels:
      category: latency
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-latency
      sloth_service: k8s-apiserver
      sloth_slo: requests-latency
      sloth_window: 1d
  # Error ratio over the last 3 days.
  - record: slo:sli_error:ratio_rate3d
    expr: |
      ((
        sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[3d]))
        -
        sum(rate(apiserver_request_duration_seconds_bucket{le="0.4",verb!="WATCH"}[3d]))
      )
      )
      /
      (sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[3d])))
    labels:
      category: latency
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-latency
      sloth_service: k8s-apiserver
      sloth_slo: requests-latency
      sloth_window: 3d
  # 30d ratio: mean of the recorded 5m ratio samples over 30 days
  # (sum_over_time / count_over_time) instead of a rate over the raw requests.
  # Only `sloth_window` is set explicitly on this rule.
  # NOTE(review): the remaining labels presumably carry over from the
  # ratio_rate5m series -- confirm.
  - record: slo:sli_error:ratio_rate30d
    expr: |
      sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="k8s-apiserver-requests-latency", sloth_service="k8s-apiserver", sloth_slo="requests-latency"}[30d])
      / ignoring (sloth_window)
      count_over_time(slo:sli_error:ratio_rate5m{sloth_id="k8s-apiserver-requests-latency", sloth_service="k8s-apiserver", sloth_slo="requests-latency"}[30d])
    labels:
      sloth_window: 30d
# Metadata recording rules for the `requests-latency` SLO: constants
# (objective, error budget, period) and burn rates derived from the SLI ratios.
- name: sloth-slo-meta-recordings-k8s-apiserver-requests-latency
  rules:
  # Objective as a ratio (0.99, matching `sloth_objective: "99"` below).
  - record: slo:objective:ratio
    expr: vector(0.99)
    labels:
      category: latency
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-latency
      sloth_service: k8s-apiserver
      sloth_slo: requests-latency
  # Error budget: one minus the objective ratio recorded above.
  - record: slo:error_budget:ratio
    expr: vector(1-0.99)
    labels:
      category: latency
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-latency
      sloth_service: k8s-apiserver
      sloth_slo: requests-latency
  # SLO period length in days (constant 30).
  - record: slo:time_period:days
    expr: vector(30)
    labels:
      category: latency
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-latency
      sloth_service: k8s-apiserver
      sloth_slo: requests-latency
  # Current burn rate: the 5m error ratio divided by the error budget.
  - record: slo:current_burn_rate:ratio
    expr: |
      slo:sli_error:ratio_rate5m{sloth_id="k8s-apiserver-requests-latency", sloth_service="k8s-apiserver", sloth_slo="requests-latency"}
      / on(sloth_id, sloth_slo, sloth_service) group_left
      slo:error_budget:ratio{sloth_id="k8s-apiserver-requests-latency", sloth_service="k8s-apiserver", sloth_slo="requests-latency"}
    labels:
      category: latency
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-latency
      sloth_service: k8s-apiserver
      sloth_slo: requests-latency
  # Period burn rate: the 30d error ratio divided by the error budget.
  - record: slo:period_burn_rate:ratio
    expr: |
      slo:sli_error:ratio_rate30d{sloth_id="k8s-apiserver-requests-latency", sloth_service="k8s-apiserver", sloth_slo="requests-latency"}
      / on(sloth_id, sloth_slo, sloth_service) group_left
      slo:error_budget:ratio{sloth_id="k8s-apiserver-requests-latency", sloth_service="k8s-apiserver", sloth_slo="requests-latency"}
    labels:
      category: latency
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-latency
      sloth_service: k8s-apiserver
      sloth_slo: requests-latency
  # Remaining error budget for the period: one minus the period burn rate.
  - record: slo:period_error_budget_remaining:ratio
    expr: 1 - slo:period_burn_rate:ratio{sloth_id="k8s-apiserver-requests-latency",
      sloth_service="k8s-apiserver", sloth_slo="requests-latency"}
    labels:
      category: latency
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-latency
      sloth_service: k8s-apiserver
      sloth_slo: requests-latency
  # Constant series (value 1) whose labels describe the SLO and the generator
  # (mode, objective, spec, version).
  - record: sloth_slo_info
    expr: vector(1)
    labels:
      category: latency
      cluster: valhalla
      cmd: examplesgen.sh
      component: kubernetes
      sloth_id: k8s-apiserver-requests-latency
      sloth_mode: cli-gen-prom
      sloth_objective: "99"
      sloth_service: k8s-apiserver
      sloth_slo: requests-latency
      sloth_spec: prometheus/v1
      sloth_version: dev
# Burn-rate alerts for the `requests-latency` SLO. Both rules share the alert
# name and differ in their `severity` / `sloth_severity` labels.
# Each condition requires a short-window AND a long-window error ratio to
# exceed the same threshold, written as `factor * 0.01`.
# NOTE(review): 0.01 presumably is the error budget of the 99 objective
# (1 - 0.99) -- confirm against the generator.
- name: sloth-slo-alerts-k8s-apiserver-requests-latency
  rules:
  # Page alert: factor 14.4 on the 5m and 1h windows, or factor 6 on the 30m
  # and 6h windows.
  - alert: K8sApiserverLatencyAlert
    expr: |
      (
          (slo:sli_error:ratio_rate5m{sloth_id="k8s-apiserver-requests-latency", sloth_service="k8s-apiserver", sloth_slo="requests-latency"} > (14.4 * 0.01))
          and ignoring (sloth_window)
          (slo:sli_error:ratio_rate1h{sloth_id="k8s-apiserver-requests-latency", sloth_service="k8s-apiserver", sloth_slo="requests-latency"} > (14.4 * 0.01))
      )
      or ignoring (sloth_window)
      (
          (slo:sli_error:ratio_rate30m{sloth_id="k8s-apiserver-requests-latency", sloth_service="k8s-apiserver", sloth_slo="requests-latency"} > (6 * 0.01))
          and ignoring (sloth_window)
          (slo:sli_error:ratio_rate6h{sloth_id="k8s-apiserver-requests-latency", sloth_service="k8s-apiserver", sloth_slo="requests-latency"} > (6 * 0.01))
      )
    labels:
      category: latency
      severity: critical
      sloth_severity: page
    annotations:
      runbook: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
      summary: '{{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget burn
        rate is over expected.'
      title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget
        burn rate is too fast.
  # Ticket alert: factor 3 on the 2h and 1d windows, or factor 1 on the 6h
  # and 3d windows.
  - alert: K8sApiserverLatencyAlert
    expr: |
      (
          (slo:sli_error:ratio_rate2h{sloth_id="k8s-apiserver-requests-latency", sloth_service="k8s-apiserver", sloth_slo="requests-latency"} > (3 * 0.01))
          and ignoring (sloth_window)
          (slo:sli_error:ratio_rate1d{sloth_id="k8s-apiserver-requests-latency", sloth_service="k8s-apiserver", sloth_slo="requests-latency"} > (3 * 0.01))
      )
      or ignoring (sloth_window)
      (
          (slo:sli_error:ratio_rate6h{sloth_id="k8s-apiserver-requests-latency", sloth_service="k8s-apiserver", sloth_slo="requests-latency"} > (1 * 0.01))
          and ignoring (sloth_window)
          (slo:sli_error:ratio_rate3d{sloth_id="k8s-apiserver-requests-latency", sloth_service="k8s-apiserver", sloth_slo="requests-latency"} > (1 * 0.01))
      )
    labels:
      category: latency
      severity: warning
      sloth_severity: ticket
    annotations:
      runbook: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
      summary: '{{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget burn
        rate is over expected.'
      title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget
        burn rate is too fast.
# Back to top