# Getting started

# This is the same SLO as getting-started.yml, but declared as a Sloth Kubernetes CRD (PrometheusServiceLevel).
# Sloth generates the Prometheus rules from it as a prometheus-operator PrometheusRule CRD.
#
# `sloth generate -i ./examples/k8s-getting-started.yml`
#
# Sloth SLO definition for "myservice", declared as a Kubernetes custom resource.
apiVersion: sloth.slok.dev/v1
kind: PrometheusServiceLevel
metadata:
  name: sloth-slo-my-service
  namespace: monitoring
spec:
  service: 'myservice'
  # These labels show up on every recording rule in the generated output.
  labels:
    owner: 'myteam'
    repo: 'myorg/myservice'
    tier: '2'
  slos:
    - name: 'requests-availability'
      # Target percentage of good events.
      objective: 99.9
      description: 'Common SLO based on availability for HTTP request responses.'
      sli:
        # `{{.window}}` is a template placeholder; the generated rules replace it
        # with each concrete window (5m, 30m, 1h, ...).
        events:
          # Bad events: responses whose status code matches 5xx or 429.
          errorQuery: 'sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))'
          # All events: every response counted for the job.
          totalQuery: 'sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))'
      alerting:
        # Alert name, labels and annotations shared by the page and ticket alerts.
        name: 'MyServiceHighErrorRate'
        labels:
          category: 'availability'
        annotations:
          summary: "High error rate on 'myservice' requests responses"
        # Extra labels that only the page-severity alert receives.
        pageAlert:
          labels:
            severity: 'pageteam'
            routing_key: 'myteam'
        # Extra labels that only the ticket-severity alert receives.
        ticketAlert:
          labels:
            severity: 'slack'
            slack_channel: '#alerts-myteam'
---
# Code generated by Sloth (dev): https://github.com/slok/sloth.
# DO NOT EDIT.

# prometheus-operator PrometheusRule resource; all generated rule groups are
# listed under spec.groups.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  creationTimestamp: null
  labels:
    app.kubernetes.io/component: SLO
    app.kubernetes.io/managed-by: sloth
  # Same name and namespace as the PrometheusServiceLevel resource defined
  # earlier in this file.
  name: sloth-slo-my-service
  namespace: monitoring
spec:
  groups:
  # SLI recording rules: one `slo:sli_error:ratio_rate<window>` series per
  # window, each tagged with a matching `sloth_window` label.
  - name: sloth-slo-sli-recordings-myservice-requests-availability
    rules:
    # Windows 5m through 3d all share one shape: the rate of error responses
    # (code 5xx or 429) divided by the rate of all responses over that window.
    - expr: |
        (sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[5m])))
        /
        (sum(rate(http_request_duration_seconds_count{job="myservice"}[5m])))
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_window: 5m
        tier: "2"
      record: slo:sli_error:ratio_rate5m
    - expr: |
        (sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[30m])))
        /
        (sum(rate(http_request_duration_seconds_count{job="myservice"}[30m])))
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_window: 30m
        tier: "2"
      record: slo:sli_error:ratio_rate30m
    - expr: |
        (sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[1h])))
        /
        (sum(rate(http_request_duration_seconds_count{job="myservice"}[1h])))
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_window: 1h
        tier: "2"
      record: slo:sli_error:ratio_rate1h
    - expr: |
        (sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[2h])))
        /
        (sum(rate(http_request_duration_seconds_count{job="myservice"}[2h])))
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_window: 2h
        tier: "2"
      record: slo:sli_error:ratio_rate2h
    - expr: |
        (sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[6h])))
        /
        (sum(rate(http_request_duration_seconds_count{job="myservice"}[6h])))
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_window: 6h
        tier: "2"
      record: slo:sli_error:ratio_rate6h
    - expr: |
        (sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[1d])))
        /
        (sum(rate(http_request_duration_seconds_count{job="myservice"}[1d])))
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_window: 1d
        tier: "2"
      record: slo:sli_error:ratio_rate1d
    - expr: |
        (sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[3d])))
        /
        (sum(rate(http_request_duration_seconds_count{job="myservice"}[3d])))
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_window: 3d
        tier: "2"
      record: slo:sli_error:ratio_rate3d
    # The 30d window is not computed from the raw metric. It is the average of
    # the 5m recording above over 30 days (sum_over_time / count_over_time),
    # ignoring the differing `sloth_window` label when matching.
    - expr: |
        sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}[30d])
        / ignoring (sloth_window)
        count_over_time(slo:sli_error:ratio_rate5m{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}[30d])
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_window: 30d
        tier: "2"
      record: slo:sli_error:ratio_rate30d
  # Metadata recording rules: constants describing the SLO plus burn-rate and
  # remaining-budget series derived from the SLI recordings.
  - name: sloth-slo-meta-recordings-myservice-requests-availability
    rules:
    # SLO objective as a ratio. The value looks like 99.9 / 100 with
    # floating-point rounding.
    - expr: vector(0.9990000000000001)
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        tier: "2"
      record: slo:objective:ratio
    # Error budget as a ratio: 1 minus the objective ratio.
    - expr: vector(1-0.9990000000000001)
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        tier: "2"
      record: slo:error_budget:ratio
    # SLO time period in days; matches the 30d window of the period SLI recording.
    - expr: vector(30)
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        tier: "2"
      record: slo:time_period:days
    # Current burn rate: the 5m error ratio divided by the error budget.
    - expr: |
        slo:sli_error:ratio_rate5m{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}
        / on(sloth_id, sloth_slo, sloth_service) group_left
        slo:error_budget:ratio{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        tier: "2"
      record: slo:current_burn_rate:ratio
    # Period burn rate: the 30d error ratio divided by the error budget.
    - expr: |
        slo:sli_error:ratio_rate30d{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}
        / on(sloth_id, sloth_slo, sloth_service) group_left
        slo:error_budget:ratio{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        tier: "2"
      record: slo:period_burn_rate:ratio
    # Remaining error budget for the period: 1 minus the period burn rate.
    # The expression is a plain scalar folded over two lines; keep both lines
    # together.
    - expr: 1 - slo:period_burn_rate:ratio{sloth_id="myservice-requests-availability",
        sloth_service="myservice", sloth_slo="requests-availability"}
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        tier: "2"
      record: slo:period_error_budget_remaining:ratio
    # Info series: constant value 1 whose labels carry the SLO metadata
    # (generation mode, objective, spec and generator version).
    - expr: vector(1)
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_mode: cli-gen-k8s
        sloth_objective: "99.9"
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_spec: sloth.slok.dev/v1
        sloth_version: dev
        tier: "2"
      record: sloth_slo_info
  # Burn-rate alerts. Both alerts share the name MyServiceHighErrorRate and are
  # told apart by their `sloth_severity` label (page vs ticket).
  # The constant 0.0009999999999999432 in the expressions is ~0.001; it appears
  # to be the error budget (1 - objective) with floating-point rounding.
  - name: sloth-slo-alerts-myservice-requests-availability
    rules:
    # Page alert: fires when the 5m AND 1h error ratios both exceed 14.4x the
    # budget constant, or the 30m AND 6h error ratios both exceed 6x.
    - alert: MyServiceHighErrorRate
      annotations:
        summary: High error rate on 'myservice' requests responses
        title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget
          burn rate is too fast.
      expr: |
        (
            max(slo:sli_error:ratio_rate5m{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (14.4 * 0.0009999999999999432)) without (sloth_window)
            and
            max(slo:sli_error:ratio_rate1h{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (14.4 * 0.0009999999999999432)) without (sloth_window)
        )
        or
        (
            max(slo:sli_error:ratio_rate30m{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (6 * 0.0009999999999999432)) without (sloth_window)
            and
            max(slo:sli_error:ratio_rate6h{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (6 * 0.0009999999999999432)) without (sloth_window)
        )
      labels:
        category: availability
        routing_key: myteam
        severity: pageteam
        sloth_severity: page
    # Ticket alert: fires when the 2h AND 1d error ratios both exceed 3x the
    # budget constant, or the 6h AND 3d error ratios both exceed 1x.
    - alert: MyServiceHighErrorRate
      annotations:
        summary: High error rate on 'myservice' requests responses
        title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error
          budget burn rate is too fast.
      expr: |
        (
            max(slo:sli_error:ratio_rate2h{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (3 * 0.0009999999999999432)) without (sloth_window)
            and
            max(slo:sli_error:ratio_rate1d{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (3 * 0.0009999999999999432)) without (sloth_window)
        )
        or
        (
            max(slo:sli_error:ratio_rate6h{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (1 * 0.0009999999999999432)) without (sloth_window)
            and
            max(slo:sli_error:ratio_rate3d{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (1 * 0.0009999999999999432)) without (sloth_window)
        )
      labels:
        category: availability
        severity: slack
        slack_channel: '#alerts-myteam'
        sloth_severity: ticket