SLO plugin

# Spec format version and the service the SLOs in this file belong to.
version: "prometheus/v1"
service: "myservice"
# Spec-level labels; the generated rules further down carry the same
# owner/repo/tier values.
labels:
  owner: "myteam"
  repo: "myorg/myservice"
  tier: "2"
# Plugin chain declared at the spec level (outside any single SLO).
# Each debug plugin entry logs its `msg`. The numbers in the messages match
# the order in which they appear in the debug log at the end of this example
# ("Plugin 0" first, "Plugin 99" last), which is ascending `priority` across
# this chain and the SLO-level chain together.
slo_plugins:
  chain:
    # Highest priority value in the example: logged last.
    - id: "sloth.dev/core/debug/v1"
      priority: 9999999
      config: {msg: "Plugin 99"}
    # Lowest priority value in the example: logged first.
    - id: "sloth.dev/core/debug/v1"
      priority: -999999
      config: {msg: "Plugin 0"}

slos:
  # We allow failing (5xx and 429) 1 request every 1000 requests (99.9%).
  - name: "requests-availability"
    objective: 99.9
    description: "Common SLO based on availability for HTTP request responses."
    # SLO-level plugin chain. The entries are listed out of priority order;
    # the debug log at the end of this example shows them running sorted by
    # ascending `priority` ("Plugin 1" .. "Plugin 5"), in between the two
    # spec-level entries ("Plugin 0" and "Plugin 99").
    plugins:
      chain:
        - id: "sloth.dev/core/debug/v1"
          priority: 1050
          config: {msg: "Plugin 5"}
        - id: "sloth.dev/core/debug/v1"
          priority: -1000
          config: {msg: "Plugin 1"}
        - id: "sloth.dev/core/debug/v1"
          priority: 1000
          config: {msg: "Plugin 4"}
        - id: "sloth.dev/core/debug/v1"
          priority: -200
          config: {msg: "Plugin 2"}
        # No explicit priority: logged between -200 and 1000, so the default
        # is presumably 0 -- TODO confirm against the Sloth plugin docs.
        - id: "sloth.dev/core/debug/v1"
          config: {msg: "Plugin 3"}

    sli:
      events:
        # `{{.window}}` is a template placeholder; the generated recording
        # rules contain these queries with it replaced by 5m, 30m, 1h, ...
        # error_query counts requests with a 5xx or 429 code, total_query
        # counts all requests of the job.
        error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
        total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
    alerting:
      # Alert name used for both the page and the ticket alert rules.
      name: MyServiceHighErrorRate
      # Labels added to both generated alerts.
      labels:
        category: "availability"
      annotations:
        # Overwrite default Sloth SLO alert summary on ticket and page alerts.
        summary: "High error rate on 'myservice' requests responses"
      # Extra labels that end up only on the page alert.
      page_alert:
        labels:
          severity: pageteam
          routing_key: myteam
      # Extra labels that end up only on the ticket alert.
      ticket_alert:
        labels:
          severity: "slack"
          slack_channel: "#alerts-myteam"
---
# Code generated by Sloth (dev): https://github.com/slok/sloth.
# DO NOT EDIT.

groups:
- name: sloth-slo-sli-recordings-myservice-requests-availability
  rules:
  - record: slo:sli_error:ratio_rate5m
    expr: |
      (sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[5m])))
      /
      (sum(rate(http_request_duration_seconds_count{job="myservice"}[5m])))
    labels:
      cmd: examplesgen.sh
      owner: myteam
      repo: myorg/myservice
      sloth_id: myservice-requests-availability
      sloth_service: myservice
      sloth_slo: requests-availability
      sloth_window: 5m
      tier: "2"
  - record: slo:sli_error:ratio_rate30m
    expr: |
      (sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[30m])))
      /
      (sum(rate(http_request_duration_seconds_count{job="myservice"}[30m])))
    labels:
      cmd: examplesgen.sh
      owner: myteam
      repo: myorg/myservice
      sloth_id: myservice-requests-availability
      sloth_service: myservice
      sloth_slo: requests-availability
      sloth_window: 30m
      tier: "2"
  - record: slo:sli_error:ratio_rate1h
    expr: |
      (sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[1h])))
      /
      (sum(rate(http_request_duration_seconds_count{job="myservice"}[1h])))
    labels:
      cmd: examplesgen.sh
      owner: myteam
      repo: myorg/myservice
      sloth_id: myservice-requests-availability
      sloth_service: myservice
      sloth_slo: requests-availability
      sloth_window: 1h
      tier: "2"
  - record: slo:sli_error:ratio_rate2h
    expr: |
      (sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[2h])))
      /
      (sum(rate(http_request_duration_seconds_count{job="myservice"}[2h])))
    labels:
      cmd: examplesgen.sh
      owner: myteam
      repo: myorg/myservice
      sloth_id: myservice-requests-availability
      sloth_service: myservice
      sloth_slo: requests-availability
      sloth_window: 2h
      tier: "2"
  - record: slo:sli_error:ratio_rate6h
    expr: |
      (sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[6h])))
      /
      (sum(rate(http_request_duration_seconds_count{job="myservice"}[6h])))
    labels:
      cmd: examplesgen.sh
      owner: myteam
      repo: myorg/myservice
      sloth_id: myservice-requests-availability
      sloth_service: myservice
      sloth_slo: requests-availability
      sloth_window: 6h
      tier: "2"
  - record: slo:sli_error:ratio_rate1d
    expr: |
      (sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[1d])))
      /
      (sum(rate(http_request_duration_seconds_count{job="myservice"}[1d])))
    labels:
      cmd: examplesgen.sh
      owner: myteam
      repo: myorg/myservice
      sloth_id: myservice-requests-availability
      sloth_service: myservice
      sloth_slo: requests-availability
      sloth_window: 1d
      tier: "2"
  - record: slo:sli_error:ratio_rate3d
    expr: |
      (sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[3d])))
      /
      (sum(rate(http_request_duration_seconds_count{job="myservice"}[3d])))
    labels:
      cmd: examplesgen.sh
      owner: myteam
      repo: myorg/myservice
      sloth_id: myservice-requests-availability
      sloth_service: myservice
      sloth_slo: requests-availability
      sloth_window: 3d
      tier: "2"
  - record: slo:sli_error:ratio_rate30d
    expr: |
      sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}[30d])
      / ignoring (sloth_window)
      count_over_time(slo:sli_error:ratio_rate5m{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}[30d])
    labels:
      cmd: examplesgen.sh
      owner: myteam
      repo: myorg/myservice
      sloth_id: myservice-requests-availability
      sloth_service: myservice
      sloth_slo: requests-availability
      sloth_window: 30d
      tier: "2"
- name: sloth-slo-meta-recordings-myservice-requests-availability
  rules:
  - record: slo:objective:ratio
    expr: vector(0.9990000000000001)
    labels:
      cmd: examplesgen.sh
      owner: myteam
      repo: myorg/myservice
      sloth_id: myservice-requests-availability
      sloth_service: myservice
      sloth_slo: requests-availability
      tier: "2"
  - record: slo:error_budget:ratio
    expr: vector(1-0.9990000000000001)
    labels:
      cmd: examplesgen.sh
      owner: myteam
      repo: myorg/myservice
      sloth_id: myservice-requests-availability
      sloth_service: myservice
      sloth_slo: requests-availability
      tier: "2"
  - record: slo:time_period:days
    expr: vector(30)
    labels:
      cmd: examplesgen.sh
      owner: myteam
      repo: myorg/myservice
      sloth_id: myservice-requests-availability
      sloth_service: myservice
      sloth_slo: requests-availability
      tier: "2"
  - record: slo:current_burn_rate:ratio
    expr: |
      slo:sli_error:ratio_rate5m{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}
      / on(sloth_id, sloth_slo, sloth_service) group_left
      slo:error_budget:ratio{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}
    labels:
      cmd: examplesgen.sh
      owner: myteam
      repo: myorg/myservice
      sloth_id: myservice-requests-availability
      sloth_service: myservice
      sloth_slo: requests-availability
      tier: "2"
  - record: slo:period_burn_rate:ratio
    expr: |
      slo:sli_error:ratio_rate30d{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}
      / on(sloth_id, sloth_slo, sloth_service) group_left
      slo:error_budget:ratio{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}
    labels:
      cmd: examplesgen.sh
      owner: myteam
      repo: myorg/myservice
      sloth_id: myservice-requests-availability
      sloth_service: myservice
      sloth_slo: requests-availability
      tier: "2"
  - record: slo:period_error_budget_remaining:ratio
    expr: 1 - slo:period_burn_rate:ratio{sloth_id="myservice-requests-availability",
      sloth_service="myservice", sloth_slo="requests-availability"}
    labels:
      cmd: examplesgen.sh
      owner: myteam
      repo: myorg/myservice
      sloth_id: myservice-requests-availability
      sloth_service: myservice
      sloth_slo: requests-availability
      tier: "2"
  - record: sloth_slo_info
    expr: vector(1)
    labels:
      cmd: examplesgen.sh
      owner: myteam
      repo: myorg/myservice
      sloth_id: myservice-requests-availability
      sloth_mode: cli-gen-prom
      sloth_objective: "99.9"
      sloth_service: myservice
      sloth_slo: requests-availability
      sloth_spec: prometheus/v1
      sloth_version: dev
      tier: "2"
- name: sloth-slo-alerts-myservice-requests-availability
  rules:
  - alert: MyServiceHighErrorRate
    expr: |
      (
          max(slo:sli_error:ratio_rate5m{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (14.4 * 0.0009999999999999432)) without (sloth_window)
          and
          max(slo:sli_error:ratio_rate1h{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (14.4 * 0.0009999999999999432)) without (sloth_window)
      )
      or
      (
          max(slo:sli_error:ratio_rate30m{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (6 * 0.0009999999999999432)) without (sloth_window)
          and
          max(slo:sli_error:ratio_rate6h{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (6 * 0.0009999999999999432)) without (sloth_window)
      )
    labels:
      category: availability
      routing_key: myteam
      severity: pageteam
      sloth_severity: page
    annotations:
      summary: High error rate on 'myservice' requests responses
      title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget
        burn rate is too fast.
  - alert: MyServiceHighErrorRate
    expr: |
      (
          max(slo:sli_error:ratio_rate2h{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (3 * 0.0009999999999999432)) without (sloth_window)
          and
          max(slo:sli_error:ratio_rate1d{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (3 * 0.0009999999999999432)) without (sloth_window)
      )
      or
      (
          max(slo:sli_error:ratio_rate6h{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (1 * 0.0009999999999999432)) without (sloth_window)
          and
          max(slo:sli_error:ratio_rate3d{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (1 * 0.0009999999999999432)) without (sloth_window)
      )
    labels:
      category: availability
      severity: slack
      slack_channel: '#alerts-myteam'
      sloth_severity: ticket
    annotations:
      summary: High error rate on 'myservice' requests responses
      title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget
        burn rate is too fast.
DEBU[0000] Debug level is enabled                        version=dev
DEBU[0000] SLO plugin discovered and loaded              slo-plugin-id=sloth.dev/contrib/grouped_slo/v1 version=dev window=30d
DEBU[0000] SLO plugin discovered and loaded              slo-plugin-id=sloth.dev/contrib/info_labels/v1 version=dev window=30d
DEBU[0000] SLO plugin discovered and loaded              slo-plugin-id=sloth.dev/contrib/rule_intervals/v1 version=dev window=30d
DEBU[0000] SLO plugin discovered and loaded              slo-plugin-id=github.com/slok/sloth-test-slo-plugins/spec_as_labels/v1 version=dev window=30d
DEBU[0000] SLO plugin discovered and loaded              slo-plugin-id=sloth.dev/core/alert_rules/v1 version=dev window=30d
DEBU[0000] SLO plugin discovered and loaded              slo-plugin-id=sloth.dev/core/debug/v1 version=dev window=30d
DEBU[0000] SLO plugin discovered and loaded              slo-plugin-id=sloth.dev/core/metadata_rules/v1 version=dev window=30d
DEBU[0000] SLO plugin discovered and loaded              slo-plugin-id=sloth.dev/core/noop/v1 version=dev window=30d
DEBU[0000] SLO plugin discovered and loaded              slo-plugin-id=sloth.dev/core/sli_rules/v1 version=dev window=30d
DEBU[0000] SLO plugin discovered and loaded              slo-plugin-id=sloth.dev/core/validate/v1 version=dev window=30d
INFO[0000] Plugins loaded                                sli-plugins=0 slo-plugins=10 version=dev window=30d
INFO[0000] SLO period windows loaded                     svc=alert.WindowsRepo version=dev window=30d windows=2
INFO[0000] Generating from Kubernetes Prometheus spec    version=dev window=30d
DEBU[0000] Multiwindow-multiburn alerts generated        out=- slo=myservice-requests-availability svc=generate.prometheus.Service version=dev window=30d
DEBU[0000] SLO plugin discovered and loaded              slo-plugin-id=sloth.dev/core/alert_rules/v1 version=dev window=30d
DEBU[0000] SLO plugin discovered and loaded              slo-plugin-id=sloth.dev/core/debug/v1 version=dev window=30d
DEBU[0000] SLO plugin discovered and loaded              slo-plugin-id=sloth.dev/core/metadata_rules/v1 version=dev window=30d
DEBU[0000] SLO plugin discovered and loaded              slo-plugin-id=sloth.dev/core/noop/v1 version=dev window=30d
DEBU[0000] SLO plugin discovered and loaded              slo-plugin-id=sloth.dev/core/sli_rules/v1 version=dev window=30d
DEBU[0000] SLO plugin discovered and loaded              slo-plugin-id=sloth.dev/core/validate/v1 version=dev window=30d
INFO[0000] Plugins loaded                                sli-plugins=0 slo-plugins=10 version=dev window=30d
INFO[0000] SLO period windows loaded                     svc=alert.WindowsRepo version=dev window=30d windows=2
INFO[0000] Generating from Prometheus spec               version=dev window=30d
DEBU[0000] Multiwindow-multiburn alerts generated        out=- slo=myservice-requests-availability svc=generate.prometheus.Service version=dev window=30d
DEBU[0000] Plugin 0                                      out=- plugin=sloth.dev/core/debug/v1 slo=myservice-requests-availability svc=generate.prometheus.Service version=dev window=30d
DEBU[0000] Plugin 1                                      out=- plugin=sloth.dev/core/debug/v1 slo=myservice-requests-availability svc=generate.prometheus.Service version=dev window=30d
DEBU[0000] Plugin 2                                      out=- plugin=sloth.dev/core/debug/v1 slo=myservice-requests-availability svc=generate.prometheus.Service version=dev window=30d
DEBU[0000] Plugin 3                                      out=- plugin=sloth.dev/core/debug/v1 slo=myservice-requests-availability svc=generate.prometheus.Service version=dev window=30d
DEBU[0000] Plugin 4                                      out=- plugin=sloth.dev/core/debug/v1 slo=myservice-requests-availability svc=generate.prometheus.Service version=dev window=30d
DEBU[0000] Plugin 5                                      out=- plugin=sloth.dev/core/debug/v1 slo=myservice-requests-availability svc=generate.prometheus.Service version=dev window=30d
DEBU[0000] Plugin 99                                     out=- plugin=sloth.dev/core/debug/v1 slo=myservice-requests-availability svc=generate.prometheus.Service version=dev window=30d
INFO[0000] Prometheus rules written                      groups=3 out=- svc=storageio.StdPrometheusGroupedRulesYAMLRepo version=dev window=30d