SLI plugin

SLO spec Generated SLI Plugin

# This example shows the same example as home-wifi.yml but using Sloth Kubernetes CRD.
# It will generate the Prometheus rules in a Kubernetes prometheus-operator PrometheusRules CRD.
#
# `sloth generate -i ./examples/plugin-k8s-home-wifi.yml -p ./examples`
#
apiVersion: sloth.slok.dev/v1
kind: PrometheusServiceLevel
metadata:
  name: sloth-slo-home-wifi
  namespace: monitoring
  labels:
    prometheus: prometheus
    role: alert-rules
    app: sloth
spec:
  service: "myservice"
  labels:
    owner: "myteam"
    repo: "myorg/myservice"
    tier: "2"
  slos:
    # We allow failing (5xx and 429) 1 request every 1000 requests (99.9%).
    - name: "requests-availability"
      objective: 99.9
      description: "Common SLO based on availability for HTTP request responses."
      sli:
        plugin:
          id: "getting_started_availability"
          options:
            job: "myservice"
            filter: 'f1="v1",f2="v2"'
      alerting:
        name: MyServiceHighErrorRate
        labels:
          category: "availability"
        annotations:
          # Overwrite default Sloth SLO alert summary on ticket and page alerts.
          summary: "High error rate on 'myservice' requests responses"
        page_alert:
          labels:
            severity: pageteam
            routing_key: myteam
        ticket_alert:
          labels:
            severity: "slack"
            slack_channel: "#alerts-myteam"

---
# Code generated by Sloth (dev): https://github.com/slok/sloth.
# DO NOT EDIT.

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  creationTimestamp: null
  labels:
    app: sloth
    app.kubernetes.io/component: SLO
    app.kubernetes.io/managed-by: sloth
    prometheus: prometheus
    role: alert-rules
  name: sloth-slo-home-wifi
  namespace: monitoring
spec:
  groups:
  - name: sloth-slo-sli-recordings-myservice-requests-availability
    rules:
    - expr: |-
        (
        sum(rate(http_request_duration_seconds_count{ f1="v1",f2="v2",job="myservice",code=~"(5..|429)" }[5m]))
        /
        sum(rate(http_request_duration_seconds_count{ f1="v1",f2="v2",job="myservice" }[5m])))
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_window: 5m
        tier: "2"
      record: slo:sli_error:ratio_rate5m
    - expr: |-
        (
        sum(rate(http_request_duration_seconds_count{ f1="v1",f2="v2",job="myservice",code=~"(5..|429)" }[30m]))
        /
        sum(rate(http_request_duration_seconds_count{ f1="v1",f2="v2",job="myservice" }[30m])))
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_window: 30m
        tier: "2"
      record: slo:sli_error:ratio_rate30m
    - expr: |-
        (
        sum(rate(http_request_duration_seconds_count{ f1="v1",f2="v2",job="myservice",code=~"(5..|429)" }[1h]))
        /
        sum(rate(http_request_duration_seconds_count{ f1="v1",f2="v2",job="myservice" }[1h])))
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_window: 1h
        tier: "2"
      record: slo:sli_error:ratio_rate1h
    - expr: |-
        (
        sum(rate(http_request_duration_seconds_count{ f1="v1",f2="v2",job="myservice",code=~"(5..|429)" }[2h]))
        /
        sum(rate(http_request_duration_seconds_count{ f1="v1",f2="v2",job="myservice" }[2h])))
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_window: 2h
        tier: "2"
      record: slo:sli_error:ratio_rate2h
    - expr: |-
        (
        sum(rate(http_request_duration_seconds_count{ f1="v1",f2="v2",job="myservice",code=~"(5..|429)" }[6h]))
        /
        sum(rate(http_request_duration_seconds_count{ f1="v1",f2="v2",job="myservice" }[6h])))
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_window: 6h
        tier: "2"
      record: slo:sli_error:ratio_rate6h
    - expr: |-
        (
        sum(rate(http_request_duration_seconds_count{ f1="v1",f2="v2",job="myservice",code=~"(5..|429)" }[1d]))
        /
        sum(rate(http_request_duration_seconds_count{ f1="v1",f2="v2",job="myservice" }[1d])))
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_window: 1d
        tier: "2"
      record: slo:sli_error:ratio_rate1d
    - expr: |-
        (
        sum(rate(http_request_duration_seconds_count{ f1="v1",f2="v2",job="myservice",code=~"(5..|429)" }[3d]))
        /
        sum(rate(http_request_duration_seconds_count{ f1="v1",f2="v2",job="myservice" }[3d])))
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_window: 3d
        tier: "2"
      record: slo:sli_error:ratio_rate3d
    - expr: |
        sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}[30d])
        / ignoring (sloth_window)
        count_over_time(slo:sli_error:ratio_rate5m{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}[30d])
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_window: 30d
        tier: "2"
      record: slo:sli_error:ratio_rate30d
  - name: sloth-slo-meta-recordings-myservice-requests-availability
    rules:
    - expr: vector(0.9990000000000001)
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        tier: "2"
      record: slo:objective:ratio
    - expr: vector(1-0.9990000000000001)
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        tier: "2"
      record: slo:error_budget:ratio
    - expr: vector(30)
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        tier: "2"
      record: slo:time_period:days
    - expr: |
        slo:sli_error:ratio_rate5m{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}
        / on(sloth_id, sloth_slo, sloth_service) group_left
        slo:error_budget:ratio{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        tier: "2"
      record: slo:current_burn_rate:ratio
    - expr: |
        slo:sli_error:ratio_rate30d{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}
        / on(sloth_id, sloth_slo, sloth_service) group_left
        slo:error_budget:ratio{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"}
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        tier: "2"
      record: slo:period_burn_rate:ratio
    - expr: 1 - slo:period_burn_rate:ratio{sloth_id="myservice-requests-availability",
        sloth_service="myservice", sloth_slo="requests-availability"}
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_service: myservice
        sloth_slo: requests-availability
        tier: "2"
      record: slo:period_error_budget_remaining:ratio
    - expr: vector(1)
      labels:
        cmd: examplesgen.sh
        owner: myteam
        repo: myorg/myservice
        sloth_id: myservice-requests-availability
        sloth_mode: cli-gen-k8s
        sloth_objective: "99.9"
        sloth_service: myservice
        sloth_slo: requests-availability
        sloth_spec: sloth.slok.dev/v1
        sloth_version: dev
        tier: "2"
      record: sloth_slo_info
  - name: sloth-slo-alerts-myservice-requests-availability
    rules:
    - alert: MyServiceHighErrorRate
      annotations:
        summary: High error rate on 'myservice' requests responses
        title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget
          burn rate is too fast.
      expr: |
        (
            max(slo:sli_error:ratio_rate5m{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (14.4 * 0.0009999999999999432)) without (sloth_window)
            and
            max(slo:sli_error:ratio_rate1h{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (14.4 * 0.0009999999999999432)) without (sloth_window)
        )
        or
        (
            max(slo:sli_error:ratio_rate30m{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (6 * 0.0009999999999999432)) without (sloth_window)
            and
            max(slo:sli_error:ratio_rate6h{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (6 * 0.0009999999999999432)) without (sloth_window)
        )
      labels:
        category: availability
        sloth_severity: page
    - alert: MyServiceHighErrorRate
      annotations:
        summary: High error rate on 'myservice' requests responses
        title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error
          budget burn rate is too fast.
      expr: |
        (
            max(slo:sli_error:ratio_rate2h{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (3 * 0.0009999999999999432)) without (sloth_window)
            and
            max(slo:sli_error:ratio_rate1d{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (3 * 0.0009999999999999432)) without (sloth_window)
        )
        or
        (
            max(slo:sli_error:ratio_rate6h{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (1 * 0.0009999999999999432)) without (sloth_window)
            and
            max(slo:sli_error:ratio_rate3d{sloth_id="myservice-requests-availability", sloth_service="myservice", sloth_slo="requests-availability"} > (1 * 0.0009999999999999432)) without (sloth_window)
        )
      labels:
        category: availability
        sloth_severity: ticket

package availability

import (
    "bytes"
    "context"
    "fmt"
    "regexp"
    "strings"
    "text/template"
)

const (
    // SLIPluginVersion is the plugin version string exported by this package.
    // NOTE(review): presumably identifies the Sloth SLI plugin API this file
    // is written against — confirm with the Sloth plugin documentation.
    SLIPluginVersion = "prometheus/v1"
    // SLIPluginID is the identifier exported for this plugin. SLO specs select
    // the plugin by this value in their `sli.plugin.id` field.
    SLIPluginID      = "getting_started_availability"
)

// queryTpl renders the error ratio query: the rate of requests whose `code`
// matches 5xx or 429 divided by the rate of all requests, both restricted to
// the given job and to the optional extra filter.
//
// `.filter` is written verbatim right before the `job` matcher, so a non-empty
// filter has to end with a comma. The `{{"{{.window}}"}}` action prints the
// literal text `{{.window}}`, so the range window is still a placeholder in
// the rendered query.
// NOTE(review): presumably Sloth fills that placeholder once per window — confirm.
var queryTpl = template.Must(template.New("").Parse(`
sum(rate(http_request_duration_seconds_count{ {{.filter}}job="{{.job}}",code=~"(5..|429)" }[{{"{{.window}}"}}]))
/
sum(rate(http_request_duration_seconds_count{ {{.filter}}job="{{.job}}" }[{{"{{.window}}"}}]))`))

// filterRegex validates a sanitized filter: one or more Prometheus label
// matchers (`name="value"`, also with `!=`, `=~` or `!~`), each one followed
// by a comma, with optional whitespace around the tokens.
//
// The expression is anchored on both ends so the whole filter has to be made
// of matchers. An unanchored pattern accepts any string that merely contains
// one valid matcher, which lets the remaining text reach the query unchecked.
var filterRegex = regexp.MustCompile(`^(\s*[a-zA-Z_][a-zA-Z0-9_]*\s*(?:=~|!~|!=|=)\s*"(?:[^"\\]|\\.)*"\s*,)+$`)

// SLIPlugin is the getting started plugin example.
//
// It returns a Sloth error ratio raw query: the ratio of HTTP requests whose
// response status code is 5xx or 429 (the error events) over all HTTP requests.
//
// Options:
//   - "job" (required, non-empty): value used for the `job` matcher.
//   - "filter" (optional): extra label matchers; surrounding braces and
//     commas are stripped and a trailing comma is appended before validation.
//
// The "owner" and "tier" labels must be present and non-empty. ctx and meta
// are accepted but not used.
//
// An error is returned when the job option is missing or empty, when a
// required label is missing, when the filter does not pass validation, or
// when the query template cannot be executed.
func SLIPlugin(ctx context.Context, meta, labels, options map[string]string) (string, error) {
    // Get job. An empty value is treated like a missing one (the same rule
    // validateLabels applies to labels): it would render a `job=""` matcher.
    job := options["job"]
    if job == "" {
        return "", fmt.Errorf("job options is required")
    }

    // Validate labels.
    if err := validateLabels(labels, "owner", "tier"); err != nil {
        return "", fmt.Errorf("invalid labels: %w", err)
    }

    // Sanitize filter: the template places it right before the `job` matcher,
    // so it is normalized to "matcher,matcher,...," form.
    filter := options["filter"]
    if filter != "" {
        sanitized := strings.Trim(strings.Trim(filter, "{}"), ",") + ","
        if !filterRegex.MatchString(sanitized) {
            // Report the value as the user wrote it, not the normalized form.
            return "", fmt.Errorf("invalid prometheus filter: %s", filter)
        }
        filter = sanitized
    }

    // Create query.
    var b bytes.Buffer
    data := map[string]string{
        "job":    job,
        "filter": filter,
    }
    if err := queryTpl.Execute(&b, data); err != nil {
        return "", fmt.Errorf("could not execute template: %w", err)
    }

    return b.String(), nil
}

// validateLabels reports an error for the first required key that is either
// absent from labels or mapped to an empty string. It returns nil when every
// required key carries a non-empty value.
func validateLabels(labels map[string]string, requiredKeys ...string) error {
    for i := range requiredKeys {
        key := requiredKeys[i]
        if value, found := labels[key]; found && value != "" {
            continue
        }
        return fmt.Errorf("%q label is required", key)
    }

    return nil
}