Commit 177f2d7

sradco and AI Assistant committed
k8s: add relabel config, relabeled rules, alerting health and AlertingRule CRD support
Add remaining k8s layer capabilities:
- AlertRelabelConfig CRUD operations for OpenShift AlertRelabelConfig CRs
- RelabeledRules: applies Prometheus relabel configs to derive effective alert rules with source/management labels
- AlertingHealth: route reachability checks for platform and user-workload Prometheus/Alertmanager endpoints
- AlertingRule CRUD operations for OpenShift AlertingRule CRs
- ExternalManagement label helpers

Also adds supporting leaf packages:
- pkg/managementlabels: management label constants and helpers
- pkg/alert_rule: alert rule ID generation and parsing

Signed-off-by: Shirly Radco <sradco@redhat.com>
Signed-off-by: João Vilaça <jvilaca@redhat.com>
Signed-off-by: Aviv Litman <alitman@redhat.com>
Co-authored-by: AI Assistant <noreply@cursor.com>
1 parent 21bd78a commit 177f2d7

9 files changed

Lines changed: 1037 additions & 2 deletions

pkg/alert_rule/alert_rule.go

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
package alertrule

import (
	"crypto/sha256"
	"encoding/base64"
	"fmt"
	"regexp"
	"sort"
	"strings"
	"unicode/utf8"

	"github.com/openshift/monitoring-plugin/pkg/managementlabels"
	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
)

var promLabelNameRegexp = regexp.MustCompile(`^[A-Za-z_][A-Za-z0-9_]*$`)

func GetAlertingRuleId(alertRule *monitoringv1.Rule) string {
	var name string
	var kind string
	if alertRule.Alert != "" {
		name = alertRule.Alert
		kind = "alert"
	} else if alertRule.Record != "" {
		name = alertRule.Record
		kind = "record"
	} else {
		return ""
	}

	expr := normalizeExpr(alertRule.Expr.String())
	forDuration := ""
	if alertRule.For != nil {
		forDuration = strings.TrimSpace(string(*alertRule.For))
	}

	labelsBlock := normalizedBusinessLabelsBlock(alertRule.Labels)

	// Canonical payload is intentionally derived from rule spec (expr/for/labels) and identity (kind/name),
	// and excludes annotations and openshift_io_* provenance/system labels.
	canonicalPayload := strings.Join([]string{kind, name, expr, forDuration, labelsBlock}, "\n---\n")

	// Generate SHA256 hash
	hash := sha256.Sum256([]byte(canonicalPayload))

	return "rid_" + base64.RawURLEncoding.EncodeToString(hash[:])
}

func normalizeExpr(expr string) string {
	// Collapse consecutive whitespace so cosmetic formatting changes do not churn ids.
	return strings.Join(strings.Fields(strings.TrimSpace(expr)), " ")
}

func normalizedBusinessLabelsBlock(in map[string]string) string {
	if len(in) == 0 {
		return ""
	}

	lines := make([]string, 0, len(in))
	for k, v := range in {
		key := strings.TrimSpace(k)
		if key == "" {
			continue
		}
		if strings.HasPrefix(key, "openshift_io_") || key == managementlabels.AlertNameLabel {
			// Skip system labels
			continue
		}
		if !promLabelNameRegexp.MatchString(strings.TrimSpace(key)) {
			continue
		}
		if v == "" {
			// Align with specHash behavior: drop empty values
			continue
		}
		if !utf8.ValidString(v) {
			continue
		}

		lines = append(lines, fmt.Sprintf("%s=%s", key, v))
	}

	sort.Strings(lines)
	return strings.Join(lines, "\n")
}
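For orientation, here is a minimal caller sketch. It assumes the module import path github.com/openshift/monitoring-plugin/pkg/alert_rule implied by the file location above; the rule values are made up for illustration and are not part of the commit.

package main

import (
	"fmt"

	alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule"
	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

func main() {
	forDuration := monitoringv1.Duration("10m")
	rule := monitoringv1.Rule{
		Alert: "HighErrorRate", // illustrative rule, not taken from the commit
		Expr:  intstr.FromString(`sum(rate(http_requests_total{code=~"5.."}[5m])) > 0.05`),
		For:   &forDuration,
		Labels: map[string]string{
			"severity": "warning",
		},
	}

	// Whitespace changes in Expr and label reordering do not change the id,
	// because the payload is normalized and sorted before hashing.
	fmt.Println(alertrule.GetAlertingRuleId(&rule)) // "rid_" + URL-safe base64 SHA-256
}

Recording rules hash the record name instead of the alert name, and a rule with neither field yields an empty id, matching the early return in GetAlertingRuleId.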

pkg/k8s/alert_relabel_config.go

Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
package k8s

import (
	"context"
	"fmt"

	osmv1 "github.com/openshift/api/monitoring/v1"
	osmv1client "github.com/openshift/client-go/monitoring/clientset/versioned"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/client-go/tools/cache"
)

type alertRelabelConfigManager struct {
	clientset   *osmv1client.Clientset
	arcInformer cache.SharedIndexInformer
}

func newAlertRelabelConfigManager(ctx context.Context, clientset *osmv1client.Clientset) (*alertRelabelConfigManager, error) {
	arcInformer := cache.NewSharedIndexInformer(
		alertRelabelConfigListWatchForAllNamespaces(clientset),
		&osmv1.AlertRelabelConfig{},
		0,
		cache.Indexers{},
	)

	arcm := &alertRelabelConfigManager{
		clientset:   clientset,
		arcInformer: arcInformer,
	}

	go arcm.arcInformer.Run(ctx.Done())

	if !cache.WaitForNamedCacheSync("AlertRelabelConfig informer", ctx.Done(), arcm.arcInformer.HasSynced) {
		return nil, fmt.Errorf("failed to sync AlertRelabelConfig informer")
	}

	return arcm, nil
}

func alertRelabelConfigListWatchForAllNamespaces(clientset *osmv1client.Clientset) *cache.ListWatch {
	return cache.NewListWatchFromClient(clientset.MonitoringV1().RESTClient(), "alertrelabelconfigs", "", fields.Everything())
}

func (arcm *alertRelabelConfigManager) List(ctx context.Context, namespace string) ([]osmv1.AlertRelabelConfig, error) {
	arcs := arcm.arcInformer.GetStore().List()

	alertRelabelConfigs := make([]osmv1.AlertRelabelConfig, 0, len(arcs))
	for _, item := range arcs {
		arc, ok := item.(*osmv1.AlertRelabelConfig)
		if !ok {
			continue
		}
		alertRelabelConfigs = append(alertRelabelConfigs, *arc)
	}

	return alertRelabelConfigs, nil
}

func (arcm *alertRelabelConfigManager) Get(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) {
	arc, err := arcm.clientset.MonitoringV1().AlertRelabelConfigs(namespace).Get(ctx, name, metav1.GetOptions{})
	if err != nil {
		if errors.IsNotFound(err) {
			return nil, false, nil
		}

		return nil, false, err
	}

	return arc, true, nil
}

func (arcm *alertRelabelConfigManager) Create(ctx context.Context, arc osmv1.AlertRelabelConfig) (*osmv1.AlertRelabelConfig, error) {
	created, err := arcm.clientset.MonitoringV1().AlertRelabelConfigs(arc.Namespace).Create(ctx, &arc, metav1.CreateOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to create AlertRelabelConfig %s/%s: %w", arc.Namespace, arc.Name, err)
	}

	return created, nil
}

func (arcm *alertRelabelConfigManager) Update(ctx context.Context, arc osmv1.AlertRelabelConfig) error {
	_, err := arcm.clientset.MonitoringV1().AlertRelabelConfigs(arc.Namespace).Update(ctx, &arc, metav1.UpdateOptions{})
	if err != nil {
		return fmt.Errorf("failed to update AlertRelabelConfig %s/%s: %w", arc.Namespace, arc.Name, err)
	}

	return nil
}

func (arcm *alertRelabelConfigManager) Delete(ctx context.Context, namespace string, name string) error {
	err := arcm.clientset.MonitoringV1().AlertRelabelConfigs(namespace).Delete(ctx, name, metav1.DeleteOptions{})
	if err != nil {
		return fmt.Errorf("failed to delete AlertRelabelConfig %s: %w", name, err)
	}

	return nil
}
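A sketch of how this manager might be wired and exercised from inside package k8s. The rest.Config plumbing, the resource name and namespace, and the relabel spec values are assumptions made for illustration; the spec field names follow my reading of the openshift/api monitoring/v1 types rather than anything shown in this commit.

package k8s

import (
	"context"
	"fmt"

	osmv1 "github.com/openshift/api/monitoring/v1"
	osmv1client "github.com/openshift/client-go/monitoring/clientset/versioned"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/rest"
)

// exampleAlertRelabelConfigUsage is a hypothetical helper, not part of the commit.
func exampleAlertRelabelConfigUsage(ctx context.Context, restConfig *rest.Config) error {
	clientset, err := osmv1client.NewForConfig(restConfig)
	if err != nil {
		return err
	}

	arcm, err := newAlertRelabelConfigManager(ctx, clientset)
	if err != nil {
		return err
	}

	// Create a relabel config that stamps a "team" label onto one alert.
	// Illustrative values; the RelabelConfig field shapes are assumed from openshift/api.
	arc := osmv1.AlertRelabelConfig{
		ObjectMeta: metav1.ObjectMeta{Name: "example-relabel", Namespace: "openshift-monitoring"},
		Spec: osmv1.AlertRelabelConfigSpec{
			Configs: []osmv1.RelabelConfig{{
				SourceLabels: []osmv1.LabelName{"alertname"},
				Regex:        "ExampleAlert",
				TargetLabel:  "team",
				Replacement:  "platform",
				Action:       "Replace",
			}},
		},
	}
	if _, err := arcm.Create(ctx, arc); err != nil {
		return err
	}

	// Reads are served from the informer's cache, so a freshly created object
	// may take a moment to appear.
	configs, err := arcm.List(ctx, "")
	if err != nil {
		return err
	}
	for _, c := range configs {
		fmt.Printf("%s/%s: %d relabel config(s)\n", c.Namespace, c.Name, len(c.Spec.Configs))
	}

	return arcm.Delete(ctx, arc.Namespace, arc.Name)
}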

pkg/k8s/alerting_health.go

Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,127 @@
package k8s

import (
	"context"
	"fmt"
	"strings"
	"sync"

	"gopkg.in/yaml.v2"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
)

const (
	clusterMonitoringConfigMap = "cluster-monitoring-config"
	clusterMonitoringConfigKey = "config.yaml"
)

type clusterMonitoringConfig struct {
	EnableUserWorkload bool `yaml:"enableUserWorkload"`
}

// clusterMonitoringConfigManager watches the cluster-monitoring-config ConfigMap
// via an informer and caches the parsed enableUserWorkload value so that
// AlertingHealth never needs a live API call.
type clusterMonitoringConfigManager struct {
	informer cache.SharedIndexInformer

	mu      sync.RWMutex
	enabled bool
	err     error
}

func newClusterMonitoringConfigManager(ctx context.Context, clientset *kubernetes.Clientset) (*clusterMonitoringConfigManager, error) {
	informer := cache.NewSharedIndexInformer(
		cache.NewListWatchFromClient(
			clientset.CoreV1().RESTClient(),
			"configmaps",
			ClusterMonitoringNamespace,
			fields.OneTermEqualSelector("metadata.name", clusterMonitoringConfigMap),
		),
		&corev1.ConfigMap{},
		0,
		cache.Indexers{},
	)

	m := &clusterMonitoringConfigManager{
		informer: informer,
	}

	_, err := informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			cm, ok := obj.(*corev1.ConfigMap)
			if !ok {
				return
			}
			m.handleUpdate(cm)
		},
		UpdateFunc: func(_, newObj interface{}) {
			cm, ok := newObj.(*corev1.ConfigMap)
			if !ok {
				return
			}
			m.handleUpdate(cm)
		},
		DeleteFunc: func(_ interface{}) {
			m.mu.Lock()
			defer m.mu.Unlock()
			m.enabled = false
			m.err = nil
		},
	})
	if err != nil {
		return nil, fmt.Errorf("failed to add event handler to cluster-monitoring-config informer: %w", err)
	}

	go informer.Run(ctx.Done())

	if !cache.WaitForNamedCacheSync("ClusterMonitoringConfig informer", ctx.Done(), informer.HasSynced) {
		return nil, fmt.Errorf("failed to sync ClusterMonitoringConfig informer")
	}

	return m, nil
}

func (m *clusterMonitoringConfigManager) handleUpdate(cm *corev1.ConfigMap) {
	m.mu.Lock()
	defer m.mu.Unlock()

	raw, ok := cm.Data[clusterMonitoringConfigKey]
	if !ok || strings.TrimSpace(raw) == "" {
		m.enabled = false
		m.err = nil
		return
	}

	var cfg clusterMonitoringConfig
	if err := yaml.Unmarshal([]byte(raw), &cfg); err != nil {
		m.enabled = false
		m.err = fmt.Errorf("parse cluster monitoring config.yaml: %w", err)
		return
	}

	m.enabled = cfg.EnableUserWorkload
	m.err = nil
}

func (m *clusterMonitoringConfigManager) userWorkloadEnabled() (bool, error) {
	m.mu.RLock()
	defer m.mu.RUnlock()
	return m.enabled, m.err
}

// AlertingHealth returns alerting route health and UWM enablement status.
func (c *client) AlertingHealth(ctx context.Context) (AlertingHealth, error) {
	health := c.prometheusAlerts.alertingHealth(ctx)

	enabled, err := c.clusterMonitoringConfig.userWorkloadEnabled()
	if err != nil {
		return health, fmt.Errorf("failed to determine user workload enablement: %w", err)
	}
	health.UserWorkloadEnabled = enabled

	return health, nil
}
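The clusterMonitoringConfig struct above only decodes the enableUserWorkload key, so any other keys in config.yaml are ignored. A minimal sketch of that parsing path follows; the YAML payload is made up for illustration and the helper is hypothetical, not part of the commit.

package k8s

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// exampleParseClusterMonitoringConfig shows how the cluster-monitoring-config
// payload decodes into clusterMonitoringConfig (illustrative input).
func exampleParseClusterMonitoringConfig() {
	raw := `
enableUserWorkload: true
prometheusK8s:
  retention: 24h
`
	var cfg clusterMonitoringConfig
	if err := yaml.Unmarshal([]byte(raw), &cfg); err != nil {
		fmt.Println("parse error:", err)
		return
	}

	// Only enableUserWorkload is decoded; unrelated keys are ignored,
	// mirroring what handleUpdate stores for userWorkloadEnabled().
	fmt.Println("user workload monitoring enabled:", cfg.EnableUserWorkload) // true
}

A missing or empty config.yaml key is treated as user-workload monitoring disabled, and a parse failure is surfaced later through the error returned by AlertingHealth.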
