
Commit f5efdf2

sradco and AI Assistant committed

docs/ci/e2e: add alert management documentation, CI workflow, and e2e tests

Signed-off-by: Shirly Radco <sradco@redhat.com>
Signed-off-by: João Vilaça <jvilaca@redhat.com>
Signed-off-by: Aviv Litman <alitman@redhat.com>
Co-authored-by: AI Assistant <noreply@cursor.com>

1 parent 8869060 commit f5efdf2

5 files changed

Lines changed: 782 additions & 0 deletions

File tree

.github/workflows/unit-tests.yaml

Lines changed: 21 additions & 0 deletions
```yaml
name: Unit Tests

on:
  pull_request:
    branches:
      - add-alert-management-api-base

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version-file: go.mod

      - name: Run tests
        run: go test -count=1 $(go list ./... | grep -v /test/e2e)
```

docs/alert-management.md

Lines changed: 41 additions & 0 deletions
## Alert Management Notes

This document covers alert management behavior and prerequisites for the monitoring plugin.

### User workload monitoring prerequisites

To include **user workload** alerts and rules in `/api/v1/alerting/alerts` and `/api/v1/alerting/rules`, the user workload monitoring stack must be enabled. Follow the OpenShift documentation for enabling and configuring UWM:

https://docs.redhat.com/en/documentation/monitoring_stack_for_red_hat_openshift/4.20/html/configuring_user_workload_monitoring/configuring-alerts-and-notifications-uwm
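For reference, enabling the stack boils down to setting `enableUserWorkload: true` in the `cluster-monitoring-config` ConfigMap; the minimal sketch below creates that ConfigMap from scratch (on a cluster that already has one, edit it in place rather than applying this verbatim):

```shell
# Minimal sketch of enabling user workload monitoring, per the docs above.
# Caution: this replaces any existing cluster-monitoring-config ConfigMap.
cat <<'EOF' | oc apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: cluster-monitoring-config
  namespace: openshift-monitoring
data:
  config.yaml: |
    enableUserWorkload: true
EOF
```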
#### How the plugin reads user workload alerts/rules

The plugin prefers **Thanos tenancy** for user workload alerts/rules (RBAC-scoped, requires a namespace parameter). When the client does not provide a `namespace` filter, the plugin discovers candidate namespaces and queries Thanos tenancy per-namespace, using the end-user bearer token.
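For illustration, a tenancy-scoped request looks like the sketch below; `PLUGIN_URL` and the namespace value are placeholders, and the token is the end user's:

```shell
# List user workload rules scoped to a single namespace
# (PLUGIN_URL and the namespace are illustrative placeholders).
TOKEN=$(oc whoami -t)
curl -sS -H "Authorization: Bearer ${TOKEN}" \
  "${PLUGIN_URL}/api/v1/alerting/rules?namespace=my-app-namespace"
```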
Routes in `openshift-user-workload-monitoring` are treated as **fallbacks** (and are also used for some health checks and pending state retrieval).

If you want to create the user workload Prometheus route (optional), you can expose the service:

```shell
oc -n openshift-user-workload-monitoring expose svc/prometheus-user-workload-web --name=prometheus-user-workload-web --port=web
```
If the route is missing/unreachable but tenancy is healthy, the plugin should still return user workload data and suppress route warnings.

#### Alert states

- `/api/v1/alerting/alerts?state=pending`: pending alerts come from Prometheus.
- `/api/v1/alerting/alerts?state=firing`: firing alerts come from Alertmanager when available.
- `/api/v1/alerting/alerts?state=silenced`: silenced alerts come from Alertmanager (requires an Alertmanager endpoint).
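A quick way to exercise each state filter (same placeholder conventions as the earlier sketch):

```shell
# Pending alerts are served from Prometheus.
curl -sS -H "Authorization: Bearer ${TOKEN}" "${PLUGIN_URL}/api/v1/alerting/alerts?state=pending"

# Firing and silenced alerts are served from Alertmanager when it is reachable.
curl -sS -H "Authorization: Bearer ${TOKEN}" "${PLUGIN_URL}/api/v1/alerting/alerts?state=firing"
curl -sS -H "Authorization: Bearer ${TOKEN}" "${PLUGIN_URL}/api/v1/alerting/alerts?state=silenced"
```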
### Alertmanager routing choices

OpenShift supports routing user workload alerts to:

- The **platform Alertmanager** (default instance)
- A **separate Alertmanager** for user workloads (see the sketch below)
- **External Alertmanager** instances

This is a cluster configuration choice and does not change the plugin API shape. The plugin reads alerts from Alertmanager (for firing/silenced) and Prometheus (for pending), then merges platform and user workload results when available.
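As one example of that cluster-level choice, the separate user workload Alertmanager is enabled through the `user-workload-monitoring-config` ConfigMap; a sketch following the OpenShift docs (the exact schema can vary by version):

```shell
# Sketch: enable a dedicated Alertmanager instance for user workloads.
cat <<'EOF' | oc apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: user-workload-monitoring-config
  namespace: openshift-user-workload-monitoring
data:
  config.yaml: |
    alertmanager:
      enabled: true
      enableAlertmanagerConfig: true
EOF
```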
The plugin intentionally reads only from the in-cluster Alertmanager endpoints. Supporting multiple external Alertmanagers would introduce ambiguous alert state and silencing outcomes, because each instance can apply different routing, inhibition, and silence configurations.
Lines changed: 334 additions & 0 deletions
```go
package e2e

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"testing"
	"time"

	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/apimachinery/pkg/util/wait"

	"github.com/openshift/monitoring-plugin/internal/managementrouter"
	"github.com/openshift/monitoring-plugin/pkg/k8s"
	"github.com/openshift/monitoring-plugin/test/e2e/framework"
)

// listRulesForAlertMgmt fetches the plugin's alerting rules endpoint and
// returns the decoded list of rules.
func listRulesForAlertMgmt(ctx context.Context, pluginURL string) ([]monitoringv1.Rule, error) {
	client := &http.Client{Timeout: 10 * time.Second}
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, pluginURL+"/api/v1/alerting/rules", nil)
	if err != nil {
		return nil, err
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
	}

	var listResp struct {
		Data struct {
			Rules []monitoringv1.Rule `json:"rules"`
		} `json:"data"`
		Status string `json:"status"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&listResp); err != nil {
		return nil, err
	}

	return listResp.Data.Rules, nil
}
// TestBulkDeleteUserDefinedAlertRules creates a PrometheusRule with three
// alerts, bulk-deletes two of them through the plugin API, and verifies that
// only the third remains in the cluster object.
func TestBulkDeleteUserDefinedAlertRules(t *testing.T) {
	f, err := framework.New()
	if err != nil {
		t.Fatalf("Failed to create framework: %v", err)
	}

	ctx := context.Background()

	testNamespace, cleanup, err := f.CreateNamespace(ctx, "test-bulk-delete", false)
	if err != nil {
		t.Fatalf("Failed to create test namespace: %v", err)
	}
	defer cleanup()

	forDuration := monitoringv1.Duration("5m")

	testRule1 := monitoringv1.Rule{
		Alert: "TestBulkDeleteAlert1",
		Expr:  intstr.FromString("up == 0"),
		For:   &forDuration,
		Labels: map[string]string{
			"severity": "warning",
		},
		Annotations: map[string]string{
			"description": "Test alert 1 for bulk delete testing",
		},
	}

	testRule2 := monitoringv1.Rule{
		Alert: "TestBulkDeleteAlert2",
		Expr:  intstr.FromString("up == 1"),
		For:   &forDuration,
		Labels: map[string]string{
			"severity": "info",
		},
		Annotations: map[string]string{
			"description": "Test alert 2 for bulk delete testing",
		},
	}

	testRule3 := monitoringv1.Rule{
		Alert: "TestBulkDeleteAlert3",
		Expr:  intstr.FromString("up == 2"),
		For:   &forDuration,
		Labels: map[string]string{
			"severity": "critical",
		},
		Annotations: map[string]string{
			"description": "Test alert 3 for bulk delete testing",
		},
	}

	_, err = createPrometheusRule(ctx, f, testNamespace, testRule1, testRule2, testRule3)
	if err != nil {
		t.Fatalf("Failed to create PrometheusRule: %v", err)
	}

	// Wait until the plugin has loaded the new rules and exposes their IDs.
	var ruleIdsToDelete []string
	err = wait.PollUntilContextTimeout(ctx, 2*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) {
		rules, err := listRulesForAlertMgmt(ctx, f.PluginURL)
		if err != nil {
			t.Logf("Failed to list rules: %v", err)
			return false, nil
		}

		foundRuleIds := []string{}
		for _, rule := range rules {
			if rule.Alert == "TestBulkDeleteAlert1" || rule.Alert == "TestBulkDeleteAlert2" {
				ruleId := rule.Labels[k8s.AlertRuleLabelId]
				if ruleId != "" {
					foundRuleIds = append(foundRuleIds, ruleId)
				}
			}
		}

		if len(foundRuleIds) == 2 {
			ruleIdsToDelete = foundRuleIds
			t.Logf("Found rule IDs to delete: %v", ruleIdsToDelete)
			return true, nil
		}

		t.Logf("Found %d/2 test alerts in memory", len(foundRuleIds))
		return false, nil
	})

	if err != nil {
		t.Fatalf("Timeout waiting for alerts to appear in memory: %v", err)
	}

	reqBody := managementrouter.BulkDeleteUserDefinedAlertRulesRequest{
		RuleIds: ruleIdsToDelete,
	}

	reqJSON, err := json.Marshal(reqBody)
	if err != nil {
		t.Fatalf("Failed to marshal request body: %v", err)
	}

	bulkDeleteURL := fmt.Sprintf("%s/api/v1/alerting/rules", f.PluginURL)
	req, err := http.NewRequestWithContext(ctx, http.MethodDelete, bulkDeleteURL, bytes.NewBuffer(reqJSON))
	if err != nil {
		t.Fatalf("Failed to create HTTP request: %v", err)
	}
	req.Header.Set("Content-Type", "application/json")

	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		t.Fatalf("Failed to make bulk delete request: %v", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(resp.Body)
		t.Fatalf("Expected status code %d, got %d. Response body: %s", http.StatusOK, resp.StatusCode, string(body))
	}

	var bulkDeleteResp managementrouter.BulkDeleteUserDefinedAlertRulesResponse
	if err := json.NewDecoder(resp.Body).Decode(&bulkDeleteResp); err != nil {
		t.Fatalf("Failed to decode response: %v", err)
	}

	if len(bulkDeleteResp.Rules) != 2 {
		t.Fatalf("Expected 2 rules in response, got %d", len(bulkDeleteResp.Rules))
	}

	for _, result := range bulkDeleteResp.Rules {
		if result.StatusCode != http.StatusNoContent {
			t.Errorf("Rule %s deletion failed with status %d: %s", result.Id, result.StatusCode, result.Message)
		} else {
			t.Logf("Rule %s deleted successfully", result.Id)
		}
	}

	// Verify the PrometheusRule object itself was updated in the cluster.
	promRule, err := f.Monitoringv1clientset.MonitoringV1().PrometheusRules(testNamespace).Get(
		ctx,
		"test-prometheus-rule",
		metav1.GetOptions{},
	)
	if err != nil {
		t.Fatalf("Failed to get PrometheusRule after deletion: %v", err)
	}

	if len(promRule.Spec.Groups) != 1 {
		t.Fatalf("Expected 1 rule group, got %d", len(promRule.Spec.Groups))
	}

	ruleGroup := promRule.Spec.Groups[0]
	if len(ruleGroup.Rules) != 1 {
		t.Fatalf("Expected 1 rule remaining, got %d: %+v", len(ruleGroup.Rules), ruleGroup.Rules)
	}

	remainingRule := ruleGroup.Rules[0]
	if remainingRule.Alert != "TestBulkDeleteAlert3" {
		t.Errorf("Expected remaining rule to be TestBulkDeleteAlert3, got %s", remainingRule.Alert)
	}

	if remainingRule.Labels["severity"] != "critical" {
		t.Errorf("Expected severity=critical, got %s", remainingRule.Labels["severity"])
	}

	t.Log("Bulk delete test completed successfully - only TestBulkDeleteAlert3 remains")
}
// TestDeleteUserDefinedAlertRuleById creates a PrometheusRule with two
// alerts, deletes one by its rule ID through the plugin API, and verifies
// that only the other remains.
func TestDeleteUserDefinedAlertRuleById(t *testing.T) {
	f, err := framework.New()
	if err != nil {
		t.Fatalf("Failed to create framework: %v", err)
	}

	ctx := context.Background()

	testNamespace, cleanup, err := f.CreateNamespace(ctx, "test-delete-by-id", false)
	if err != nil {
		t.Fatalf("Failed to create test namespace: %v", err)
	}
	defer cleanup()

	forDuration := monitoringv1.Duration("5m")

	testRule1 := monitoringv1.Rule{
		Alert: "TestDeleteByIdAlert1",
		Expr:  intstr.FromString("up == 0"),
		For:   &forDuration,
		Labels: map[string]string{
			"severity": "warning",
		},
		Annotations: map[string]string{
			"description": "Test alert 1 for delete by id testing",
		},
	}

	testRule2 := monitoringv1.Rule{
		Alert: "TestDeleteByIdAlert2",
		Expr:  intstr.FromString("up == 1"),
		For:   &forDuration,
		Labels: map[string]string{
			"severity": "info",
		},
		Annotations: map[string]string{
			"description": "Test alert 2 for delete by id testing",
		},
	}

	_, err = createPrometheusRule(ctx, f, testNamespace, testRule1, testRule2)
	if err != nil {
		t.Fatalf("Failed to create PrometheusRule: %v", err)
	}

	// Wait until the plugin has loaded the new rule and exposes its ID.
	var ruleIdToDelete string
	err = wait.PollUntilContextTimeout(ctx, 2*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) {
		rules, err := listRulesForAlertMgmt(ctx, f.PluginURL)
		if err != nil {
			t.Logf("Failed to list rules: %v", err)
			return false, nil
		}

		for _, rule := range rules {
			if rule.Alert == "TestDeleteByIdAlert1" {
				ruleIdToDelete = rule.Labels[k8s.AlertRuleLabelId]
				t.Logf("Found rule ID to delete: %s", ruleIdToDelete)
				return true, nil
			}
		}

		t.Log("Test alert not found yet in memory")
		return false, nil
	})

	if err != nil {
		t.Fatalf("Timeout waiting for alerts to appear in memory: %v", err)
	}

	deleteURL := fmt.Sprintf("%s/api/v1/alerting/rules/%s", f.PluginURL, ruleIdToDelete)
	req, err := http.NewRequestWithContext(ctx, http.MethodDelete, deleteURL, nil)
	if err != nil {
		t.Fatalf("Failed to create HTTP request: %v", err)
	}

	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		t.Fatalf("Failed to make delete request: %v", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusNoContent {
		body, _ := io.ReadAll(resp.Body)
		t.Fatalf("Expected status code %d, got %d. Response body: %s", http.StatusNoContent, resp.StatusCode, string(body))
	}

	t.Logf("Rule %s deleted successfully", ruleIdToDelete)

	// Verify the PrometheusRule object itself was updated in the cluster.
	promRule, err := f.Monitoringv1clientset.MonitoringV1().PrometheusRules(testNamespace).Get(
		ctx,
		"test-prometheus-rule",
		metav1.GetOptions{},
	)
	if err != nil {
		t.Fatalf("Failed to get PrometheusRule after deletion: %v", err)
	}

	if len(promRule.Spec.Groups) != 1 {
		t.Fatalf("Expected 1 rule group, got %d", len(promRule.Spec.Groups))
	}

	ruleGroup := promRule.Spec.Groups[0]
	if len(ruleGroup.Rules) != 1 {
		t.Fatalf("Expected 1 rule remaining, got %d: %+v", len(ruleGroup.Rules), ruleGroup.Rules)
	}

	remainingRule := ruleGroup.Rules[0]
	if remainingRule.Alert != "TestDeleteByIdAlert2" {
		t.Errorf("Expected remaining rule to be TestDeleteByIdAlert2, got %s", remainingRule.Alert)
	}

	if remainingRule.Labels["severity"] != "info" {
		t.Errorf("Expected severity=info, got %s", remainingRule.Labels["severity"])
	}

	t.Log("Delete by ID test completed successfully - only TestDeleteByIdAlert2 remains")
}
```
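These tests live under `test/e2e`, which the unit-test workflow above deliberately excludes; a plausible local invocation (assuming `KUBECONFIG` points at a cluster with the plugin deployed) is:

```shell
# Run only the e2e suite against a live cluster (assumes the framework picks
# up cluster access from the standard KUBECONFIG environment variable).
go test -count=1 -v ./test/e2e/...
```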
