Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/integration_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
juju-channel: 3.6/stable
provider: lxd
test-tox-env: integration-juju3.6
modules: '["test_multi_unit_same_machine", "test_charm_metrics_failure", "test_charm_metrics_success", "test_charm_fork_path_change", "test_charm_no_runner", "test_charm_upgrade", "test_reactive"]'
modules: '["test_multi_unit_same_machine", "test_charm_fork_path_change", "test_charm_no_runner", "test_charm_upgrade"]'
# INTEGRATION_TOKEN, INTEGRATION_TOKEN_ALT, OS_* are passed through INTEGRATION_TEST_SECRET_ENV_VALUE_<N>
# mapping. See CONTRIBUTING.md for more details.
extra-arguments: |
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/test_github_runner_manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ jobs:
matrix:
test-module:
- test_debug_ssh
- test_metrics
- test_planner_runner
steps:
- name: Checkout code
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@
from github_runner_manager.errors import IssueMetricEventError
from github_runner_manager.manager.vm_manager import CodeInformation

METRICS_LOG_PATH = Path(os.getenv("METRICS_LOG_PATH", "/var/log/github-runner-metrics.log"))

_DEFAULT_METRICS_LOG_PATH = "/var/log/github-runner-metrics.log"

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -156,8 +155,18 @@ def issue_event(event: Event) -> None:
Raises:
IssueMetricEventError: If the event cannot be logged.
"""
metrics_log_path = get_metrics_log_path()
try:
with METRICS_LOG_PATH.open(mode="a", encoding="utf-8") as metrics_file:
with metrics_log_path.open(mode="a", encoding="utf-8") as metrics_file:
metrics_file.write(f"{event.json(exclude_none=True)}\n")
except OSError as exc:
raise IssueMetricEventError(f"Cannot write to {METRICS_LOG_PATH}") from exc
raise IssueMetricEventError(f"Cannot write to {metrics_log_path}") from exc


def get_metrics_log_path() -> Path:
    """Resolve the metrics log location from the environment.

    The ``METRICS_LOG_PATH`` variable is consulted on every call rather than
    once at import time, so tests and reconfiguration can redirect the log.

    Returns:
        The metrics log file path.
    """
    configured = os.getenv("METRICS_LOG_PATH", _DEFAULT_METRICS_LOG_PATH)
    return Path(configured)
93 changes: 7 additions & 86 deletions github-runner-manager/tests/integration/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
"""Fixtures for github-runner-manager integration tests."""

import logging
import subprocess
import time
from pathlib import Path
from typing import Generator
Expand All @@ -15,88 +14,13 @@
from github.Auth import Token
from github.Branch import Branch
from github.Repository import Repository
from openstack.compute.v2.server import Server as OpenstackServer

from .factories import GitHubConfig, OpenStackConfig, ProxyConfig, TestConfig
from .planner_stub import PlannerStub, PlannerStubConfig

logger = logging.getLogger(__name__)


def wait_for_runner(
    openstack_connection: openstack.connection.Connection,
    test_config: TestConfig,
    timeout: int = 300,
    interval: int = 5,
) -> tuple[OpenstackServer, str] | tuple[None, None]:
    """Poll OpenStack until a runner VM with the test prefix exists and has an IP.

    Args:
        openstack_connection: OpenStack connection object.
        test_config: Test configuration with VM prefix.
        timeout: Maximum time to wait in seconds.
        interval: Time between checks in seconds.

    Returns:
        Tuple of (runner, ip) if found, or (None, None) if not found within timeout.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        matching = [
            server
            for server in openstack_connection.list_servers()
            if server.name.startswith(test_config.vm_prefix)
        ]
        if matching:
            candidate = matching[0]
            logger.info("Found runner: %s", candidate.name)

            # Take the first address of the first network that reports one.
            address = None
            for entries in candidate.addresses.values():
                if entries and entries[0]["addr"]:
                    address = entries[0]["addr"]
                    break

            if address:
                return candidate, address

        time.sleep(interval)

    return None, None


def wait_for_no_runners(
    openstack_connection: openstack.connection.Connection,
    test_config: TestConfig,
    timeout: int = 900,
    interval: int = 15,
) -> bool:
    """Wait until no VMs with the test prefix exist on OpenStack.

    Args:
        openstack_connection: OpenStack connection object.
        test_config: Test configuration with VM prefix.
        timeout: Maximum time to wait in seconds.
        interval: Time between checks in seconds.

    Returns:
        True when no matching VMs exist; False if timeout is reached first.
    """
    deadline = time.time() + timeout
    prefix = test_config.vm_prefix
    while time.time() < deadline:
        leftover = any(
            server.name.startswith(prefix)
            for server in openstack_connection.list_servers()
        )
        if not leftover:
            return True
        time.sleep(interval)
    return False


@pytest.fixture(scope="module")
def test_config(pytestconfig: pytest.Config) -> TestConfig:
"""Create a unique test configuration for parallel test execution.
Expand Down Expand Up @@ -355,16 +279,9 @@ def github_branch(
"""
test_branch = f"test-{test_config.test_id}"

sha_result = subprocess.run(
["/usr/bin/git", "rev-parse", "HEAD"],
capture_output=True,
text=True,
check=True,
)
current_commit_sha = sha_result.stdout.strip()

default_branch = github_repository.get_branch(github_repository.default_branch)
branch_ref = github_repository.create_git_ref(
ref=f"refs/heads/{test_branch}", sha=current_commit_sha
ref=f"refs/heads/{test_branch}", sha=default_branch.commit.sha
)

# Wait for branch to be available, GitHub is eventually consistent
Expand All @@ -376,7 +293,11 @@ def github_branch(
while time.time() - start_time < timeout:
try:
branch = github_repository.get_branch(test_branch)
logger.info("Created test branch: %s at SHA: %s", test_branch, current_commit_sha)
logger.info(
"Created test branch: %s at SHA: %s",
test_branch,
default_branch.commit.sha,
)
break
except Exception as e:
elapsed = time.time() - start_time
Expand Down
8 changes: 6 additions & 2 deletions github-runner-manager/tests/integration/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ def create_default_config(
test_config: TestConfig | None = None,
planner_url: str | None = None,
planner_token: str | None = None,
reconcile_interval: int = 60,
base_virtual_machines: int = 1,
) -> dict[str, Any]:
"""Create a default test configuration dictionary.

Expand All @@ -154,6 +156,8 @@ def create_default_config(
Defaults to new unique values.
planner_url: Planner service URL. Omitted from config when not provided.
planner_token: Planner service token. Omitted from config when not provided.
reconcile_interval: Minutes between delete-loop reconciliation ticks.
base_virtual_machines: Floor for non-reactive desired runners.

Returns:
Configuration dictionary for the application.
Expand Down Expand Up @@ -235,7 +239,7 @@ def create_default_config(
"labels": ["noble", "x64"],
},
"flavor": {"name": openstack_config.flavor or "small", "labels": ["small"]},
"base_virtual_machines": 1,
"base_virtual_machines": base_virtual_machines,
"max_total_virtual_machines": 0,
}
]
Expand All @@ -256,5 +260,5 @@ def create_default_config(
},
**({"planner_url": planner_url} if planner_url else {}),
**({"planner_token": planner_token} if planner_token else {}),
"reconcile_interval": 60,
"reconcile_interval": reconcile_interval,
}
123 changes: 123 additions & 0 deletions github-runner-manager/tests/integration/metrics_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# Copyright 2026 Canonical Ltd.
# See LICENSE file for licensing details.

"""Helpers for app-level integration metrics assertions."""

import json
import time
from pathlib import Path
from typing import Any

from github.Repository import Repository

from github_runner_manager.manager.vm_manager import PostJobStatus
from github_runner_manager.types_.github import JobConclusion

TEST_WORKFLOW_NAMES = [
"Workflow Dispatch Tests",
"Workflow Dispatch Crash Tests",
"Workflow Dispatch Failure Tests 2a34f8b1-41e4-4bcb-9bbf-7a74e6c482f7",
]


def _assert_non_negative_number(metric: dict[str, Any], key: str) -> None:
"""Assert event key exists and contains a non-negative numeric value."""
assert key in metric, f"Missing metric field: {key}"
value = metric[key]
assert isinstance(value, (int, float)), f"Metric field {key} is not numeric: {value!r}"
assert value >= 0, f"Metric field {key} is negative: {value!r}"


def clear_metrics_log(metrics_log_path: Path) -> None:
    """Delete the metrics log so a test starts from an empty state.

    A missing file is not an error, making the call idempotent.
    """
    try:
        metrics_log_path.unlink()
    except FileNotFoundError:
        pass


def get_metrics_events(metrics_log_path: Path) -> list[dict[str, Any]]:
    """Parse the metrics log into a list of event dictionaries.

    Returns an empty list when the log has not been created yet; blank
    lines are skipped, every other line must be a JSON object.
    """
    if not metrics_log_path.exists():
        return []
    events: list[dict[str, Any]] = []
    for raw_line in metrics_log_path.read_text(encoding="utf-8").splitlines():
        if raw_line.strip():
            events.append(json.loads(raw_line))
    return events


def wait_for_events(
    metrics_log_path: Path,
    expected_events: set[str],
    timeout: int = 10 * 60,
    interval: int = 10,
) -> list[dict[str, Any]]:
    """Poll the metrics log until every expected event name has appeared.

    Args:
        metrics_log_path: Path of the metrics log file to read.
        expected_events: Event names that must all be present.
        timeout: Maximum time to wait in seconds.
        interval: Time between polls in seconds.

    Returns:
        All events read from the log once the expected names are present.

    Raises:
        TimeoutError: If the expected events do not appear within timeout.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        events = get_metrics_events(metrics_log_path)
        seen = {event.get("event") for event in events}
        if expected_events.issubset(seen):
            return events
        time.sleep(interval)
    raise TimeoutError(f"Timed out waiting for metrics events: {sorted(expected_events)}")


def assert_events_after_reconciliation(
    events: list[dict[str, Any]],
    flavor: str,
    github_repository: Repository,
    post_job_status: PostJobStatus,
) -> None:
    """Assert runner-start/stop/reconciliation metrics for a completed test flow.

    Args:
        events: Metrics events read from the log.
        flavor: Expected runner flavor for every event.
        github_repository: Repository the workflow ran against.
        post_job_status: Expected post-job status on runner_stop events.
    """
    seen = {event.get("event") for event in events}
    required = {"runner_start", "runner_stop", "reconciliation"}
    assert required <= seen, "Not all metrics events were logged"

    for metric in events:
        name = metric.get("event")
        if name == "runner_start":
            assert metric.get("flavor") == flavor
            assert metric.get("workflow") in TEST_WORKFLOW_NAMES
            assert metric.get("repo") == github_repository.full_name
            assert metric.get("github_event") == "workflow_dispatch"
            _assert_non_negative_number(metric, "idle")
            _assert_non_negative_number(metric, "queue_duration")
        elif name == "runner_stop":
            assert metric.get("flavor") == flavor
            assert metric.get("workflow") in TEST_WORKFLOW_NAMES
            assert metric.get("repo") == github_repository.full_name
            assert metric.get("github_event") == "workflow_dispatch"
            assert metric.get("status") == post_job_status
            if post_job_status == PostJobStatus.ABNORMAL:
                # Abnormal exits must carry a non-zero code; the job may have
                # been cancelled or have no recorded conclusion.
                assert metric.get("status_info", {}).get("code", 0) != 0
                assert metric.get("job_conclusion") in [None, JobConclusion.CANCELLED]
            else:
                assert "status_info" not in metric
                assert metric.get("job_conclusion") == JobConclusion.SUCCESS
            _assert_non_negative_number(metric, "job_duration")
        elif name == "reconciliation":
            assert metric.get("flavor") == flavor
            _assert_non_negative_number(metric, "duration")
            assert metric.get("crashed_runners") == 0
            _assert_non_negative_number(metric, "idle_runners")
            _assert_non_negative_number(metric, "active_runners")
            _assert_non_negative_number(metric, "expected_runners")


def wait_for_runner_to_be_marked_offline(
    github_repository: Repository,
    runner_name: str,
    timeout: int = 30 * 60,
    interval: int = 60,
) -> None:
    """Wait for a runner to become offline or disappear from GitHub.

    Args:
        github_repository: Repository whose self-hosted runners are polled.
        runner_name: Name of the runner expected to go offline.
        timeout: Maximum time to wait in seconds.
        interval: Time between polls in seconds.

    Raises:
        TimeoutError: If the runner is still online when timeout elapses.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        still_online = any(
            runner.name == runner_name and runner.status == "online"
            for runner in github_repository.get_self_hosted_runners()
        )
        if not still_online:
            return
        time.sleep(interval)
    raise TimeoutError(f"Timeout while waiting for runner {runner_name} to be marked offline")
Loading
Loading