Skip to content

Commit d147c75

Browse files
authored
Merge pull request #581 from Dstack-TEE/refactor/dedup-kms-auth-helpers
refactor: deduplicate KMS auth helpers
2 parents 1029e63 + c561a7f commit d147c75

File tree

4 files changed

+81
-85
lines changed

4 files changed

+81
-85
lines changed

kms/src/main_service/upgrade_authority.rs

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
//
33
// SPDX-License-Identifier: Apache-2.0
44

5-
use crate::config::AuthApi;
5+
use crate::config::{AuthApi, KmsConfig};
66
use anyhow::{bail, Context, Result};
77
use dstack_guest_agent_rpc::{
88
dstack_guest_client::DstackGuestClient, AttestResponse, RawQuoteArgs,
@@ -188,19 +188,51 @@ fn url_join(url: &str, path: &str) -> String {
188188
url
189189
}
190190

191-
fn dstack_client() -> DstackGuestClient<PrpcClient> {
191+
pub(crate) fn dstack_client() -> DstackGuestClient<PrpcClient> {
192192
let address = dstack_types::dstack_agent_address();
193193
let http_client = PrpcClient::new(address);
194194
DstackGuestClient::new(http_client)
195195
}
196196

197-
async fn app_attest(report_data: Vec<u8>) -> Result<AttestResponse> {
197+
pub(crate) async fn app_attest(report_data: Vec<u8>) -> Result<AttestResponse> {
198198
dstack_client().attest(RawQuoteArgs { report_data }).await
199199
}
200200

201-
fn pad64(hash: [u8; 32]) -> Vec<u8> {
201+
pub(crate) fn pad64(hash: [u8; 32]) -> Vec<u8> {
202202
let mut padded = Vec::with_capacity(64);
203203
padded.extend_from_slice(&hash);
204204
padded.resize(64, 0);
205205
padded
206206
}
207+
208+
pub(crate) async fn ensure_self_kms_allowed(cfg: &KmsConfig) -> Result<()> {
209+
let boot_info = local_kms_boot_info(cfg.pccs_url.as_deref())
210+
.await
211+
.context("failed to build local KMS boot info")?;
212+
let response = cfg
213+
.auth_api
214+
.is_app_allowed(&boot_info, true)
215+
.await
216+
.context("failed to call KMS auth check")?;
217+
if !response.is_allowed {
218+
bail!("boot denied: {}", response.reason);
219+
}
220+
Ok(())
221+
}
222+
223+
pub(crate) async fn ensure_kms_allowed(
224+
cfg: &KmsConfig,
225+
attestation: &VerifiedAttestation,
226+
) -> Result<()> {
227+
let boot_info = build_boot_info(attestation, false, "")
228+
.context("failed to build KMS boot info from attestation")?;
229+
let response = cfg
230+
.auth_api
231+
.is_app_allowed(&boot_info, true)
232+
.await
233+
.context("failed to call KMS auth check")?;
234+
if !response.is_allowed {
235+
bail!("boot denied: {}", response.reason);
236+
}
237+
Ok(())
238+
}

kms/src/onboard_service.rs

Lines changed: 4 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,13 @@
55
use std::sync::{Arc, Mutex};
66

77
use anyhow::{bail, Context, Result};
8-
use dstack_guest_agent_rpc::{
9-
dstack_guest_client::DstackGuestClient, AttestResponse, RawQuoteArgs,
10-
};
118
use dstack_kms_rpc::{
129
kms_client::KmsClient,
1310
onboard_server::{OnboardRpc, OnboardServer},
1411
AttestationInfoResponse, BootstrapRequest, BootstrapResponse, GetKmsKeyRequest, OnboardRequest,
1512
OnboardResponse,
1613
};
1714
use fs_err as fs;
18-
use http_client::prpc::PrpcClient;
1915
use k256::ecdsa::SigningKey;
2016
use ra_rpc::{
2117
client::{CertInfo, RaClient, RaClientConfig},
@@ -30,7 +26,9 @@ use safe_write::safe_write;
3026

3127
use crate::{
3228
config::KmsConfig,
33-
main_service::upgrade_authority::{build_boot_info, local_kms_boot_info},
29+
main_service::upgrade_authority::{
30+
app_attest, dstack_client, ensure_kms_allowed, ensure_self_kms_allowed, pad64,
31+
},
3432
};
3533

3634
#[derive(Clone)]
@@ -260,7 +258,7 @@ impl Keys {
260258
.map_err(|_| anyhow::anyhow!("source attestation mutex poisoned"))?
261259
.clone()
262260
.context("Missing source KMS attestation")?;
263-
ensure_remote_kms_allowed(cfg, &source_attestation)
261+
ensure_kms_allowed(cfg, &source_attestation)
264262
.await
265263
.context("Source KMS is not allowed for onboarding")?;
266264

@@ -349,52 +347,6 @@ pub(crate) async fn bootstrap_keys(cfg: &KmsConfig) -> Result<()> {
349347
Ok(())
350348
}
351349

352-
fn dstack_client() -> DstackGuestClient<PrpcClient> {
353-
let address = dstack_types::dstack_agent_address();
354-
let http_client = PrpcClient::new(address);
355-
DstackGuestClient::new(http_client)
356-
}
357-
358-
async fn app_attest(report_data: Vec<u8>) -> Result<AttestResponse> {
359-
dstack_client().attest(RawQuoteArgs { report_data }).await
360-
}
361-
362-
async fn ensure_self_kms_allowed(cfg: &KmsConfig) -> Result<()> {
363-
let boot_info = local_kms_boot_info(cfg.pccs_url.as_deref())
364-
.await
365-
.context("Failed to build local KMS boot info")?;
366-
let response = cfg
367-
.auth_api
368-
.is_app_allowed(&boot_info, true)
369-
.await
370-
.context("Failed to call KMS auth check")?;
371-
if !response.is_allowed {
372-
bail!("Boot denied: {}", response.reason);
373-
}
374-
Ok(())
375-
}
376-
377-
async fn ensure_remote_kms_allowed(
378-
cfg: &KmsConfig,
379-
attestation: &VerifiedAttestation,
380-
) -> Result<()> {
381-
ensure_kms_allowed(cfg, attestation).await
382-
}
383-
384-
async fn ensure_kms_allowed(cfg: &KmsConfig, attestation: &VerifiedAttestation) -> Result<()> {
385-
let boot_info = build_boot_info(attestation, false, "")
386-
.context("Failed to build KMS boot info from attestation")?;
387-
let response = cfg
388-
.auth_api
389-
.is_app_allowed(&boot_info, true)
390-
.await
391-
.context("Failed to call KMS auth check")?;
392-
if !response.is_allowed {
393-
bail!("Boot denied: {}", response.reason);
394-
}
395-
Ok(())
396-
}
397-
398350
async fn attest_keys(p256_pubkey: &[u8], k256_pubkey: &[u8]) -> Result<Vec<u8>> {
399351
let p256_hex = hex::encode(p256_pubkey);
400352
let k256_hex = hex::encode(k256_pubkey);
@@ -412,13 +364,6 @@ fn keccak256(msg: &[u8]) -> [u8; 32] {
412364
hasher.finalize().into()
413365
}
414366

415-
fn pad64(hash: [u8; 32]) -> Vec<u8> {
416-
let mut padded = Vec::with_capacity(64);
417-
padded.extend_from_slice(&hash);
418-
padded.resize(64, 0);
419-
padded
420-
}
421-
422367
async fn gen_ra_cert(ca_cert_pem: String, ca_key_pem: String) -> Result<(String, String)> {
423368
use ra_tls::cert::CertRequest;
424369
use ra_tls::rcgen::{KeyPair, PKCS_ECDSA_P256_SHA256};

tests/docs/kms-bootstrap-onboard.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,10 @@ Operational notes:
7575
1. Prefer a **prebuilt KMS image**.
7676
2. `Boot Progress: done` does **not** guarantee the onboard endpoint is ready.
7777
3. The onboarding completion endpoint is **GET `/finish`**.
78-
4. On teepod, onboard mode usually uses the `-8000` URL, while runtime TLS KMS RPC usually uses the `-8000s` URL.
78+
4. On teepod with gateway, onboard mode usually uses the `-8000` URL, while runtime TLS KMS RPC usually uses the `-8000s` URL. **Port forwarding** (`--port tcp:0.0.0.0:<host-port>:8000`) is simpler than gateway for testing, because gateway requires the auth API to return a `gatewayAppId` at boot time.
7979
5. If you use a very small custom webhook instead of the real auth service, `KMS.GetMeta` may fail because `auth_api.get_info()` expects extra chain / contract metadata fields. In that case, use `GetTempCaCert` as the runtime readiness probe.
80+
6. dstack CVMs use QEMU user-mode networking — the host is reachable at **`10.0.2.2`** from inside the CVM. The `source_url` in `Onboard.Onboard` must use a CVM-reachable address (e.g., `https://10.0.2.2:<port>/prpc`), not `127.0.0.1`.
81+
7. **Remote KMS attestation has an empty `osImageHash`.** When the receiver verifies the source KMS during onboard, the `osImageHash` is empty because `vm_config` is unavailable for remote attestation. Auth configs for receiver-side checks must include `"0x"` in the `osImages` array.
8082

8183
---
8284

@@ -99,14 +101,16 @@ Use two independently controllable auth services:
99101

100102
They can be:
101103

102-
1. host-local if reachable by CVMs
104+
1. **Preferred:** host-local, accessed from CVMs via `http://10.0.2.2:<port>` (QEMU host gateway)
103105
2. public services
104106
3. sidecars inside each KMS deployment
105107

106108
At minimum, both policies must allow the KMS instance they serve. During onboard, source-side policy must also allow the destination KMS caller.
107109

108110
For `auth-simple`, `kms.mrAggregated = []` is a deny-all policy for KMS. Add the current KMS MR values explicitly when switching a test from deny to allow.
109111

112+
Include `"0x"` in the `osImages` array for configs used in receiver-side onboard checks (see operational note 7 above).
113+
110114
### 4.3 Deploy `kms-src` and `kms-dst`
111115

112116
Deploy both KMS instances in onboard mode with:

tests/docs/kms-self-authorization.md

Lines changed: 35 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,22 @@ The goal is to validate the following behaviors without depending on `kms/e2e/`
1111

1212
This guide is written as a deployment-and-test runbook so an AI agent can follow it end-to-end.
1313

14-
> **Execution notes from a real run on teepod2 (2026-03-19):**
14+
> **Execution notes from real runs on teepod2 (2026-03-19):**
1515
>
1616
> 1. Do **not** assume a host-local `auth-simple` instance is reachable from a CVM. In practice, the auth API must be:
1717
> - publicly reachable by the CVM, or
1818
> - deployed as a sidecar/internal service inside the same test environment.
19-
> 2. For PR validation, prefer a **prebuilt KMS test image**. The run documented here used `cr.kvin.wang/dstack-kms:kms-auth-checks-157ad4ba`.
19+
> - dstack CVMs use QEMU user-mode networking — the host is reachable at **`10.0.2.2`** from inside the CVM.
20+
> 2. For PR validation, prefer a **prebuilt KMS test image**.
2021
> 3. `Boot Progress: done` only means the VM guest boot finished. It does **not** guarantee the KMS onboard endpoint is already ready.
2122
> 4. If you inject helper scripts through `docker-compose.yaml`, prefer inline `configs.content` over `configs.file` unless you have confirmed the extra files are copied into the deployment bundle.
2223
> 5. The onboard completion endpoint is **GET `/finish`**, not POST.
2324
> 6. Do **not** reuse a previously captured `mr_aggregated` across redeploys. Auth policies must be generated from the attestation of the **current** VM under test.
2425
> 7. KMS now always requires quote/attestation. For local development without TDX hardware, use `sdk/simulator` instead of trying to run a no-attestation KMS flow.
2526
> 8. For `auth-simple`, `kms.mrAggregated = []` is a deny-all policy for KMS. Use that as the baseline deny configuration, then add the measured KMS MR values for allow cases.
27+
> 9. **Port forwarding is simpler than gateway for testing.** Using `--gateway` requires the auth API to return a valid `gatewayAppId`, which adds unnecessary complexity. Use `--port tcp:0.0.0.0:<host-port>:8000` instead.
28+
> 10. **Remote KMS attestation has an empty `osImageHash`.** When the receiver verifies the source KMS during onboard, the `osImageHash` field in the attestation is empty (because `vm_config` is not available for the remote attestation). Auth configs for receiver-side checks must include `"0x"` in the `osImages` array to match this empty hash.
29+
> 11. The `source_url` in the `Onboard.Onboard` request must use an address **reachable from inside the CVM** (e.g., `https://10.0.2.2:<port>/prpc`), not `127.0.0.1` which is the CVM's own loopback.
2630
2731
---
2832

@@ -119,10 +123,10 @@ Strong recommendation for this manual test:
119123

120124
Using a prebuilt image significantly reduces ambiguity when a failure happens: you can focus on KMS authorization logic rather than image build or registry behavior.
121125

122-
Teepod/gateway URL convention observed during a real run:
126+
If you use teepod gateway instead of port forwarding:
123127

124-
- **onboard mode:** use the `-8000` style URL
125-
- **runtime TLS KMS RPC after bootstrap/onboard:** use the `-8000s` style URL
128+
- **onboard mode:** use the `-8000` style URL (plain HTTP)
129+
- **runtime TLS KMS RPC after bootstrap/onboard:** use the `-8000s` style URL (TLS passthrough)
126130

127131
Do not assume the same external URL works before and after onboarding is finished.
128132

@@ -144,9 +148,9 @@ The original plan was to run two host-local `auth-simple` processes. In practice
144148

145149
Choose one of these options:
146150

147-
1. **Preferred:** deploy the auth API as a separate public service or CVM
148-
2. **Also fine:** run the auth API as a sidecar in the same KMS test deployment
149-
3. **Only if reachable:** run `auth-simple` on the operator host and point KMS at that reachable host/IP
151+
1. **Preferred:** run `auth-simple` on the operator host and point KMS at `http://10.0.2.2:<port>` (QEMU host gateway). This is the simplest if the CVMs use QEMU user-mode networking.
152+
2. **Also fine:** deploy the auth API as a separate public service or CVM
153+
3. **Sidecar:** run the auth API as a sidecar in the same KMS test deployment
150154

151155
If you use the sidecar/public-service pattern, keep the same logical split:
152156

@@ -224,12 +228,17 @@ Requirements for **both** VMs:
224228
- `core.onboard.auto_bootstrap_domain = ""`
225229
- `core.auth_api.type = "webhook"`
226230

227-
Point them at different auth services or sidecars:
231+
Point them at different auth services. If using host-local `auth-simple` with QEMU user-mode networking:
228232

229-
- `kms-src` → `http://<host-reachable-ip>:3101`
230-
- `kms-dst` → `http://<host-reachable-ip>:3102`
233+
- `kms-src` → `http://10.0.2.2:3101`
234+
- `kms-dst` → `http://10.0.2.2:3102`
231235

232-
If you use sidecars instead of host-local auth servers, replace those URLs with the sidecar/internal service addresses.
236+
**Recommended deploy method:** use port forwarding (`--port`) instead of gateway. Gateway requires the auth API to return a `gatewayAppId` at boot, which makes testing harder. With port forwarding, the KMS onboard and runtime endpoints are directly accessible on the host:
237+
238+
```bash
239+
vmm-cli.py deploy --name kms-src ... --port tcp:0.0.0.0:9301:8000
240+
vmm-cli.py deploy --name kms-dst ... --port tcp:0.0.0.0:9302:8000
241+
```
233242

234243
If you need an example deployment template, adapt the flow in:
235244

@@ -238,14 +247,18 @@ If you need an example deployment template, adapt the flow in:
238247
Record these values:
239248

240249
```bash
241-
export KMS_SRC_ONBOARD='https://<kms-src-onboard-host>/'
242-
export KMS_DST_ONBOARD='https://<kms-dst-onboard-host>/'
250+
# With port forwarding:
251+
export KMS_SRC_ONBOARD='http://127.0.0.1:9301'
252+
export KMS_DST_ONBOARD='http://127.0.0.1:9302'
253+
export KMS_SRC_RUNTIME='https://127.0.0.1:9301'
254+
export KMS_DST_RUNTIME='https://127.0.0.1:9302'
243255
```
244256

245257
Notes:
246258

247-
- The onboard endpoint is plain onboarding mode, so use `Onboard.*`
248-
- The runtime KMS endpoint is available only after bootstrap/onboard and `/finish`
259+
- The onboard endpoint serves plain HTTP, so use `http://` for `KMS_*_ONBOARD`
260+
- After bootstrap/onboard + `/finish`, the KMS restarts with TLS — use `https://` for `KMS_*_RUNTIME`
261+
- The `source_url` in `Onboard.Onboard` must be reachable from inside the CVM (e.g., `https://10.0.2.2:9301/prpc`)
249262

250263
Wait until the onboard endpoint is actually ready before continuing. A simple probe loop is recommended:
251264

@@ -300,12 +313,14 @@ All three values above are expected to be hex strings **without** the `0x` prefi
300313

301314
#### Deny-by-MR config
302315

303-
Use a wrong `mrAggregated` value while allowing the observed OS image:
316+
Use a wrong `mrAggregated` value while allowing the observed OS image.
317+
318+
> **Important:** include `"0x"` in `osImages` to handle remote KMS attestation during onboard receiver-side checks, where `osImageHash` is empty because `vm_config` is unavailable for the remote attestation.
304319
305320
```bash
306321
cat > /tmp/kms-self-auth/deny-by-mr.json <<'EOF'
307322
{
308-
"osImages": ["0xREPLACE_OS"],
323+
"osImages": ["0xREPLACE_OS", "0x"],
309324
"gatewayAppId": "any",
310325
"kms": {
311326
"mrAggregated": ["0x0000000000000000000000000000000000000000000000000000000000000000"],
@@ -322,7 +337,7 @@ EOF
322337
```bash
323338
cat > /tmp/kms-self-auth/allow-single.json <<'EOF'
324339
{
325-
"osImages": ["0xREPLACE_OS"],
340+
"osImages": ["0xREPLACE_OS", "0x"],
326341
"gatewayAppId": "any",
327342
"kms": {
328343
"mrAggregated": ["0xREPLACE_MR"],
@@ -339,7 +354,7 @@ EOF
339354
```bash
340355
cat > /tmp/kms-self-auth/allow-src-and-dst.json <<'EOF'
341356
{
342-
"osImages": ["0xREPLACE_SRC_OS", "0xREPLACE_DST_OS"],
357+
"osImages": ["0xREPLACE_SRC_OS", "0xREPLACE_DST_OS", "0x"],
343358
"gatewayAppId": "any",
344359
"kms": {
345360
"mrAggregated": ["0xREPLACE_SRC_MR", "0xREPLACE_DST_MR"],

0 commit comments

Comments
 (0)