diff --git a/packages/bcode-browser/skills/BROWSER.md b/packages/bcode-browser/skills/BROWSER.md index 423e4b048..f5194a557 100644 --- a/packages/bcode-browser/skills/BROWSER.md +++ b/packages/bcode-browser/skills/BROWSER.md @@ -133,8 +133,11 @@ await session.Input.dispatchMouseEvent({ type: "mouseReleased", x, y, button: "l await session.Input.insertText({ text: "hello" }) // Screenshot. -const { data } = await session.Page.captureScreenshot({ format: "png" }) -// data is base64; write with the `write` tool or process in JS. +await session.Page.captureScreenshot({ format: "png" }) +// You see the image inline on the next turn — `browser_execute` automatically +// attaches every `Page.captureScreenshot` result. No need to decode, save, or +// `read` the bytes back. The base64 is still available as `data` on the call's +// return value for the rare case you want to process it programmatically. ``` For the full menu of UI mechanics — dropdowns, dialogs, iframes, shadow DOM, uploads, scrolling, screenshots-with-highlights — list `{{SKILLS_DIR}}/interaction-skills/` to see all available topics, then read the relevant one. diff --git a/packages/bcode-browser/skills/interaction-skills/screenshots.md b/packages/bcode-browser/skills/interaction-skills/screenshots.md index 94c5a6962..6d1146992 100644 --- a/packages/bcode-browser/skills/interaction-skills/screenshots.md +++ b/packages/bcode-browser/skills/interaction-skills/screenshots.md @@ -2,12 +2,19 @@ `session.Page.captureScreenshot` is your default discovery and verification tool. +**Auto-attached.** Every successful `Page.captureScreenshot` made during a `browser_execute` call is automatically surfaced to you as an inline image attachment on the next turn — the same channel the `read` tool uses for image files. You don't need to decode the base64, save it, or `read` it back to see the image. 
+ +The `data` field on the return value still carries the base64 string for the rare case where you want to process the image programmatically (OCR, diff against a previous shot, dimension extraction). + ## Core calls ```js // Viewport only (default) — fastest, matches what the user sees +await session.Page.captureScreenshot({ format: 'png' }) +// You'll see the image inline on the next turn. No write/read step needed. + +// If you do want the bytes (e.g. to write to disk yourself): const { data } = await session.Page.captureScreenshot({ format: 'png' }) -// Cross-platform temp dir: /tmp on Linux, /var/folders/… on macOS, %TEMP% on Windows const { tmpdir } = await import('node:os') await Bun.write(`${tmpdir()}/shot.png`, Buffer.from(data, 'base64')) diff --git a/packages/bcode-browser/src/browser-execute.ts b/packages/bcode-browser/src/browser-execute.ts index 17bebdbc2..64a6802d4 100644 --- a/packages/bcode-browser/src/browser-execute.ts +++ b/packages/bcode-browser/src/browser-execute.ts @@ -41,6 +41,7 @@ // Level-2 hook in packages/opencode is a thin adapter. import fs from "fs/promises" +import path from "path" import { Effect, Schema } from "effect" import { SessionStore } from "./session-store" import { Skills } from "./skills" @@ -78,14 +79,44 @@ export interface ExecuteContext { readonly onChunk?: (output: string) => Effect.Effect } +// One screenshot collected during an execute() call. Drained into the +// Level-2 wrapper's `attachments[]` so the agent sees the image inline on the +// next assistant turn — no decode/write/read dance from inside the snippet. +export interface CollectedScreenshot { + readonly mime: "image/png" | "image/jpeg" | "image/webp" + readonly base64: string +} + export interface ExecuteResult { readonly output: string // The snippet's `return` value, JSON-serialized when possible. `undefined` // serializes as `null` (JSON has no undefined). Non-serializable values // fall back to `String(v)`. 
readonly result: string + // Every successful `Page.captureScreenshot` made by the snippet, in the + // order the CDP responses came back. Empty when the snippet didn't take + // any screenshots. + readonly screenshots: readonly CollectedScreenshot[] +} + +const SCREENSHOT_FORMAT_TO_MIME: Record<string, CollectedScreenshot["mime"]> = { + png: "image/png", + jpeg: "image/jpeg", + webp: "image/webp", +} + +const SCREENSHOT_FORMAT_TO_EXT: Record<string, string> = { + png: "png", + jpeg: "jpg", + webp: "webp", } +const screenshotMime = (format: unknown): CollectedScreenshot["mime"] => + SCREENSHOT_FORMAT_TO_MIME[typeof format === "string" ? format : "png"] ?? "image/png" + +const screenshotExt = (format: unknown): string => + SCREENSHOT_FORMAT_TO_EXT[typeof format === "string" ? format : "png"] ?? "png" + // AsyncFunction is not a global — pull it off an async arrow's constructor. const AsyncFunction = (async () => {}).constructor as new ( ...args: string[] @@ -145,12 +176,46 @@ export const make = Effect.fn("BrowserExecute.make")(function* (dataDir: string) debug: tee, }) + // Screenshot tap. Subscribes to the Session's call-result stream for + // the duration of this execute() call; every successful + // `Page.captureScreenshot` is collected (drained into `attachments[]` + // by the Level-2 wrapper so the agent sees the image inline) and, + // when `BCODE_SCREENSHOT_DIR` is set, also written to disk for + // eval-judge consumption. Two consumers of one tap. + // + // Concurrency note: parallel execute() calls against the same Session + // (rare but possible — different sessionIDs share no Session, but a + // single sessionID with two in-flight tool calls would) each subscribe + // independently and would each see all screenshots produced during + // their lifetime. Acceptable for v1; opencode tool calls within one + // assistant message are serialized anyway. 
+ const screenshots: CollectedScreenshot[] = [] + const dumpDir = process.env.BCODE_SCREENSHOT_DIR + const startedAt = Date.now() + const pendingDumps: Promise<void>[] = [] // in-flight disk dumps; settled before execute() returns + const unsubscribe = session.onCallResult((method, params, result) => { + if (method !== "Page.captureScreenshot") return + const data = (result as { data?: unknown } | null | undefined)?.data // narrowed once; no cast needed below + if (typeof data !== "string") return + const p = (params ?? {}) as { format?: unknown } + const mime = screenshotMime(p.format) + const ext = screenshotExt(p.format) + const idx = screenshots.length // position of this shot within the current execute() call + screenshots.push({ mime, base64: data }) + if (dumpDir) { + const filename = `${ctx.sessionID}-${startedAt}-${String(idx).padStart(3, "0")}.${ext}` + pendingDumps.push(fs.mkdir(dumpDir, { recursive: true }) + .then(() => fs.writeFile(path.join(dumpDir, filename), Buffer.from(data, "base64"))) + .catch(() => { /* eval-side dump is best-effort */ })) + } + }) + const ran = yield* Effect.tryPromise({ try: () => wrapped(session, snippetConsole), catch: (err) => new Error(`browser_execute snippet threw: ${err instanceof Error ? err.stack ?? err.message : String(err)}`), - }) + }).pipe(Effect.ensuring(Effect.sync(() => unsubscribe())), Effect.ensuring(Effect.promise(() => Promise.all(pendingDumps)))) // stop tapping, then let the best-effort dumps settle - return { output, result: serialize(ran) } satisfies ExecuteResult + return { output, result: serialize(ran), screenshots } satisfies ExecuteResult }).pipe( Effect.scoped, Effect.timeoutOrElse({ diff --git a/packages/bcode-browser/src/cdp/session.ts b/packages/bcode-browser/src/cdp/session.ts index 66327a148..6fb400f5a 100644 --- a/packages/bcode-browser/src/cdp/session.ts +++ b/packages/bcode-browser/src/cdp/session.ts @@ -47,6 +47,7 @@ export class Session implements Transport { private pending = new Map(); private activeSessionId: string | undefined; private eventListeners: Array<(method: string, params: unknown, sessionId?: string) => void> = []; + private callResultListeners: Array<(method: string, params: unknown, result: unknown) => void> = []; // Generated bindings — one per CDP domain. 
// Initialized lazily after construction so `_call` is available. @@ -170,6 +171,23 @@ export class Session implements Transport { }; } + /** + * Subscribe to all successful CDP method results. Returns an unsubscribe fn. + * Listeners run synchronously, just before the `_call` promise resolves; errors they throw are swallowed. + * + * Used by `browser-execute` to collect `Page.captureScreenshot` outputs + * from inside an execute() call (drained into `attachments[]` so the agent + * sees the image inline; optionally also written to `BCODE_SCREENSHOT_DIR` + * for eval-judge consumption). Generic by design — keeps `Session` + * agnostic of any one method's semantics. + */ + onCallResult(fn: (method: string, params: unknown, result: unknown) => void): () => void { + this.callResultListeners.push(fn); + return () => { + this.callResultListeners = this.callResultListeners.filter(x => x !== fn); + }; + } + /** Wait for the next event matching `method` (and optional predicate). */ waitFor(method: string, predicate?: (params: T) => boolean, timeoutMs = 30_000): Promise { return new Promise((resolve, reject) => { @@ -198,7 +216,15 @@ export class Session implements Transport { msg.sessionId = this.activeSessionId; } return new Promise((resolve, reject) => { - this.pending.set(id, { resolve, reject }); + this.pending.set(id, { + resolve: (v) => { + for (const fn of this.callResultListeners) { + try { fn(method, params, v); } catch { /* a throwing listener must not break the CDP call */ } + } + resolve(v); + }, + reject, + }); this.ws!.send(JSON.stringify(msg)); }); } diff --git a/packages/bcode-browser/test/browser-execute.test.ts b/packages/bcode-browser/test/browser-execute.test.ts index 1c2f2967b..20a4e5670 100644 --- a/packages/bcode-browser/test/browser-execute.test.ts +++ b/packages/bcode-browser/test/browser-execute.test.ts @@ -113,6 +113,63 @@ test.skipIf(!enabled)("workspace import inside a snippet", async () => { expect(JSON.parse(result.result)).toBe("bcode-be") }) +test.skipIf(!enabled)("Page.captureScreenshot is collected into 
result.screenshots", async () => { + const result = await Effect.runPromise( + Effect.scoped( + Effect.gen(function* () { + const impl = yield* BrowserExecute.make(dataDir) + return yield* impl.execute( + { + code: `await session.Page.enable(); + await session.Page.navigate({ url: "data:text/html,shothi" }); + await session.waitFor("Page.loadEventFired", undefined, 5000); + const a = await session.Page.captureScreenshot({ format: "png" }); + const b = await session.Page.captureScreenshot({ format: "jpeg", quality: 50 }); + return { aLen: a.data.length, bLen: b.data.length };`, + }, + { sessionID, workspaceDir }, + ) + }), + ), + ) + expect(result.screenshots).toHaveLength(2) + expect(result.screenshots[0]!.mime).toBe("image/png") + expect(result.screenshots[1]!.mime).toBe("image/jpeg") + // base64 must round-trip back to non-empty bytes for both shots. + expect(Buffer.from(result.screenshots[0]!.base64, "base64").length).toBeGreaterThan(0) + expect(Buffer.from(result.screenshots[1]!.base64, "base64").length).toBeGreaterThan(0) +}) + +test.skipIf(!enabled)("BCODE_SCREENSHOT_DIR dumps screenshots to disk", async () => { + const dump = await fs.mkdtemp(path.join(os.tmpdir(), "bcode-shotdump-")) + const prev = process.env.BCODE_SCREENSHOT_DIR + process.env.BCODE_SCREENSHOT_DIR = dump + try { + await Effect.runPromise( + Effect.scoped( + Effect.gen(function* () { + const impl = yield* BrowserExecute.make(dataDir) + return yield* impl.execute( + { + code: `await session.Page.captureScreenshot({ format: "png" });`, + }, + { sessionID, workspaceDir }, + ) + }), + ), + ) + // Disk dump is fire-and-forget; give it a tick to land. 
+ await new Promise((r) => setTimeout(r, 150)) + const files = await fs.readdir(dump) + expect(files.length).toBeGreaterThan(0) + expect(files.every((f) => f.endsWith(".png"))).toBe(true) + } finally { + if (prev === undefined) delete process.env.BCODE_SCREENSHOT_DIR + else process.env.BCODE_SCREENSHOT_DIR = prev + await fs.rm(dump, { recursive: true, force: true }) + } +}) + test.skipIf(!enabled)("syntax error in snippet surfaces a clean failure", async () => { await expect( Effect.runPromise( diff --git a/packages/opencode/src/tool/browser-execute.ts b/packages/opencode/src/tool/browser-execute.ts index f0632f732..e7dd420c5 100644 --- a/packages/opencode/src/tool/browser-execute.ts +++ b/packages/opencode/src/tool/browser-execute.ts @@ -55,15 +55,30 @@ export const BrowserExecuteTool = Tool.define( metadata: { output: preview(output) }, }), }) + // Drain every `Page.captureScreenshot` made during this snippet + // into `attachments[]`. Opencode appends FilePart attachments to + // the next assistant turn as image parts, so the model receives + // the screenshot natively as vision input — no decode/write/read + // dance from inside the snippet. Same channel `read` and + // `webfetch` use when they surface images. + const attachments = result.screenshots.map((s) => ({ + type: "file" as const, + mime: s.mime, + url: `data:${s.mime};base64,${s.base64}`, + })) return { title: "browser_execute", output: [ result.output.trimEnd(), result.result === "null" ? "" : `=> ${result.result}`, + attachments.length > 0 + ? `(${attachments.length} screenshot${attachments.length === 1 ? "" : "s"} attached)` + : "", ] .filter(Boolean) .join("\n\n"), metadata: { result: result.result, output: preview(result.output) }, + attachments, } }).pipe(Effect.orDie), }