Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions packages/bcode-browser/skills/BROWSER.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,11 @@ await session.Input.dispatchMouseEvent({ type: "mouseReleased", x, y, button: "l
await session.Input.insertText({ text: "hello" })

// Screenshot.
const { data } = await session.Page.captureScreenshot({ format: "png" })
// data is base64; write with the `write` tool or process in JS.
await session.Page.captureScreenshot({ format: "png" })
// You see the image inline on the next turn — `browser_execute` automatically
// attaches every `Page.captureScreenshot` result. No need to decode, save, or
// `read` the bytes back. The base64 string is still available as `data` on the
// return value for the rare case you want to process it programmatically.
```

For the full menu of UI mechanics — dropdowns, dialogs, iframes, shadow DOM, uploads, scrolling, screenshots-with-highlights — list `{{SKILLS_DIR}}/interaction-skills/` to see all available topics, then read the relevant one.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,19 @@

`session.Page.captureScreenshot` is your default discovery and verification tool.

**Auto-attached.** Every successful `Page.captureScreenshot` made during a `browser_execute` call is automatically surfaced to you as an inline image attachment on the next turn — the same channel the `read` tool uses for image files. You don't need to decode the base64, save it, or `read` it back to see the image.

The `data` field on the return value still carries the base64 string for the rare case where you want to process the image programmatically (OCR, diff against a previous shot, dimension extraction).

## Core calls

```js
// Viewport only (default) — fastest, matches what the user sees
await session.Page.captureScreenshot({ format: 'png' })
// You'll see the image inline on the next turn. No write/read step needed.

// If you do want the bytes (e.g. to write to disk yourself):
const { data } = await session.Page.captureScreenshot({ format: 'png' })
// Cross-platform temp dir: /tmp on Linux, /var/folders/… on macOS, %TEMP% on Windows
const { tmpdir } = await import('node:os')
await Bun.write(`${tmpdir()}/shot.png`, Buffer.from(data, 'base64'))

Expand Down
69 changes: 67 additions & 2 deletions packages/bcode-browser/src/browser-execute.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
// Level-2 hook in packages/opencode is a thin adapter.

import fs from "fs/promises"
import path from "path"
import { Effect, Schema } from "effect"
import { SessionStore } from "./session-store"
import { Skills } from "./skills"
Expand Down Expand Up @@ -78,14 +79,44 @@ export interface ExecuteContext {
readonly onChunk?: (output: string) => Effect.Effect<void>
}

/**
 * A single screenshot captured while an `execute()` call was running.
 *
 * The Level-2 wrapper drains these into its `attachments[]`, which lets the
 * agent see the image inline on the next assistant turn without the snippet
 * having to decode, write, or read anything itself.
 */
export interface CollectedScreenshot {
  /** MIME type of the captured image. */
  readonly mime: "image/png" | "image/jpeg" | "image/webp"
  /** The image, base64-encoded. */
  readonly base64: string
}

/** The value an `execute()` call resolves with. */
export interface ExecuteResult {
  readonly output: string
  /**
   * The snippet's `return` value, JSON-serialized when possible.
   * `undefined` serializes as `null` (JSON has no undefined), and values
   * that cannot be serialized fall back to `String(v)`.
   */
  readonly result: string
  /**
   * Every successful `Page.captureScreenshot` the snippet made, ordered by
   * when the CDP responses came back. Empty when the snippet took no
   * screenshots.
   */
  readonly screenshots: ReadonlyArray<CollectedScreenshot>
}

/** CDP `Page.captureScreenshot` `format` value -> MIME type of the resulting image. */
const SCREENSHOT_FORMAT_TO_MIME: Record<string, CollectedScreenshot["mime"]> = {
  png: "image/png",
  jpeg: "image/jpeg",
  webp: "image/webp",
}

/** CDP `Page.captureScreenshot` `format` value -> file extension used for disk dumps. */
const SCREENSHOT_FORMAT_TO_EXT: Record<string, string> = {
  png: "png",
  jpeg: "jpg",
  webp: "webp",
}

/**
 * Look `format` up in one of the tables above.
 *
 * A non-string `format` is treated as `"png"`. Only the table's *own* keys
 * count: a plain object literal also exposes inherited members such as
 * `constructor`, `toString` or `__proto__`, and a bare `table[key]` would
 * hand those back (a function/object, not a string) instead of the fallback.
 */
const lookupScreenshotFormat = <T>(table: Record<string, T>, format: unknown, fallback: T): T => {
  const key = typeof format === "string" ? format : "png"
  if (!Object.prototype.hasOwnProperty.call(table, key)) return fallback
  return table[key] ?? fallback
}

/**
 * MIME type for a screenshot taken with the given CDP `format` param.
 *
 * @param format the `format` value passed to `Page.captureScreenshot`; may be
 *   absent or of any type.
 * @returns the matching MIME type, or `"image/png"` when `format` is not a
 *   string or is not one of `png` / `jpeg` / `webp`.
 */
const screenshotMime = (format: unknown): CollectedScreenshot["mime"] =>
  lookupScreenshotFormat(SCREENSHOT_FORMAT_TO_MIME, format, "image/png")

/**
 * File extension (without the dot) for a screenshot taken with the given CDP
 * `format` param.
 *
 * @param format the `format` value passed to `Page.captureScreenshot`; may be
 *   absent or of any type.
 * @returns the matching extension, or `"png"` when `format` is not a string
 *   or is not one of `png` / `jpeg` / `webp`.
 */
const screenshotExt = (format: unknown): string =>
  lookupScreenshotFormat(SCREENSHOT_FORMAT_TO_EXT, format, "png")

// AsyncFunction is not a global — pull it off an async arrow's constructor.
const AsyncFunction = (async () => {}).constructor as new (
...args: string[]
Expand Down Expand Up @@ -145,12 +176,46 @@ export const make = Effect.fn("BrowserExecute.make")(function* (dataDir: string)
debug: tee,
})

// Screenshot tap. Subscribes to the Session's call-result stream for
// the duration of this execute() call; every successful
// `Page.captureScreenshot` is collected (drained into `attachments[]`
// by the Level-2 wrapper so the agent sees the image inline) and,
// when `BCODE_SCREENSHOT_DIR` is set, also written to disk for
// eval-judge consumption. Two consumers of one tap.
//
// Concurrency note: parallel execute() calls against the same Session
// (rare but possible — different sessionIDs share no Session, but a
// single sessionID with two in-flight tool calls would) each subscribe
// independently and would each see all screenshots produced during
// their lifetime. Acceptable for v1; opencode tool calls within one
// assistant message are serialized anyway.
const screenshots: CollectedScreenshot[] = []
const dumpDir = process.env.BCODE_SCREENSHOT_DIR
const startedAt = Date.now()
let seq = 0
const unsubscribe = session.onCallResult((method, params, result) => {
if (method !== "Page.captureScreenshot") return
const r = result as { data?: unknown }
if (typeof r?.data !== "string") return
const p = (params ?? {}) as { format?: unknown }
const mime = screenshotMime(p.format)
const ext = screenshotExt(p.format)
const idx = seq++
screenshots.push({ mime, base64: r.data })
if (dumpDir) {
const filename = `${ctx.sessionID}-${startedAt}-${String(idx).padStart(3, "0")}.${ext}`
fs.mkdir(dumpDir, { recursive: true })
.then(() => fs.writeFile(path.join(dumpDir, filename), Buffer.from(r.data as string, "base64")))
.catch(() => { /* eval-side dump is best-effort */ })
}
})

const ran = yield* Effect.tryPromise({
try: () => wrapped(session, snippetConsole),
catch: (err) => new Error(`browser_execute snippet threw: ${err instanceof Error ? err.stack ?? err.message : String(err)}`),
})
}).pipe(Effect.ensuring(Effect.sync(() => unsubscribe())))

return { output, result: serialize(ran) } satisfies ExecuteResult
return { output, result: serialize(ran), screenshots } satisfies ExecuteResult
}).pipe(
Effect.scoped,
Effect.timeoutOrElse({
Expand Down
28 changes: 27 additions & 1 deletion packages/bcode-browser/src/cdp/session.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ export class Session implements Transport {
private pending = new Map<number, Pending>();
private activeSessionId: string | undefined;
private eventListeners: Array<(method: string, params: unknown, sessionId?: string) => void> = [];
private callResultListeners: Array<(method: string, params: unknown, result: unknown) => void> = [];

// Generated bindings — one per CDP domain.
// Initialized lazily after construction so `_call` is available.
Expand Down Expand Up @@ -170,6 +171,23 @@ export class Session implements Transport {
};
}

/**
 * Register a listener for every CDP method call that completes successfully.
 *
 * Each listener is invoked synchronously with the method name, the params
 * that were sent, and the result, immediately before the promise returned
 * by `_call` is resolved. Rejected calls do not notify listeners, and
 * anything a listener throws is swallowed.
 *
 * `browser-execute` uses this to collect `Page.captureScreenshot` outputs
 * during an execute() call (drained into `attachments[]` so the agent sees
 * the image inline; optionally also written to `BCODE_SCREENSHOT_DIR` for
 * eval-judge consumption). The hook is generic on purpose so `Session`
 * stays agnostic of any one method's semantics.
 *
 * @param fn callback receiving `(method, params, result)`.
 * @returns a function that unsubscribes `fn` (every registration of it).
 */
onCallResult(fn: (method: string, params: unknown, result: unknown) => void): () => void {
  this.callResultListeners.push(fn);
  const unsubscribe = (): void => {
    // Build a new array rather than mutating in place, so a dispatch loop
    // that is currently iterating the old array is not disturbed.
    this.callResultListeners = this.callResultListeners.filter((registered) => registered !== fn);
  };
  return unsubscribe;
}

/** Wait for the next event matching `method` (and optional predicate). */
waitFor<T = unknown>(method: string, predicate?: (params: T) => boolean, timeoutMs = 30_000): Promise<T> {
return new Promise((resolve, reject) => {
Expand Down Expand Up @@ -198,7 +216,15 @@ export class Session implements Transport {
msg.sessionId = this.activeSessionId;
}
return new Promise((resolve, reject) => {
this.pending.set(id, { resolve, reject });
this.pending.set(id, {
resolve: (v) => {
for (const fn of this.callResultListeners) {
try { fn(method, params, v); } catch { /* ignore */ }
}
resolve(v);
},
reject,
});
this.ws!.send(JSON.stringify(msg));
});
}
Expand Down
57 changes: 57 additions & 0 deletions packages/bcode-browser/test/browser-execute.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,63 @@ test.skipIf(!enabled)("workspace import inside a snippet", async () => {
expect(JSON.parse(result.result)).toBe("bcode-be")
})

test.skipIf(!enabled)("Page.captureScreenshot is collected into result.screenshots", async () => {
  // Snippet executed by browser_execute: load a trivial page, then capture
  // one PNG and one JPEG so both MIME mappings are exercised.
  const code = `await session.Page.enable();
await session.Page.navigate({ url: "data:text/html,<title>shot</title><body>hi" });
await session.waitFor("Page.loadEventFired", undefined, 5000);
const a = await session.Page.captureScreenshot({ format: "png" });
const b = await session.Page.captureScreenshot({ format: "jpeg", quality: 50 });
return { aLen: a.data.length, bLen: b.data.length };`

  const program = Effect.gen(function* () {
    const impl = yield* BrowserExecute.make(dataDir)
    return yield* impl.execute({ code }, { sessionID, workspaceDir })
  })
  const { screenshots } = await Effect.runPromise(Effect.scoped(program))

  // Two captures in, two collected screenshots out, in call order.
  expect(screenshots).toHaveLength(2)
  const [pngShot, jpegShot] = screenshots
  expect(pngShot!.mime).toBe("image/png")
  expect(jpegShot!.mime).toBe("image/jpeg")
  // Both base64 payloads must decode to a non-empty byte buffer.
  expect(Buffer.from(pngShot!.base64, "base64").length).toBeGreaterThan(0)
  expect(Buffer.from(jpegShot!.base64, "base64").length).toBeGreaterThan(0)
})

test.skipIf(!enabled)("BCODE_SCREENSHOT_DIR dumps screenshots to disk", async () => {
  // Point the dump at a throwaway directory and remember the previous value
  // so the environment is restored whatever happens below.
  const dump = await fs.mkdtemp(path.join(os.tmpdir(), "bcode-shotdump-"))
  const prev = process.env.BCODE_SCREENSHOT_DIR
  process.env.BCODE_SCREENSHOT_DIR = dump
  try {
    await Effect.runPromise(
      Effect.scoped(
        Effect.gen(function* () {
          const impl = yield* BrowserExecute.make(dataDir)
          return yield* impl.execute(
            {
              code: `await session.Page.captureScreenshot({ format: "png" });`,
            },
            { sessionID, workspaceDir },
          )
        }),
      ),
    )
    // The disk dump is fire-and-forget, so execute() can return before the
    // file exists. Poll with a deadline instead of sleeping a fixed interval:
    // a single short sleep is flaky on slow or loaded machines, while polling
    // returns as soon as the file lands.
    const deadline = Date.now() + 5_000
    let files = await fs.readdir(dump)
    while (files.length === 0 && Date.now() < deadline) {
      await new Promise((r) => setTimeout(r, 25))
      files = await fs.readdir(dump)
    }
    expect(files.length).toBeGreaterThan(0)
    expect(files.every((f) => f.endsWith(".png"))).toBe(true)
  } finally {
    if (prev === undefined) delete process.env.BCODE_SCREENSHOT_DIR
    else process.env.BCODE_SCREENSHOT_DIR = prev
    await fs.rm(dump, { recursive: true, force: true })
  }
})

test.skipIf(!enabled)("syntax error in snippet surfaces a clean failure", async () => {
await expect(
Effect.runPromise(
Expand Down
15 changes: 15 additions & 0 deletions packages/opencode/src/tool/browser-execute.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,30 @@ export const BrowserExecuteTool = Tool.define(
metadata: { output: preview(output) },
}),
})
// Drain every `Page.captureScreenshot` made during this snippet
// into `attachments[]`. Opencode appends FilePart attachments to
// the next assistant turn as image parts, so the model receives
// the screenshot natively as vision input — no decode/write/read
// dance from inside the snippet. Same channel `read` and
// `webfetch` use when they surface images.
const attachments = result.screenshots.map((s) => ({
type: "file" as const,
mime: s.mime,
url: `data:${s.mime};base64,${s.base64}`,
}))
return {
title: "browser_execute",
output: [
result.output.trimEnd(),
result.result === "null" ? "" : `=> ${result.result}`,
attachments.length > 0
? `(${attachments.length} screenshot${attachments.length === 1 ? "" : "s"} attached)`
: "",
]
.filter(Boolean)
.join("\n\n"),
metadata: { result: result.result, output: preview(result.output) },
attachments,
}
}).pipe(Effect.orDie),
}
Expand Down
Loading