diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..f001789
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,35 @@
+# Version control and CI metadata
+.git
+.gitignore
+.github
+
+# Python environments, bytecode and logs
+.venv
+**/__pycache__
+**/*.pyc
+**/*.pyo
+**/*.pyd
+engine/logs/
+
+# Node dependencies and frontend build output (the Dockerfile rebuilds them)
+node_modules
+frontend/node_modules
+.npm-cache
+frontend/.vite
+frontend/dist
+
+# Model weights are supplied at runtime through the /app/models volume.
+# .dockerignore patterns are anchored at the context root, so a bare "*.pt"
+# only matches root-level files; "**/" also catches e.g. engine/best_model.pt.
+**/*.pt
+**/*.bin
+models/
+
+# Windows build artifacts
+*.exe
+
+# Images and documentation (root level)
+*.png
+*.jpg
+*.jpeg
+*.md
+LICENSE
+
+# OS and editor metadata
+.DS_Store
+Thumbs.db
+.idea
+.vscode
\ No newline at end of file
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1910dd9..311ad33 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -44,7 +44,7 @@ jobs:
           pip install fastapi "uvicorn[standard]" pydantic pydantic-settings httpx redis
 
       - name: Compile Python sources
-        run: python -m compileall backend engine iGPU
+        run: python -m compileall backend engine
 
       - name: Import FastAPI application
         working-directory: backend
diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
new file mode 100644
index 0000000..e025f34
--- /dev/null
+++ b/.github/workflows/docker-publish.yml
@@ -0,0 +1,82 @@
+# Builds the CPU Docker image for pushes to master, version tags and pull
+# requests targeting master. The image is pushed to ghcr.io for every event
+# except pull requests, which only verify that the build succeeds.
+name: Publish Docker image
+on:
+  push:
+    branches:
+      - master
+    tags:
+      - "v*.*.*"
+    paths-ignore:
+      - 'cuda/**'
+      - 'docs/**'
+      - '**.md'
+  pull_request:
+    branches:
+      - master
+    paths-ignore:
+      - 'cuda/**'
+      - 'docs/**'
+      - '**.md'
+
+env:
+  REGISTRY: ghcr.io
+
+jobs:
+  build-and-push:
+    name: Build & push to ghcr.io
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Set lowercase image name
+        id: image
+        # Image references must be lowercase. The repository name is read from
+        # the runner's GITHUB_REPOSITORY variable instead of being interpolated
+        # into the script text, and the output file path is quoted.
+        run: |
+          echo "name=${GITHUB_REPOSITORY,,}" >> "$GITHUB_OUTPUT"
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Log in to ghcr.io
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Extract Docker metadata
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ steps.image.outputs.name }}
+          # "latest" is only produced for the default branch; tag pushes get
+          # the semver tags and pull requests get a pr-<number> tag.
+          tags: |
+            type=raw,value=latest,enable={{is_default_branch}}
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=ref,event=pr
+      - name: Build and push Docker image (CPU)
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: ./Dockerfile
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          build-args: |
+            BASE_IMAGE=ubuntu:24.04
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+      - name: Image published
+        if: github.event_name != 'pull_request'
+        env:
+          # Newline-separated list of the references that were actually pushed,
+          # so the summary never advertises a tag this run did not publish.
+          IMAGE_TAGS: ${{ steps.meta.outputs.tags }}
+        run: |
+          echo "Image published to GitHub Packages"
+          echo ""
+          echo "Pull with:"
+          echo "$IMAGE_TAGS" | sed 's/^/  docker pull /'
+          echo ""
+          echo "Or via docker-compose:"
+          echo "$IMAGE_TAGS" | sed 's/^/  image: /'
diff --git a/.github/workflows/github-package.yml b/.github/workflows/github-package.yml
deleted file mode 100644
index cbcb92a..0000000
--- a/.github/workflows/github-package.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-name: Publish GitHub Package
-
-on:
-  workflow_dispatch:
-  push:
-    tags:
-      - "v*"
-
-permissions:
-  contents: read
-  packages: write
-
-jobs:
-  publish-github-package:
-    name: Publish to GitHub Packages
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Check out repository
-        uses: actions/checkout@v4
-
-      - name: Set up Node.js for GitHub Packages
-        uses: actions/setup-node@v4
-        with:
-          node-version: "20"
-          registry-url: "https://npm.pkg.github.com"
-          scope: "@eamon2009"
-          cache: "npm"
-          cache-dependency-path: frontend/package-lock.json
-
-      - name: Build frontend assets
-        run: |
-          npm --prefix frontend ci
-          npm --prefix frontend run build
-
-      - name: Prepare GitHub Packages metadata
-        run: |
-          npm pkg set name="@eamon2009/quadtrix"
-          npm pkg set publishConfig.registry="https://npm.pkg.github.com"
-
-      - name: Publish package
-        run: npm publish
-        env:
-          NODE_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
deleted file mode 100644
index c4cba46..0000000
--- a/.github/workflows/release.yml
+++ /dev/null
@@ -1,57 +0,0 @@
-name: Release
-
-on:
-  push:
-    tags:
-      - "v*"
-
-permissions:
-  contents: write
-
-jobs:
-  build-release:
-    name: Build release artifacts
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Check out repository
-        uses: actions/checkout@v4
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-
-      - name: Set up Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "20"
-          cache: "npm"
-          cache-dependency-path: frontend/package-lock.json
-
-      - name: Build C++ binary
-        run: g++ -std=c++17 -O2 -I. -Iinclude -o quadtrix main.cpp
-
-      - name: Compile Python sources
-        run: python -m compileall backend engine iGPU
-
-      - name: Build frontend
-        working-directory: frontend
-        run: |
-          npm ci
-          npm run build
-
-      - name: Package release files
-        run: |
-          tar \
-            --exclude='*.pt' \
-            --exclude='engine/logs' \
-            --exclude='__pycache__' \
-            -czf quadtrix-linux.tar.gz \
-            quadtrix README.md LICENSE run.md backend engine iGPU include src data config frontend/dist
-
-      - name: Create GitHub release
-        uses: softprops/action-gh-release@v2
-        with:
-          files: quadtrix-linux.tar.gz
-          generate_release_notes: true
diff --git a/.npmignore b/.npmignore
deleted file mode 100644
index 86f73e5..0000000
--- a/.npmignore
+++ /dev/null
@@ -1,25 +0,0 @@
-__pycache__/
-*.pyc
-*.pyo
-*.pt
-*.bin
-*.exe
-.git/
-.github/
-.npm-cache/
-.venv/
-.vscode/
-build/
-frontend/node_modules/
-frontend/src/
-frontend/public/
-frontend/package-lock.json
-frontend/package.json
-frontend/postcss.config.js
-frontend/tailwind.config.ts
-frontend/tsconfig.json
-frontend/vite.config.ts
-engine/logs/
-engine/fine-tune/
-engine/fineweb_30mb.txt
-engine/data/input.txt
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..33c2524
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,70 @@
+# Base image of the runtime stage. It is declared before the first FROM so it
+# lives in the global scope: a FROM line can only expand ARGs declared there,
+# so declaring it inside the builder stage made --build-arg BASE_IMAGE a no-op.
+ARG BASE_IMAGE=ubuntu:24.04
+
+# Build stage: compiles the C++ binary, builds the frontend bundle and
+# installs the backend's Python dependencies into /venv.
+FROM ubuntu:24.04 AS builder
+
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && apt-get install -y --no-install-recommends \
+      g++ \
+      python3 \
+      python3-pip \
+      python3-venv \
+      curl \
+      ca-certificates \
+      && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
+      && apt-get install -y --no-install-recommends nodejs \
+      && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /build
+COPY . .
+RUN g++ -std=c++17 -O2 -I. -Iinclude -o quadtrix main.cpp
+RUN cd frontend \
+      && npm ci \
+      && npm run build
+RUN python3 -m venv /venv \
+      && /venv/bin/pip install --upgrade pip --quiet \
+      && /venv/bin/pip install -r backend/requirements.txt --quiet
+
+# Runtime stage: the ":-" fallback still applies if BASE_IMAGE is passed empty.
+FROM ${BASE_IMAGE:-ubuntu:24.04} AS runtime
+
+LABEL org.opencontainers.image.title="Quadtrix.cpp"
+LABEL org.opencontainers.image.description="Local LLM with C++/PyTorch backends and React UI"
+LABEL org.opencontainers.image.source="https://github.com/Eamon2009/Quadtrix.cpp"
+LABEL org.opencontainers.image.version="1.1.0"
+LABEL org.opencontainers.image.licenses="MIT"
+
+# DEBIAN_FRONTEND is only needed while apt runs during the build. Declaring it
+# with ARG instead of ENV keeps it out of the environment of running containers.
+ARG DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1 \
+      PATH="/venv/bin:$PATH"
+
+# Runtime system packages
+RUN apt-get update && apt-get install -y --no-install-recommends \
+      python3 \
+      supervisor \
+      curl \
+      ca-certificates \
+      && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
+      && apt-get install -y --no-install-recommends nodejs \
+      && npm install -g serve --quiet \
+      && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+# Artifacts produced by the builder stage
+COPY --from=builder /venv              /venv
+COPY --from=builder /build/quadtrix    /app/quadtrix
+COPY --from=builder /build/frontend/dist /app/frontend/dist
+COPY --from=builder /build/backend     /app/backend
+COPY --from=builder /build/engine      /app/engine
+# Supervisor configuration and entrypoint script, taken from the build context
+COPY supervisord.conf       /etc/supervisor/conf.d/quadtrix.conf
+COPY docker-entrypoint.sh   /app/entrypoint.sh
+
+RUN chmod +x /app/entrypoint.sh /app/quadtrix \
+      && mkdir -p /var/log/supervisor /app/models
+# Model files are expected under /app/models (see the *_PATH defaults below)
+VOLUME ["/app/models"]
+ENV TORCH_CHECKPOINT_PATH=/app/models/best_model.pt \
+      GPT_MODEL_PATH=/app/models/best_model.bin \
+      API_PORT=3001 \
+      CORS_ORIGINS=http://localhost:8080 \
+      LOG_LEVEL=INFO \
+      MAX_SESSIONS=1000 \
+      SESSION_TTL_HOURS=24
+# 3001 matches API_PORT; 8080 matches the origin allowed by CORS_ORIGINS
+EXPOSE 3001 8080
+
+ENTRYPOINT ["/app/entrypoint.sh"]
\ No newline at end of file
diff --git a/README.md b/README.md
index f5faa8a..327479d 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,24 @@
 # Quadtrix.cpp
-<img width="2442" height="1586" alt="run_20260508_110726" src="https://github.com/user-attachments/assets/ef51d1c3-e28e-4674-8a71-5513e753b174" />
-
-Quadtrix.cpp is a local language model project with several execution paths:
-
-- A dependency-free C++17 transformer implementation with manual forward and backward passes.
-- A PyTorch training and inference path for faster experimentation on CPU, CUDA, or supported accelerator backends.
-- A FastAPI middleware layer for chat sessions, health checks, backend selection, and feedback.
-- A React + TypeScript frontend for local chat, settings, session history, and model status.
-- Optional package/CLI support through `bin/quadtrix.js`.
+---
+Quadtrix.cpp is a local large language model project built around a modular, multi-path architecture that allows users to choose the right execution strategy for their hardware and workflow. Whether you are working on a bare-metal embedded environment, running experiments on a GPU cluster, serving a REST API, or interacting through a browser-based chat interface, Quadtrix.cpp provides a coherent and composable foundation for each of those scenarios. The project is designed to be approachable for people who want to read and modify every layer of the stack, while remaining practical enough for people who simply want to spin up a working local model quickly.
+> For full technical reference, check the documentation — <a href="https://eamon2009.github.io/LLMs/" style="color:#1a73e8;text-decoration:underline;" target="_blank"> Docs</a>
+ 
 
 
 > [!IMPORTANT]
 > Please be aware that several commands listed in this documentation—specifically those involving file paths and directory navigation—should not be directly copied and pasted into your terminal. Because file structures and path syntax (such as / vs \) vary significantly across operating systems like Windows, macOS, and Linux, you must manually adjust these arguments to match your local environment. Ensure you verify your current working directory and replace any placeholder paths with the absolute or relative path specific to your machine to avoid execution errors.
 
+---
+##  Architecture
 
-The project is designed as a technical learning implementation. The C++ path exposes the transformer internals directly: tensor operations, attention, layer normalization, cross-entropy, analytical gradients, AdamW, checkpointing, and autoregressive generation.
+<img width="1016" height="684" alt="image" src="https://github.com/user-attachments/assets/0e9faad4-71a9-4c7f-80e9-1136dfea6e57" />
+The diagram shows how tokens enter at the bottom as raw IDs, get converted into vector embeddings with positional information added, then pass upward through a repeated stack of decoder blocks - each block applying masked attention followed by a feed-forward layer, with normalisation wrapping both. At the very top, a linear projection maps those representations to output logits across the vocabulary. The right-hand side zooms into the attention mechanism itself, showing how queries, keys, and values are linearly projected, fed into a scaled dot-product with an optional causal mask and softmax, then concatenated across all heads before being projected back out. The training flow panel on the far right shows this running as a five-step cycle per batch: data loading, forward pass, loss computation, backward pass for gradients, and a weight update. The bottom section confirms the behaviour through training loss, validation loss, and perplexity plots - all three curves descending and converging steadily as steps increase, indicating the model is learning as expected.
+ 
 
 ## v1.1.0 
 <img width="2185" height="829" alt="run_20260430_192930" src="https://github.com/user-attachments/assets/c6db061a-aa8d-4d8d-a1e2-1a81418bb613" />
+<img width="2442" height="1586" alt="run_20260508_110726" src="https://github.com/user-attachments/assets/ef51d1c3-e28e-4674-8a71-5513e753b174" />
+
 
 ---
 
diff --git a/backend/.env.example b/backend/.env.example
index 86a59a5..ba40a71 100644
--- a/backend/.env.example
+++ b/backend/.env.example
@@ -5,5 +5,5 @@ LOG_LEVEL=INFO
 MAX_SESSIONS=1000
 SESSION_TTL_HOURS=24
 CPP_SERVER_URL=http://localhost:8080
-TORCH_CHECKPOINT_PATH=../engine/best_model .pt
+TORCH_CHECKPOINT_PATH=../engine/best_model.pt
 REQUEST_TIMEOUT_SECONDS=60
diff --git a/backend/config.py b/backend/config.py
index 07ab4d9..9bfdbc0 100644
--- a/backend/config.py
+++ b/backend/config.py
@@ -13,7 +13,7 @@ class Settings(BaseSettings):
     max_sessions: int = Field(default=1000, alias="MAX_SESSIONS")
     session_ttl_hours: int = Field(default=24, alias="SESSION_TTL_HOURS")
     cpp_server_url: str = Field(default="http://localhost:8080", alias="CPP_SERVER_URL")
-    torch_checkpoint_path: str = Field(default="../engine/best_model .pt", alias="TORCH_CHECKPOINT_PATH")
+    torch_checkpoint_path: str = Field(default="../engine/best_model.pt", alias="TORCH_CHECKPOINT_PATH")
     request_timeout_seconds: float = Field(default=60.0, alias="REQUEST_TIMEOUT_SECONDS")
 
     model_config = SettingsConfigDict(env_file=".env", extra="ignore", populate_by_name=True)
diff --git a/bin/quadtrix.js b/bin/quadtrix.js
deleted file mode 100644
index 1dd4287..0000000
--- a/bin/quadtrix.js
+++ /dev/null
@@ -1,246 +0,0 @@
-#!/usr/bin/env node
-
-const { createServer } = require("node:http");
-const { createReadStream, existsSync, mkdirSync, statSync } = require("node:fs");
-const { extname, join, resolve } = require("node:path");
-const { spawn } = require("node:child_process");
-const { platform } = require("node:os");
-
-const packageRoot = resolve(__dirname, "..");
-const userRoot = process.cwd();
-const isWindows = platform() === "win32";
-const python = isWindows ? "python" : "python3";
-const cppBinary = join(userRoot, ".quadtrix", "bin", isWindows ? "quadtrix.exe" : "quadtrix");
-
-const mimeTypes = {
-  ".css": "text/css",
-  ".html": "text/html",
-  ".ico": "image/x-icon",
-  ".js": "text/javascript",
-  ".json": "application/json",
-  ".png": "image/png",
-  ".svg": "image/svg+xml",
-  ".webmanifest": "application/manifest+json",
-};
-
-function usage() {
-  console.log(`
-Quadtrix CLI
-
-Usage:
-  quadtrix chat [--api-port 3001] [--web-port 5173] [--no-open]
-  quadtrix train --backend cpp [--data data/input.txt]
-  quadtrix train --backend python
-  quadtrix setup
-
-Commands:
-  chat      Start the FastAPI backend, serve the built frontend, and open chat.
-  train     Train locally with either the C++ or Python implementation.
-  setup     Install Python backend/engine dependencies with pip.
-`);
-}
-
-function argValue(args, name, fallback) {
-  const index = args.indexOf(name);
-  if (index === -1 || index + 1 >= args.length) {
-    return fallback;
-  }
-  return args[index + 1];
-}
-
-function hasArg(args, name) {
-  return args.includes(name);
-}
-
-function run(command, args, options = {}) {
-  const child = spawn(command, args, {
-    cwd: options.cwd || userRoot,
-    env: { ...process.env, ...(options.env || {}) },
-    stdio: options.stdio || "inherit",
-    shell: false,
-  });
-
-  child.on("error", (error) => {
-    console.error(`Failed to start ${command}: ${error.message}`);
-  });
-
-  return child;
-}
-
-function openBrowser(url) {
-  if (isWindows) {
-    spawn("cmd", ["/c", "start", "", url], { detached: true, stdio: "ignore" }).unref();
-    return;
-  }
-
-  if (platform() === "darwin") {
-    spawn("open", [url], { detached: true, stdio: "ignore" }).unref();
-    return;
-  }
-
-  spawn("xdg-open", [url], { detached: true, stdio: "ignore" }).unref();
-}
-
-function serveStatic(directory, port) {
-  if (!existsSync(directory)) {
-    console.error(`Frontend build not found: ${directory}`);
-    console.error("Run `npm run build:frontend` before packing or publishing.");
-    process.exit(1);
-  }
-
-  const server = createServer((request, response) => {
-    const rawPath = decodeURIComponent((request.url || "/").split("?")[0]);
-    const safePath = rawPath.replace(/^\/+/, "");
-    let filePath = resolve(directory, safePath || "index.html");
-
-    if (!filePath.startsWith(resolve(directory))) {
-      response.writeHead(403);
-      response.end("Forbidden");
-      return;
-    }
-
-    if (!existsSync(filePath) || statSync(filePath).isDirectory()) {
-      filePath = join(directory, "index.html");
-    }
-
-    response.writeHead(200, {
-      "Content-Type": mimeTypes[extname(filePath)] || "application/octet-stream",
-    });
-    createReadStream(filePath).pipe(response);
-  });
-
-  server.listen(port, () => {
-    console.log(`Frontend: http://localhost:${port}`);
-  });
-
-  return server;
-}
-
-function startChat(args) {
-  const apiPort = argValue(args, "--api-port", "3001");
-  const webPort = argValue(args, "--web-port", "5173");
-  const frontendDist = join(packageRoot, "frontend", "dist");
-  const backendDir = join(packageRoot, "backend");
-  const url = `http://localhost:${webPort}`;
-
-  const api = run(python, ["-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", apiPort], {
-    cwd: backendDir,
-    env: {
-      API_PORT: apiPort,
-      CORS_ORIGINS: url,
-    },
-  });
-  const web = serveStatic(frontendDist, Number(webPort));
-
-  console.log("Starting Quadtrix chat...");
-  console.log(`Backend:  http://localhost:${apiPort}`);
-
-  if (!hasArg(args, "--no-open")) {
-    setTimeout(() => openBrowser(url), 1200);
-  }
-
-  const stop = () => {
-    web.close();
-    api.kill();
-    process.exit(0);
-  };
-
-  process.on("SIGINT", stop);
-  process.on("SIGTERM", stop);
-}
-
-function setup() {
-  const requirements = join(packageRoot, "backend", "requirements.txt");
-  const child = run(python, ["-m", "pip", "install", "-r", requirements]);
-  child.on("exit", (code) => process.exit(code || 0));
-}
-
-function compileCpp() {
-  mkdirSync(join(userRoot, ".quadtrix", "bin"), { recursive: true });
-  const child = run("g++", [
-    "-std=c++17",
-    "-O2",
-    "-I.",
-    "-Iinclude",
-    "-o",
-    cppBinary,
-    "main.cpp",
-  ], { cwd: packageRoot });
-
-  return new Promise((resolvePromise) => {
-    child.on("exit", (code) => resolvePromise(code || 0));
-  });
-}
-
-function resolveTrainingData(args) {
-  const requested = argValue(args, "--data", join(userRoot, "data", "input.txt"));
-  const data = resolve(userRoot, requested);
-
-  if (!existsSync(data)) {
-    console.error(`Training data not found: ${data}`);
-    console.error("Pass a text file with `--data ./path/to/input.txt`.");
-    process.exit(1);
-  }
-
-  return data;
-}
-
-async function train(args) {
-  const backend = argValue(args, "--backend", "cpp");
-
-  if (backend === "cpp") {
-    const data = resolveTrainingData(args);
-    const code = await compileCpp();
-    if (code !== 0) {
-      process.exit(code);
-    }
-    run(cppBinary, [data]).on("exit", (exitCode) => process.exit(exitCode || 0));
-    return;
-  }
-
-  if (backend === "python" || backend === "py") {
-    const data = resolveTrainingData(args);
-    const script = join(packageRoot, "engine", "main.py");
-    run(python, [script], {
-      cwd: join(packageRoot, "engine"),
-      env: { QUADTRIX_TRAIN_DATA: data },
-    }).on("exit", (exitCode) => {
-      process.exit(exitCode || 0);
-    });
-    return;
-  }
-
-  console.error(`Unknown backend: ${backend}`);
-  console.error("Use `--backend cpp` or `--backend python`.");
-  process.exit(1);
-}
-
-async function main() {
-  const [command, ...args] = process.argv.slice(2);
-
-  if (!command || command === "--help" || command === "-h") {
-    usage();
-    return;
-  }
-
-  if (command === "chat") {
-    startChat(args);
-    return;
-  }
-
-  if (command === "setup") {
-    setup();
-    return;
-  }
-
-  if (command === "train") {
-    await train(args);
-    return;
-  }
-
-  console.error(`Unknown command: ${command}`);
-  usage();
-  process.exit(1);
-}
-
-main();
diff --git a/config/config.h b/config/config.h
index 78b8185..db053cb 100644
--- a/config/config.h
+++ b/config/config.h
@@ -21,7 +21,7 @@ static const double TRAIN_SPLIT = 0.9; // 90 % train, 10 % val
 static const int BATCH_SIZE = 4;
 static const int BLOCK_SIZE = 64; // context length
 static const int MAX_ITERS = 3000;
-static const int EVAL_INTERVAL = 200;
+static const int EVAL_INTERVAL = 20;
 static const float LEARNING_RATE = 3e-4f;
 static const int EVAL_ITERS = 10;
 static const int N_EMBD = 128;
diff --git a/cuda/KERNAL/.gitkeep b/cuda/KERNAL/.gitkeep
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/cuda/KERNAL/.gitkeep
@@ -0,0 +1 @@
+
diff --git a/cuda/KERNAL/adamw.cu b/cuda/KERNAL/adamw.cu
new file mode 100644
index 0000000..5660add
--- /dev/null
+++ b/cuda/KERNAL/adamw.cu
@@ -0,0 +1,94 @@
+#include "../includes/adamw.cuh"
+
+#include "../includes/runtime.cuh"
+#include "../includes/utils.cuh"
+
+#include <cmath>
+
+namespace quadtrix {
+namespace cuda {
+namespace {
+
+// Returns true when `a` and `b` are both non-null, contiguous F32 tensors on the
+// same CUDA device and agree in rank, element count and every dimension extent.
+bool valid_same_shape_f32(const TensorView& a, const TensorView& b) {
+    const bool compatible =
+        a.data != nullptr && b.data != nullptr &&
+        a.device == DeviceKind::CUDA && b.device == DeviceKind::CUDA &&
+        a.device_id == b.device_id &&
+        a.dtype == DType::F32 && b.dtype == DType::F32 &&
+        a.shape.rank == b.shape.rank &&
+        a.shape.is_contiguous() && b.shape.is_contiguous() &&
+        a.numel() == b.numel();
+    if (!compatible) {
+        return false;
+    }
+
+    // Walk the dimensions until the first mismatch (or the end of the shape).
+    int axis = 0;
+    while (axis < a.shape.rank && a.shape.dims[axis] == b.shape.dims[axis]) {
+        ++axis;
+    }
+    return axis >= a.shape.rank;
+}
+
+// Element-wise AdamW step: each thread updates one parameter together with its
+// first- and second-moment estimates.
+//
+// `n` is the element count; threads whose index is >= n exit without touching
+// memory. `beta1_correction` / `beta2_correction` are the divisors applied to
+// the freshly updated moments (adamw_update passes 1 - beta^step for each).
+// `grad_scale` multiplies every gradient before it is used.
+__global__ void adamw_kernel(
+    float* __restrict__ params,
+    const float* __restrict__ grads,
+    float* __restrict__ first_moment,
+    float* __restrict__ second_moment,
+    std::size_t n,
+    float learning_rate,
+    float beta1,
+    float beta2,
+    float beta1_correction,
+    float beta2_correction,
+    float epsilon,
+    float weight_decay,
+    float grad_scale) {
+    // Widen before multiplying so the flat index is computed in size_t.
+    const std::size_t idx = static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    if (idx >= n) {
+        return;
+    }
+
+    const float param = params[idx];
+    const float grad = grads[idx] * grad_scale;
+    // Exponential moving averages of the gradient and of its square.
+    const float m = beta1 * first_moment[idx] + (1.0f - beta1) * grad;
+    const float v = beta2 * second_moment[idx] + (1.0f - beta2) * grad * grad;
+    const float m_hat = m / beta1_correction;
+    const float v_hat = v / beta2_correction;
+    first_moment[idx] = m;
+    second_moment[idx] = v;
+    // The weight-decay term is added to the step directly (scaled by
+    // learning_rate) rather than being folded into `grad`.
+    params[idx] = param - learning_rate * (m_hat / (sqrtf(v_hat) + epsilon) + weight_decay * param);
+}
+
+}  // namespace
+
+// Launches one AdamW step over `params` on `stream`.
+//
+// All four tensors must pass valid_same_shape_f32 against `params`. The config
+// must have step > 0, beta1 and beta2 in [0, 1) and epsilon > 0; otherwise a
+// cudaErrorInvalidValue failure is returned and nothing is launched. The
+// returned Status reflects cudaGetLastError() right after the launch.
+Status adamw_update(
+    TensorView params,
+    const TensorView& grads,
+    TensorView first_moment,
+    TensorView second_moment,
+    const AdamWConfig& config,
+    float grad_scale,
+    cudaStream_t stream) {
+    const bool bad_tensors = !valid_same_shape_f32(params, grads) ||
+                             !valid_same_shape_f32(params, first_moment) ||
+                             !valid_same_shape_f32(params, second_moment);
+    if (bad_tensors) {
+        return Status::failure(cudaErrorInvalidValue, "invalid adamw_update arguments");
+    }
+
+    const bool bad_config = config.step <= 0 ||
+                            config.beta1 < 0.0f || config.beta1 >= 1.0f ||
+                            config.beta2 < 0.0f || config.beta2 >= 1.0f ||
+                            config.epsilon <= 0.0f;
+    if (bad_config) {
+        return Status::failure(cudaErrorInvalidValue, "invalid adamw_update arguments");
+    }
+
+    // Bias-correction divisors depend only on the step, so compute them once on the host.
+    const float step = static_cast<float>(config.step);
+    const float beta1_correction = 1.0f - powf(config.beta1, step);
+    const float beta2_correction = 1.0f - powf(config.beta2, step);
+
+    DeviceGuard guard(params.device_id);
+    const auto count = params.numel();
+    float* const param_ptr = params.data_as<float>();
+    const float* const grad_ptr = grads.data_as<const float>();
+    float* const m_ptr = first_moment.data_as<float>();
+    float* const v_ptr = second_moment.data_as<float>();
+    adamw_kernel<<<one_dim_grid(count), kDefaultBlockSize, 0, stream>>>(
+        param_ptr, grad_ptr, m_ptr, v_ptr, count,
+        config.learning_rate, config.beta1, config.beta2,
+        beta1_correction, beta2_correction,
+        config.epsilon, config.weight_decay, grad_scale);
+    return QUADTRIX_CUDA_CHECK(cudaGetLastError());
+}
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/cuda/KERNAL/attention_forward.cu b/cuda/KERNAL/attention_forward.cu
new file mode 100644
index 0000000..cd5d7fe
--- /dev/null
+++ b/cuda/KERNAL/attention_forward.cu
@@ -0,0 +1,177 @@
+#include "../includes/attention.cuh"
+
+#include "../includes/runtime.cuh"
+#include "../includes/utils.cuh"
+
+#include <cmath>
+#include <limits>
+
+namespace quadtrix {
+namespace cuda {
+namespace {
+
+constexpr int kAttentionBlockSize = 256;
+
+// True when `value` is strictly positive and representable as `int`.
+bool fits_int(std::int64_t value) {
+    constexpr std::int64_t kIntMax = std::numeric_limits<int>::max();
+    if (value <= 0) {
+        return false;
+    }
+    return value <= kIntMax;
+}
+
+// A tensor is accepted only if it is a non-null, contiguous F32 buffer on a CUDA device.
+bool valid_f32_cuda(const TensorView& tensor) {
+    if (tensor.data == nullptr) {
+        return false;
+    }
+    if (tensor.device != DeviceKind::CUDA || tensor.dtype != DType::F32) {
+        return false;
+    }
+    return tensor.shape.is_contiguous();
+}
+
+// Block-wide sum of `value`. Every thread of the block must call this, because
+// it contains __syncthreads() barriers.
+//
+// `shared` is scratch space with room for one float per warp. On return it may
+// be reused immediately, e.g. by another reduction over the same buffer.
+__device__ float block_sum(float value, float* shared) {
+    const int lane = threadIdx.x & (kWarpSize - 1);
+    const int warp = threadIdx.x / kWarpSize;
+    const int warp_count = (blockDim.x + kWarpSize - 1) / kWarpSize;
+
+    // Stage 1: each warp reduces its own values and publishes one partial sum.
+    value = warp_sum(value);
+    if (lane == 0) {
+        shared[warp] = value;
+    }
+    __syncthreads();
+
+    // Stage 2: the first threads pick up the per-warp partials, warp 0 reduces
+    // them, and thread 0 publishes the block total in slot 0.
+    value = threadIdx.x < warp_count ? shared[lane] : 0.0f;
+    if (warp == 0) {
+        value = warp_sum(value);
+    }
+    if (threadIdx.x == 0) {
+        shared[0] = value;
+    }
+    __syncthreads();
+
+    // Copy the result into a register and synchronise once more before
+    // returning. Without this barrier a fast warp could enter the next
+    // reduction and overwrite shared[0] while slower warps are still reading it.
+    const float total = shared[0];
+    __syncthreads();
+    return total;
+}
+
+// Block-wide maximum of `value`. Every thread of the block must call this,
+// because it contains __syncthreads() barriers.
+//
+// `shared` is scratch space with room for one float per warp. On return it may
+// be reused immediately, e.g. by a following block_sum over the same buffer.
+__device__ float block_max(float value, float* shared) {
+    const int lane = threadIdx.x & (kWarpSize - 1);
+    const int warp = threadIdx.x / kWarpSize;
+    const int warp_count = (blockDim.x + kWarpSize - 1) / kWarpSize;
+
+    // Stage 1: each warp reduces its own values and publishes one partial maximum.
+    value = warp_max(value);
+    if (lane == 0) {
+        shared[warp] = value;
+    }
+    __syncthreads();
+
+    // Stage 2: the first threads pick up the per-warp partials (-INFINITY is the
+    // neutral element for the rest), warp 0 reduces them, and thread 0
+    // publishes the block maximum in slot 0.
+    value = threadIdx.x < warp_count ? shared[lane] : -INFINITY;
+    if (warp == 0) {
+        value = warp_max(value);
+    }
+    if (threadIdx.x == 0) {
+        shared[0] = value;
+    }
+    __syncthreads();
+
+    // Copy the result into a register and synchronise once more before
+    // returning. Without this barrier a fast warp could start the next
+    // reduction and overwrite shared[0] while slower warps are still reading it.
+    const float block_result = shared[0];
+    __syncthreads();
+    return block_result;
+}
+
+// Causal self-attention forward pass; one thread block handles one
+// (batch, time step, head) row.
+//
+// Layouts, as implied by the indexing below and checked by attention_forward:
+//   input   [batch, time, 3 * channels]  query at offset 0, key at `channels`,
+//                                        value at 2 * `channels`; head h owns the
+//                                        slice [h * head_size, (h + 1) * head_size)
+//   preatt  [batch, num_heads, time, time]  scaled dot products
+//   att     [batch, num_heads, time, time]  softmax of preatt over positions <= t
+//   output  [batch, time, channels]
+// Entries for positions after t are written as 0 in both preatt and att.
+//
+// Dynamic shared memory must provide one float per warp for the block reductions.
+// All buffer offsets are computed in std::size_t: products such as
+// batch * num_heads * time * time exceed the range of a 32-bit int for large
+// batch or sequence sizes.
+__global__ void attention_forward_kernel(
+    const float* __restrict__ input,
+    float* __restrict__ preatt,
+    float* __restrict__ att,
+    float* __restrict__ output,
+    int total_rows,
+    int time,
+    int channels,
+    int num_heads,
+    int head_size) {
+    extern __shared__ float shared[];
+    const int row_id = blockIdx.x;
+    if (row_id >= total_rows) {
+        return;
+    }
+
+    // Decompose the row id into (batch b, time step t, head h); h varies fastest.
+    const int h = row_id % num_heads;
+    const int t = (row_id / num_heads) % time;
+    const int b = row_id / (num_heads * time);
+    const float scale = rsqrtf(static_cast<float>(head_size));
+
+    const std::size_t c3 = 3 * static_cast<std::size_t>(channels);
+    const std::size_t head_offset = static_cast<std::size_t>(h) * head_size;
+    const std::size_t batch_base = static_cast<std::size_t>(b) * time * c3;
+    const std::size_t score_row = ((static_cast<std::size_t>(b) * num_heads + h) * time + t) * time;
+
+    const float* __restrict__ query = input + batch_base + t * c3 + head_offset;
+    float* __restrict__ preatt_row = preatt + score_row;
+    float* __restrict__ att_row = att + score_row;
+
+    // Pass 1: scaled query-key dot products for positions 0..t, tracking the maximum.
+    float local_max = -INFINITY;
+    for (int t2 = threadIdx.x; t2 <= t; t2 += blockDim.x) {
+        const float* __restrict__ key = input + batch_base + t2 * c3 + channels + head_offset;
+        float score = 0.0f;
+        for (int i = 0; i < head_size; ++i) {
+            score += query[i] * key[i];
+        }
+        score *= scale;
+        preatt_row[t2] = score;
+        local_max = fmaxf(local_max, score);
+    }
+    const float max_val = block_max(local_max, shared);
+
+    // Pass 2: exponentiate relative to the maximum and accumulate the normaliser.
+    // Each thread revisits exactly the positions it wrote in pass 1.
+    float local_sum = 0.0f;
+    for (int t2 = threadIdx.x; t2 <= t; t2 += blockDim.x) {
+        const float value = expf(preatt_row[t2] - max_val);
+        att_row[t2] = value;
+        local_sum += value;
+    }
+    const float sum = block_sum(local_sum, shared);
+    const float inv_sum = sum == 0.0f ? 0.0f : 1.0f / sum;
+
+    // Pass 3: normalise the visible positions and zero the masked ones.
+    for (int t2 = threadIdx.x; t2 < time; t2 += blockDim.x) {
+        att_row[t2] = t2 <= t ? att_row[t2] * inv_sum : 0.0f;
+        if (t2 > t) {
+            preatt_row[t2] = 0.0f;
+        }
+    }
+    // The weighted sum below reads att_row entries written by other threads.
+    __syncthreads();
+
+    // Pass 4: output = attention-weighted sum of the value vectors.
+    float* __restrict__ out = output + (static_cast<std::size_t>(b) * time + t) * channels + head_offset;
+    for (int i = threadIdx.x; i < head_size; i += blockDim.x) {
+        float value = 0.0f;
+        for (int t2 = 0; t2 <= t; ++t2) {
+            const float* __restrict__ v =
+                input + batch_base + t2 * c3 + 2 * static_cast<std::size_t>(channels) + head_offset;
+            value += att_row[t2] * v[i];
+        }
+        out[i] = value;
+    }
+}
+
+}  // namespace
+
+// Validates the tensors and launches attention_forward_kernel on `stream`.
+//
+// Expected shapes (all contiguous F32 on the same CUDA device):
+//   input_qkv [batch, time, 3 * channels]
+//   preatt    [batch, num_heads, time, time]
+//   att       same shape as preatt
+//   output    [batch, time, channels]
+// `channels` must be divisible by `num_heads`. Any violation returns a
+// cudaErrorInvalidValue failure without launching. Otherwise the returned
+// Status reflects cudaGetLastError() right after the launch.
+Status attention_forward(
+    const TensorView& input_qkv,
+    TensorView preatt,
+    TensorView att,
+    TensorView output,
+    int num_heads,
+    cudaStream_t stream) {
+    if (!valid_f32_cuda(input_qkv) || !valid_f32_cuda(preatt) || !valid_f32_cuda(att) || !valid_f32_cuda(output) ||
+        input_qkv.shape.rank != 3 || preatt.shape.rank != 4 || att.shape.rank != 4 || output.shape.rank != 3 ||
+        input_qkv.device_id != preatt.device_id || input_qkv.device_id != att.device_id ||
+        input_qkv.device_id != output.device_id) {
+        return Status::failure(cudaErrorInvalidValue, "invalid attention_forward tensors");
+    }
+
+    const std::int64_t batch = input_qkv.shape.dims[0];
+    const std::int64_t time = input_qkv.shape.dims[1];
+    const std::int64_t channels3 = input_qkv.shape.dims[2];
+    if (num_heads <= 0 || channels3 % 3 != 0) {
+        return Status::failure(cudaErrorInvalidValue, "invalid attention_forward qkv shape");
+    }
+    const std::int64_t channels = channels3 / 3;
+    // NOTE(review): `att.shape.dims != preatt.shape.dims` compares the dims
+    // containers as a whole - confirm that this is an element-wise comparison
+    // for the type declared in the tensor header.
+    if (channels % num_heads != 0 || output.shape.dims[0] != batch || output.shape.dims[1] != time ||
+        output.shape.dims[2] != channels || preatt.shape.dims[0] != batch || preatt.shape.dims[1] != num_heads ||
+        preatt.shape.dims[2] != time || preatt.shape.dims[3] != time || att.shape.dims != preatt.shape.dims ||
+        !fits_int(batch) || !fits_int(time) || !fits_int(channels)) {
+        return Status::failure(cudaErrorInvalidValue, "invalid attention_forward shape");
+    }
+
+    // The kernel receives the row count and the packed channel width as `int`,
+    // and the row count is also the grid size. Each factor is already known to
+    // fit in int, so the products are checked step by step to stay inside
+    // 64-bit range while testing them.
+    if (!fits_int(channels3) || !fits_int(batch * time) || !fits_int(batch * time * num_heads)) {
+        return Status::failure(cudaErrorInvalidValue, "invalid attention_forward shape");
+    }
+
+    DeviceGuard guard(input_qkv.device_id);
+    const int rows = static_cast<int>(batch * time * num_heads);
+    // One float of scratch space per warp for the block-wide reductions.
+    const std::size_t shared_bytes = ((kAttentionBlockSize + kWarpSize - 1) / kWarpSize) * sizeof(float);
+    attention_forward_kernel<<<rows, kAttentionBlockSize, shared_bytes, stream>>>(
+        input_qkv.data_as<const float>(),
+        preatt.data_as<float>(),
+        att.data_as<float>(),
+        output.data_as<float>(),
+        rows,
+        static_cast<int>(time),
+        static_cast<int>(channels),
+        num_heads,
+        static_cast<int>(channels / num_heads));
+    return QUADTRIX_CUDA_CHECK(cudaGetLastError());
+}
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/cuda/KERNAL/matmul_forward.cu b/cuda/KERNAL/matmul_forward.cu
new file mode 100644
index 0000000..b2a8cf3
--- /dev/null
+++ b/cuda/KERNAL/matmul_forward.cu
@@ -0,0 +1,242 @@
+#include "../includes/matmul.cuh"
+
+#include "../includes/runtime.cuh"
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+#include <cstdlib>
+#include <cstdio>
+#include <limits>
+
+namespace quadtrix {
+namespace cuda {
+namespace {
+
+// Message returned when an operand fails the basic rank/contiguity/device checks.
+const char* const kInvalidMatmul = "invalid matmul arguments";
+
+// Maps the project transpose flag onto the matching cuBLAS operation enum.
+cublasOperation_t to_cublas_op(MatmulTranspose op) {
+    return op == MatmulTranspose::Transpose ? CUBLAS_OP_T : CUBLAS_OP_N;
+}
+
+// True when `value` is strictly positive and representable as `int`.
+// Zero and negative extents are rejected as well as oversized ones.
+bool fits_int(std::int64_t value) {
+    return value > 0 && value <= std::numeric_limits<int>::max();
+}
+
+// True when `tensor` has a non-null data pointer, lives on a CUDA device,
+// has rank 2 and reports itself as contiguous.
+bool is_rank2_contiguous_cuda(const TensorView& tensor) {
+    return tensor.data != nullptr && tensor.device == DeviceKind::CUDA && tensor.shape.rank == 2 &&
+           tensor.shape.is_contiguous();
+}
+
+// Row count of a rank-2 tensor after `op` is applied (dims are swapped for Transpose).
+std::int64_t rows_after_op(const TensorView& tensor, MatmulTranspose op) {
+    return op == MatmulTranspose::Transpose ? tensor.shape.dims[1] : tensor.shape.dims[0];
+}
+
+// Column count of a rank-2 tensor after `op` is applied (dims are swapped for Transpose).
+std::int64_t cols_after_op(const TensorView& tensor, MatmulTranspose op) {
+    return op == MatmulTranspose::Transpose ? tensor.shape.dims[0] : tensor.shape.dims[1];
+}
+
+// Storage type passed to cublasGemmEx for each element dtype.
+// I32 and U8 have no mapping here and fall through to CUDA_R_32F; callers are
+// expected to reject them first via supports_gemm_dtype().
+cudaDataType_t to_cuda_data_type(DType dtype) {
+    switch (dtype) {
+        case DType::F32:
+            return CUDA_R_32F;
+        case DType::F16:
+            return CUDA_R_16F;
+        case DType::BF16:
+            return CUDA_R_16BF;
+        case DType::I32:
+        case DType::U8:
+            break;
+    }
+    return CUDA_R_32F;
+}
+
+// Accumulation type passed to cublasGemmEx: always CUBLAS_COMPUTE_32F.
+// The CUBLAS_COMPUTE_32F_FAST_* modes are documented for FP32 A/B/C operands
+// only, so combining FAST_16F with CUDA_R_16F or CUDA_R_16BF storage makes
+// cublasGemmEx report CUBLAS_STATUS_NOT_SUPPORTED. Plain 32F accumulation is a
+// documented combination for F32, F16 and BF16 storage alike, and matches the
+// float alpha/beta scalars that matmul() passes.
+cublasComputeType_t compute_type_for(DType /*dtype*/) {
+    return CUBLAS_COMPUTE_32F;
+}
+
+// GEMM algorithm selector: the default heuristic for F32, the tensor-op
+// default for every other dtype.
+cublasGemmAlgo_t gemm_algo_for(DType dtype) {
+    return dtype == DType::F32 ? CUBLAS_GEMM_DEFAULT : CUBLAS_GEMM_DEFAULT_TENSOR_OP;
+}
+
+// Element types matmul() accepts: F32, F16 and BF16.
+bool supports_gemm_dtype(DType dtype) {
+    return dtype == DType::F32 || dtype == DType::F16 || dtype == DType::BF16;
+}
+
+}  // namespace
+
+// Returns the enumerator name of a cuBLAS status code as a static string,
+// or "CUBLAS_STATUS_UNKNOWN" for values not listed below.
+const char* cublas_status_name(cublasStatus_t status) {
+    switch (status) {
+        case CUBLAS_STATUS_SUCCESS:
+            return "CUBLAS_STATUS_SUCCESS";
+        case CUBLAS_STATUS_NOT_INITIALIZED:
+            return "CUBLAS_STATUS_NOT_INITIALIZED";
+        case CUBLAS_STATUS_ALLOC_FAILED:
+            return "CUBLAS_STATUS_ALLOC_FAILED";
+        case CUBLAS_STATUS_INVALID_VALUE:
+            return "CUBLAS_STATUS_INVALID_VALUE";
+        case CUBLAS_STATUS_ARCH_MISMATCH:
+            return "CUBLAS_STATUS_ARCH_MISMATCH";
+        case CUBLAS_STATUS_MAPPING_ERROR:
+            return "CUBLAS_STATUS_MAPPING_ERROR";
+        case CUBLAS_STATUS_EXECUTION_FAILED:
+            return "CUBLAS_STATUS_EXECUTION_FAILED";
+        case CUBLAS_STATUS_INTERNAL_ERROR:
+            return "CUBLAS_STATUS_INTERNAL_ERROR";
+        case CUBLAS_STATUS_NOT_SUPPORTED:
+            return "CUBLAS_STATUS_NOT_SUPPORTED";
+        // CUBLAS_STATUS_LICENSE_ERROR is an enumerator of cublasStatus_t, not a
+        // preprocessor macro, so an #ifdef guard around this case would always
+        // be false and the name would never be reported.
+        case CUBLAS_STATUS_LICENSE_ERROR:
+            return "CUBLAS_STATUS_LICENSE_ERROR";
+    }
+    return "CUBLAS_STATUS_UNKNOWN";
+}
+
+// Creates a cuBLAS handle for `device_id` and switches it to tensor-op math.
+// A failure of either cuBLAS call is treated as fatal: the status name is
+// printed to stderr and the process aborts.
+BlasHandle::BlasHandle(int device_id) : device_id_(device_id) {
+    DeviceGuard guard(device_id_);
+
+    const cublasStatus_t create_status = cublasCreate(&handle_);
+    if (create_status != CUBLAS_STATUS_SUCCESS) {
+        std::fprintf(stderr, "Fatal cuBLAS error: cublasCreate failed with %s\n", cublas_status_name(create_status));
+        std::abort();
+    }
+
+    const cublasStatus_t math_status = cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH);
+    if (math_status != CUBLAS_STATUS_SUCCESS) {
+        std::fprintf(stderr, "Fatal cuBLAS error: cublasSetMathMode failed with %s\n", cublas_status_name(math_status));
+        std::abort();
+    }
+}
+
+// Destroys the owned cuBLAS handle, if any, under a DeviceGuard for the
+// handle's device. A moved-from object (null handle) does nothing.
+BlasHandle::~BlasHandle() {
+    if (handle_ == nullptr) {
+        return;
+    }
+    DeviceGuard guard(device_id_);
+    cublasDestroy(handle_);
+}
+
+// Move constructor: takes over `other`'s handle and device id, then nulls
+// `other`'s handle so that its destructor does not destroy it.
+BlasHandle::BlasHandle(BlasHandle&& other) noexcept
+    : handle_{other.handle_}, device_id_{other.device_id_} {
+    other.handle_ = nullptr;
+}
+
+// Move assignment: destroys any handle currently owned, then takes over
+// `other`'s handle and device id. Self-assignment is a no-op.
+BlasHandle& BlasHandle::operator=(BlasHandle&& other) noexcept {
+    if (this != &other) {
+        // Release the handle we currently own, on the device recorded for it.
+        if (handle_ != nullptr) {
+            DeviceGuard guard(device_id_);
+            cublasDestroy(handle_);
+        }
+
+        handle_ = other.handle_;
+        device_id_ = other.device_id_;
+        other.handle_ = nullptr;
+    }
+    return *this;
+}
+
+// Binds `stream` to the cuBLAS handle so later calls on it are issued there.
+// Returns a failure carrying the cuBLAS status if cublasSetStream rejects it.
+BlasStatus BlasHandle::set_stream(cudaStream_t stream) {
+    const cublasStatus_t bind_status = cublasSetStream(handle_, stream);
+    if (bind_status == CUBLAS_STATUS_SUCCESS) {
+        return BlasStatus::success();
+    }
+    return BlasStatus::failure(bind_status, "cublasSetStream failed");
+}
+
+// General matrix multiply: C = alpha * op_a(A) * op_b(B) + beta * C via cublasGemmEx.
+//
+// All three tensors must be rank-2, contiguous CUDA tensors of the same
+// supported dtype (F32/F16/BF16) on the same device as `handle`. Every extent
+// must be positive and fit in `int`. The GEMM is issued on `stream`, which is
+// first bound to the handle.
+//
+// Returns BlasStatus::success(), or a failure describing the first check or
+// cuBLAS call that did not pass.
+BlasStatus matmul(
+    BlasHandle& handle,
+    const TensorView& a,
+    MatmulTranspose op_a,
+    const TensorView& b,
+    MatmulTranspose op_b,
+    TensorView c,
+    float alpha,
+    float beta,
+    cudaStream_t stream) {
+    const bool operands_ok =
+        is_rank2_contiguous_cuda(a) && is_rank2_contiguous_cuda(b) && is_rank2_contiguous_cuda(c);
+    if (!operands_ok) {
+        return BlasStatus::failure(CUBLAS_STATUS_INVALID_VALUE, kInvalidMatmul);
+    }
+
+    const bool dtype_ok = a.dtype == b.dtype && a.dtype == c.dtype && supports_gemm_dtype(a.dtype);
+    if (!dtype_ok) {
+        return BlasStatus::failure(CUBLAS_STATUS_NOT_SUPPORTED, "matmul dtype is unsupported or mismatched");
+    }
+
+    const bool same_device =
+        a.device_id == b.device_id && a.device_id == c.device_id && a.device_id == handle.device_id();
+    if (!same_device) {
+        return BlasStatus::failure(CUBLAS_STATUS_INVALID_VALUE, "matmul tensors and handle must share a device");
+    }
+
+    // Logical GEMM extents after the requested transposes.
+    const std::int64_t out_rows = rows_after_op(a, op_a);
+    const std::int64_t inner = cols_after_op(a, op_a);
+    const std::int64_t inner_b = rows_after_op(b, op_b);
+    const std::int64_t out_cols = cols_after_op(b, op_b);
+
+    if (inner != inner_b || c.shape.dims[0] != out_rows || c.shape.dims[1] != out_cols) {
+        return BlasStatus::failure(CUBLAS_STATUS_INVALID_VALUE, "matmul shape mismatch");
+    }
+
+    // Leading dimensions are the stored second extent of each tensor.
+    const std::int64_t a_ld = a.shape.dims[1];
+    const std::int64_t b_ld = b.shape.dims[1];
+    const std::int64_t c_ld = c.shape.dims[1];
+    if (!fits_int(out_rows) || !fits_int(out_cols) || !fits_int(inner) || !fits_int(a_ld) ||
+        !fits_int(b_ld) || !fits_int(c_ld)) {
+        return BlasStatus::failure(CUBLAS_STATUS_INVALID_VALUE, "matmul dimensions exceed cuBLAS int range");
+    }
+
+    DeviceGuard guard(handle.device_id());
+    BlasStatus bind_result = handle.set_stream(stream);
+    if (!bind_result.ok) {
+        return bind_result;
+    }
+
+    const cudaDataType_t storage = to_cuda_data_type(a.dtype);
+
+    // Operands are passed in swapped order (B before A, n before m).
+    // NOTE(review): presumably the usual way to drive column-major cuBLAS with
+    // row-major buffers (computing C^T = op(B)^T * op(A)^T) - confirm that
+    // TensorView contiguity means row-major.
+    const cublasStatus_t gemm_status = cublasGemmEx(
+        handle.get(),
+        to_cublas_op(op_b),
+        to_cublas_op(op_a),
+        static_cast<int>(out_cols),
+        static_cast<int>(out_rows),
+        static_cast<int>(inner),
+        &alpha,
+        b.data,
+        storage,
+        static_cast<int>(b_ld),
+        a.data,
+        storage,
+        static_cast<int>(a_ld),
+        &beta,
+        c.data,
+        storage,
+        static_cast<int>(c_ld),
+        compute_type_for(a.dtype),
+        gemm_algo_for(a.dtype));
+
+    if (gemm_status == CUBLAS_STATUS_SUCCESS) {
+        return BlasStatus::success();
+    }
+    return BlasStatus::failure(gemm_status, "cublasGemmEx failed");
+}
+
+// Forward linear product: output = alpha * input * weight + beta * output,
+// with neither operand transposed. Thin wrapper over matmul(); note that the
+// stream parameter precedes alpha/beta here, unlike in matmul().
+BlasStatus matmul_forward(
+    BlasHandle& handle,
+    const TensorView& input,
+    const TensorView& weight,
+    TensorView output,
+    cudaStream_t stream,
+    float alpha,
+    float beta) {
+    const MatmulTranspose no_transpose = MatmulTranspose::None;
+    return matmul(handle, input, no_transpose, weight, no_transpose, output, alpha, beta, stream);
+}
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/cuda/KERNAL/nccl_all_reduce.cu b/cuda/KERNAL/nccl_all_reduce.cu
new file mode 100644
index 0000000..6f48b01
--- /dev/null
+++ b/cuda/KERNAL/nccl_all_reduce.cu
@@ -0,0 +1,184 @@
+#include "../includes/nccl_all_reduce.cuh"
+
+#include "../includes/runtime.cuh"
+#include "../includes/utils.cuh"
+
+#include <cstdio>
+#include <utility>
+
+namespace quadtrix {
+namespace cuda {
+namespace {
+
+#ifdef QUADTRIX_ENABLE_NCCL
+// Maps a project dtype to the NCCL element type. Dtypes without an explicit
+// case (e.g. BF16) fall back to ncclFloat32; supports_nccl_dtype() is used to
+// reject them before this mapping is consulted.
+ncclDataType_t to_nccl_dtype(DType dtype) {
+    switch (dtype) {
+        case DType::F32:
+            return ncclFloat32;
+        case DType::F16:
+            return ncclFloat16;
+        case DType::I32:
+            return ncclInt32;
+        case DType::U8:
+            return ncclUint8;
+        default:
+            return ncclFloat32;
+    }
+}
+
+// Dtypes this build can all-reduce: F32, F16, I32 and U8.
+bool supports_nccl_dtype(DType dtype) {
+    switch (dtype) {
+        case DType::F32:
+        case DType::F16:
+        case DType::I32:
+        case DType::U8:
+            return true;
+        default:
+            return false;
+    }
+}
+#else
+// Without NCCL compiled in, no dtype is reducible.
+bool supports_nccl_dtype(DType) {
+    return false;
+}
+#endif
+
+// True when `tensor` is a non-null, contiguous, non-empty CUDA tensor whose
+// dtype supports_nccl_dtype() accepts.
+bool valid_reduce_tensor(const TensorView& tensor) {
+    if (tensor.data == nullptr || tensor.device != DeviceKind::CUDA) {
+        return false;
+    }
+    if (!tensor.shape.is_contiguous()) {
+        return false;
+    }
+    return tensor.numel() > 0 && supports_nccl_dtype(tensor.dtype);
+}
+
+// Multiplies the first `n` floats of `values` by `scale` in place, one
+// element per thread; threads whose index is >= n do nothing.
+__global__ void scale_kernel(float* values, std::size_t n, float scale) {
+    const std::size_t i = static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    if (i >= n) {
+        return;
+    }
+    values[i] = values[i] * scale;
+}
+
+}  // namespace
+
+// Human-readable text for an NCCL result code. With NCCL enabled this defers
+// to ncclGetErrorString(); otherwise it names the enumerators visible to this
+// build and returns "ncclUnknown" for anything else.
+const char* nccl_status_name(ncclResult_t status) {
+#ifdef QUADTRIX_ENABLE_NCCL
+    return ncclGetErrorString(status);
+#else
+    if (status == ncclSuccess) {
+        return "ncclSuccess";
+    }
+    if (status == ncclUnhandledCudaError) {
+        return "ncclUnhandledCudaError";
+    }
+    if (status == ncclSystemError) {
+        return "ncclSystemError";
+    }
+    if (status == ncclInternalError) {
+        return "ncclInternalError";
+    }
+    if (status == ncclInvalidArgument) {
+        return "ncclInvalidArgument";
+    }
+    if (status == ncclInvalidUsage) {
+        return "ncclInvalidUsage";
+    }
+    if (status == ncclNumResults) {
+        return "ncclNumResults";
+    }
+    return "ncclUnknown";
+#endif
+}
+
+// Joins the NCCL communicator identified by `unique_id` as `rank` of
+// `world_size`, under a DeviceGuard for `device_id`. Failure is fatal: the
+// error is printed to stderr and the process aborts. In builds without
+// QUADTRIX_ENABLE_NCCL, constructing a communicator always aborts.
+NcclCommunicator::NcclCommunicator(ncclUniqueId unique_id, int world_size, int rank, int device_id)
+    : world_size_(world_size), rank_(rank), device_id_(device_id) {
+    DeviceGuard guard(device_id_);
+#ifdef QUADTRIX_ENABLE_NCCL
+    const ncclResult_t init_status = ncclCommInitRank(&comm_, world_size_, unique_id, rank_);
+    if (init_status != ncclSuccess) {
+        std::fprintf(stderr, "Fatal NCCL error: ncclCommInitRank failed with %s\n", nccl_status_name(init_status));
+        std::abort();
+    }
+#else
+    std::fprintf(stderr, "Fatal NCCL error: build with QUADTRIX_ENABLE_NCCL and link NCCL to use NcclCommunicator\n");
+    std::abort();
+#endif
+}
+
+// Fills `*unique_id` with a fresh NCCL unique id.
+// Returns a failure when `unique_id` is null, when ncclGetUniqueId fails, or
+// when the build was compiled without QUADTRIX_ENABLE_NCCL.
+NcclStatus create_unique_id(ncclUniqueId* unique_id) {
+    if (unique_id == nullptr) {
+        return NcclStatus::failure(ncclInvalidArgument, "unique_id must not be null");
+    }
+#ifdef QUADTRIX_ENABLE_NCCL
+    const ncclResult_t id_status = ncclGetUniqueId(unique_id);
+    if (id_status == ncclSuccess) {
+        return NcclStatus::success();
+    }
+    return NcclStatus::failure(id_status, "ncclGetUniqueId failed");
+#else
+    return NcclStatus::failure(ncclInvalidUsage, "NCCL support is not enabled in this build");
+#endif
+}
+
+// Destroys the owned communicator, if any, under a DeviceGuard for its
+// device. Does nothing in builds without NCCL or for a moved-from object.
+NcclCommunicator::~NcclCommunicator() {
+#ifdef QUADTRIX_ENABLE_NCCL
+    if (comm_ == nullptr) {
+        return;
+    }
+    DeviceGuard guard(device_id_);
+    ncclCommDestroy(comm_);
+#endif
+}
+
+// Move constructor: copies `other`'s communicator, world size, rank and
+// device id, then nulls `other`'s communicator so it is not destroyed twice.
+NcclCommunicator::NcclCommunicator(NcclCommunicator&& other) noexcept
+    : comm_{other.comm_},
+      world_size_{other.world_size_},
+      rank_{other.rank_},
+      device_id_{other.device_id_} {
+    other.comm_ = nullptr;
+}
+
+// Move assignment: destroys any communicator currently owned (NCCL builds
+// only), then takes over all of `other`'s state. Self-assignment is a no-op.
+NcclCommunicator& NcclCommunicator::operator=(NcclCommunicator&& other) noexcept {
+    if (this != &other) {
+#ifdef QUADTRIX_ENABLE_NCCL
+        if (comm_ != nullptr) {
+            DeviceGuard guard(device_id_);
+            ncclCommDestroy(comm_);
+        }
+#endif
+        comm_ = other.comm_;
+        world_size_ = other.world_size_;
+        rank_ = other.rank_;
+        device_id_ = other.device_id_;
+        other.comm_ = nullptr;
+    }
+    return *this;
+}
+
+// In-place sum all-reduce of `tensor` across the communicator, enqueued on
+// `stream`. The tensor must live on the communicator's device and pass
+// valid_reduce_tensor(). Builds without NCCL always return ncclInvalidUsage
+// (after the argument checks).
+NcclStatus all_reduce_sum(NcclCommunicator& communicator, TensorView tensor, cudaStream_t stream) {
+    const bool args_ok = communicator.valid() && tensor.device_id == communicator.device_id() &&
+                         valid_reduce_tensor(tensor);
+    if (!args_ok) {
+        return NcclStatus::failure(ncclInvalidArgument, "invalid all_reduce_sum arguments");
+    }
+
+#ifdef QUADTRIX_ENABLE_NCCL
+    DeviceGuard guard(communicator.device_id());
+    // Send and receive buffers are the same pointer: the reduction is in place.
+    const ncclResult_t reduce_status = ncclAllReduce(
+        tensor.data,
+        tensor.data,
+        tensor.numel(),
+        to_nccl_dtype(tensor.dtype),
+        ncclSum,
+        communicator.get(),
+        stream);
+    if (reduce_status == ncclSuccess) {
+        return NcclStatus::success();
+    }
+    return NcclStatus::failure(reduce_status, "ncclAllReduce failed");
+#else
+    return NcclStatus::failure(ncclInvalidUsage, "NCCL support is not enabled in this build");
+#endif
+}
+
+// In-place mean all-reduce: sums `tensor` across the communicator, then
+// scales every element by 1 / world_size on the same stream.
+//
+// Only F32 tensors are supported. The dtype is checked BEFORE the reduction so
+// that an unsupported tensor is rejected untouched; checking afterwards would
+// leave the caller with a summed-but-unscaled tensor plus an error.
+//
+// Returns the all_reduce_sum failure if the reduction fails,
+// ncclInvalidArgument for a non-F32 tensor, or ncclUnhandledCudaError if the
+// scale kernel launch reports an error.
+NcclStatus all_reduce_average(NcclCommunicator& communicator, TensorView tensor, cudaStream_t stream) {
+    if (tensor.dtype != DType::F32) {
+        return NcclStatus::failure(ncclInvalidArgument, "all_reduce_average currently supports F32 tensors only");
+    }
+
+    NcclStatus reduce = all_reduce_sum(communicator, tensor, stream);
+    if (!reduce.ok) {
+        return reduce;
+    }
+
+    DeviceGuard guard(communicator.device_id());
+    // Enqueued on the same stream as the all-reduce.
+    scale_kernel<<<one_dim_grid(tensor.numel()), kDefaultBlockSize, 0, stream>>>(
+        tensor.data_as<float>(),
+        tensor.numel(),
+        1.0f / static_cast<float>(communicator.world_size()));
+    Status scale_status = QUADTRIX_CUDA_CHECK(cudaGetLastError());
+    if (!scale_status.ok) {
+        return NcclStatus::failure(ncclUnhandledCudaError, "all_reduce_average scale kernel failed");
+    }
+    return NcclStatus::success();
+}
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/cuda/KERNAL/permute.cu b/cuda/KERNAL/permute.cu
new file mode 100644
index 0000000..ba10771
--- /dev/null
+++ b/cuda/KERNAL/permute.cu
@@ -0,0 +1,137 @@
+#include "../includes/permute.cuh"
+
+#include "../includes/runtime.cuh"
+#include "../includes/utils.cuh"
+
+#include <limits>
+
+namespace quadtrix {
+namespace cuda {
+namespace {
+
+// True when `value` is strictly positive and representable as `int`.
+bool fits_int(std::int64_t value) {
+    return value > 0 && value <= std::numeric_limits<int>::max();
+}
+
+// True when `tensor` is a non-null, contiguous F32 tensor on a CUDA device.
+bool valid_f32_cuda(const TensorView& tensor) {
+    return tensor.data != nullptr && tensor.device == DeviceKind::CUDA && tensor.dtype == DType::F32 &&
+           tensor.shape.is_contiguous();
+}
+
+// Splits a packed QKV buffer into separate query/key/value buffers.
+//
+// One thread handles one destination element `idx` in [0, total). `idx` is
+// decomposed as (b, h, t, i) with i fastest, and the matching source element
+// sits at row (b, t) of `input_qkv`, where each row holds 3 * channels floats
+// laid out as [Q | K | V].
+//
+// The source offset is computed in 64 bits: the host only guarantees that
+// `total` (the size of ONE output) fits in int, while the packed source is
+// three times larger, so a 32-bit offset could overflow.
+__global__ void permute_qkv_kernel(
+    const float* __restrict__ input_qkv,
+    float* __restrict__ query,
+    float* __restrict__ key,
+    float* __restrict__ value,
+    int total,
+    int time,
+    int channels,
+    int head_size) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= total) {
+        return;
+    }
+
+    const int heads = channels / head_size;
+    const int i = idx % head_size;
+    const int h = (idx / head_size) % heads;
+    const int t = (idx / head_size / heads) % time;
+    const int b = idx / head_size / heads / time;
+    const int c = h * head_size + i;
+    const std::int64_t src_base = (static_cast<std::int64_t>(b) * time + t) * 3 * channels;
+    query[idx] = input_qkv[src_base + c];
+    key[idx] = input_qkv[src_base + channels + c];
+    value[idx] = input_qkv[src_base + 2 * static_cast<std::int64_t>(channels) + c];
+}
+
+// Inverse layout change for a single tensor: source element `idx`, decomposed
+// as (b, h, t, i) with i fastest, is written to output position
+// (b, t, h * head_size + i). The destination index is below `total`, so int
+// arithmetic is sufficient here.
+__global__ void unpermute_kernel(
+    const float* __restrict__ input,
+    float* __restrict__ output,
+    int total,
+    int time,
+    int channels,
+    int head_size) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= total) {
+        return;
+    }
+
+    const int heads = channels / head_size;
+    const int i = idx % head_size;
+    const int h = (idx / head_size) % heads;
+    const int t = (idx / head_size / heads) % time;
+    const int b = idx / head_size / heads / time;
+    const int c = h * head_size + i;
+    output[b * time * channels + t * channels + c] = input[idx];
+}
+
+}  // namespace
+
+// Splits a packed (batch, time, 3 * channels) QKV tensor into query/key/value
+// tensors of shape (batch, num_heads, time, channels / num_heads).
+//
+// All tensors must be non-null contiguous F32 CUDA tensors on one device, and
+// the element count of one output must be positive and fit in `int`.
+// Returns cudaErrorInvalidValue failures for bad arguments, otherwise the
+// result of checking cudaGetLastError() after the kernel launch on `stream`.
+Status permute_qkv_btc_to_bnhth(
+    const TensorView& input_qkv,
+    TensorView query,
+    TensorView key,
+    TensorView value,
+    int num_heads,
+    cudaStream_t stream) {
+    const bool tensors_ok =
+        valid_f32_cuda(input_qkv) && valid_f32_cuda(query) && valid_f32_cuda(key) && valid_f32_cuda(value);
+    const bool ranks_ok = tensors_ok && input_qkv.shape.rank == 3 && query.shape.rank == 4 &&
+                          key.shape.rank == 4 && value.shape.rank == 4;
+    const bool devices_ok = ranks_ok && query.device_id == input_qkv.device_id &&
+                            key.device_id == input_qkv.device_id && value.device_id == input_qkv.device_id;
+    if (!devices_ok) {
+        return Status::failure(cudaErrorInvalidValue, "invalid permute_qkv tensors");
+    }
+
+    const std::int64_t batch = input_qkv.shape.dims[0];
+    const std::int64_t time = input_qkv.shape.dims[1];
+    const std::int64_t packed_channels = input_qkv.shape.dims[2];
+    if (num_heads <= 0 || packed_channels % 3 != 0) {
+        return Status::failure(cudaErrorInvalidValue, "invalid permute_qkv shape");
+    }
+
+    const std::int64_t channels = packed_channels / 3;
+    if (channels % num_heads != 0) {
+        return Status::failure(cudaErrorInvalidValue, "invalid permute_qkv output shape");
+    }
+    const std::int64_t head_size = channels / num_heads;
+    if (query.shape.dims[0] != batch || query.shape.dims[1] != num_heads || query.shape.dims[2] != time ||
+        query.shape.dims[3] != head_size || key.shape.dims != query.shape.dims ||
+        value.shape.dims != query.shape.dims || !fits_int(query.numel())) {
+        return Status::failure(cudaErrorInvalidValue, "invalid permute_qkv output shape");
+    }
+
+    DeviceGuard guard(input_qkv.device_id);
+    // One thread per element of a single output tensor.
+    const int total = static_cast<int>(query.numel());
+    permute_qkv_kernel<<<one_dim_grid(total), kDefaultBlockSize, 0, stream>>>(
+        input_qkv.data_as<const float>(),
+        query.data_as<float>(),
+        key.data_as<float>(),
+        value.data_as<float>(),
+        total,
+        static_cast<int>(time),
+        static_cast<int>(channels),
+        static_cast<int>(head_size));
+    return QUADTRIX_CUDA_CHECK(cudaGetLastError());
+}
+
+// Rearranges a (batch, heads, time, head_size) tensor into a
+// (batch, time, heads * head_size) tensor.
+//
+// Both tensors must be non-null contiguous F32 CUDA tensors on one device and
+// the element count must be positive and fit in `int`.
+// Returns cudaErrorInvalidValue failures for bad arguments, otherwise the
+// result of checking cudaGetLastError() after the kernel launch on `stream`.
+Status unpermute_bnhth_to_btc(const TensorView& input, TensorView output, cudaStream_t stream) {
+    const bool tensors_ok = valid_f32_cuda(input) && valid_f32_cuda(output) && input.shape.rank == 4 &&
+                            output.shape.rank == 3 && input.device_id == output.device_id;
+    if (!tensors_ok) {
+        return Status::failure(cudaErrorInvalidValue, "invalid unpermute tensors");
+    }
+
+    const std::int64_t batch = input.shape.dims[0];
+    const std::int64_t heads = input.shape.dims[1];
+    const std::int64_t time = input.shape.dims[2];
+    const std::int64_t head_size = input.shape.dims[3];
+    const std::int64_t channels = heads * head_size;
+
+    const bool shape_ok = output.shape.dims[0] == batch && output.shape.dims[1] == time &&
+                          output.shape.dims[2] == channels && fits_int(input.numel());
+    if (!shape_ok) {
+        return Status::failure(cudaErrorInvalidValue, "invalid unpermute output shape");
+    }
+
+    DeviceGuard guard(input.device_id);
+    // One thread per input element.
+    const int total = static_cast<int>(input.numel());
+    unpermute_kernel<<<one_dim_grid(total), kDefaultBlockSize, 0, stream>>>(
+        input.data_as<const float>(),
+        output.data_as<float>(),
+        total,
+        static_cast<int>(time),
+        static_cast<int>(channels),
+        static_cast<int>(head_size));
+    return QUADTRIX_CUDA_CHECK(cudaGetLastError());
+}
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/cuda/KERNAL/softmax_forward.cu b/cuda/KERNAL/softmax_forward.cu
new file mode 100644
index 0000000..577c6c7
--- /dev/null
+++ b/cuda/KERNAL/softmax_forward.cu
@@ -0,0 +1,187 @@
+#include "../includes/softmax.cuh"
+
+#include "../includes/runtime.cuh"
+#include "../includes/utils.cuh"
+
+#include <cmath>
+#include <limits>
+
+namespace quadtrix {
+namespace cuda {
+namespace {
+
+// Threads per block for both softmax kernels; each block processes one row.
+constexpr int kSoftmaxBlockSize = 256;
+
+// True when `value` is strictly positive and representable as `int`.
+bool fits_int(std::int64_t value) {
+    return value > 0 && value <= std::numeric_limits<int>::max();
+}
+
+// True when `a` and `b` are both non-null, contiguous F32 CUDA tensors on the
+// same device with identical rank, element count and per-dimension extents.
+bool valid_same_shape_f32(const TensorView& a, const TensorView& b) {
+    if (a.data == nullptr || b.data == nullptr || a.device != DeviceKind::CUDA || b.device != DeviceKind::CUDA ||
+        a.device_id != b.device_id || a.dtype != DType::F32 || b.dtype != DType::F32 ||
+        a.shape.rank != b.shape.rank || !a.shape.is_contiguous() || !b.shape.is_contiguous() ||
+        a.numel() != b.numel()) {
+        return false;
+    }
+    for (int i = 0; i < a.shape.rank; ++i) {
+        if (a.shape.dims[i] != b.shape.dims[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Block-wide sum of `value`; every thread receives the total.
+//
+// `shared` must provide one float per warp of the block. Must be called by
+// all threads of the block because it contains barriers.
+//
+// The result is copied to a register and a final barrier is issued before
+// returning, so the caller may immediately reuse `shared` (for example in a
+// following block_max/block_sum call) without racing against threads that
+// have not yet read shared[0].
+__device__ float block_sum(float value, float* shared) {
+    value = warp_sum(value);
+    const int lane = threadIdx.x & (kWarpSize - 1);
+    const int warp = threadIdx.x / kWarpSize;
+    if (lane == 0) {
+        shared[warp] = value;
+    }
+    __syncthreads();
+    const int warp_count = (blockDim.x + kWarpSize - 1) / kWarpSize;
+    // Only the first `warp_count` threads (all in warp 0) hold a partial.
+    value = threadIdx.x < warp_count ? shared[lane] : 0.0f;
+    if (warp == 0) {
+        value = warp_sum(value);
+    }
+    if (threadIdx.x == 0) {
+        shared[0] = value;
+    }
+    __syncthreads();
+    const float total = shared[0];
+    // Keep any thread from overwriting `shared` until everyone has read it.
+    __syncthreads();
+    return total;
+}
+
+// Block-wide maximum of `value`; every thread receives the result.
+// Same contract as block_sum(), with -INFINITY as the neutral element.
+__device__ float block_max(float value, float* shared) {
+    value = warp_max(value);
+    const int lane = threadIdx.x & (kWarpSize - 1);
+    const int warp = threadIdx.x / kWarpSize;
+    if (lane == 0) {
+        shared[warp] = value;
+    }
+    __syncthreads();
+    const int warp_count = (blockDim.x + kWarpSize - 1) / kWarpSize;
+    value = threadIdx.x < warp_count ? shared[lane] : -INFINITY;
+    if (warp == 0) {
+        value = warp_max(value);
+    }
+    if (threadIdx.x == 0) {
+        shared[0] = value;
+    }
+    __syncthreads();
+    const float result = shared[0];
+    // Keep any thread from overwriting `shared` until everyone has read it.
+    __syncthreads();
+    return result;
+}
+
+// Row-wise softmax over the first `valid_cols` columns of a rows x cols
+// matrix; columns in [valid_cols, cols) are written as 0. One block per row.
+//
+// The row offset is computed in size_t: `rows` and `cols` each fit in int but
+// their product need not (for example 65536 rows x 50304 columns).
+__global__ void softmax_forward_kernel(
+    const float* __restrict__ logits,
+    float* __restrict__ probs,
+    int rows,
+    int cols,
+    int valid_cols) {
+    extern __shared__ float shared[];
+    const int row = blockIdx.x;
+    if (row >= rows) {
+        return;
+    }
+
+    const std::size_t row_offset = static_cast<std::size_t>(row) * static_cast<std::size_t>(cols);
+    const float* __restrict__ logits_row = logits + row_offset;
+    float* __restrict__ probs_row = probs + row_offset;
+
+    // Pass 1: row maximum, subtracted below before exponentiating.
+    float local_max = -INFINITY;
+    for (int col = threadIdx.x; col < valid_cols; col += blockDim.x) {
+        local_max = fmaxf(local_max, logits_row[col]);
+    }
+    const float max_val = block_max(local_max, shared);
+
+    // Pass 2: store exp(x - max) and accumulate the normaliser.
+    float local_sum = 0.0f;
+    for (int col = threadIdx.x; col < valid_cols; col += blockDim.x) {
+        const float value = expf(logits_row[col] - max_val);
+        probs_row[col] = value;
+        local_sum += value;
+    }
+    const float sum = block_sum(local_sum, shared);
+    const float inv_sum = sum == 0.0f ? 0.0f : 1.0f / sum;
+
+    // Pass 3: normalise. Each thread revisits only the columns it wrote in
+    // pass 2 (same stride), so no barrier is needed between the passes.
+    for (int col = threadIdx.x; col < cols; col += blockDim.x) {
+        probs_row[col] = col < valid_cols ? probs_row[col] * inv_sum : 0.0f;
+    }
+}
+
+// Causal softmax over rows of length `time`: row r covers columns
+// [0, (r % time) + 1) and writes 0 to the remaining columns. One block per
+// row. The row offset is computed in size_t because rows * time can exceed
+// the int range even when both factors fit.
+__global__ void causal_softmax_row_kernel(
+    const float* __restrict__ preatt,
+    float* __restrict__ att,
+    int rows,
+    int time) {
+    extern __shared__ float shared[];
+    const int row = blockIdx.x;
+    if (row >= rows) {
+        return;
+    }
+    const int t = row % time;
+    const int valid_cols = t + 1;
+    const std::size_t row_offset = static_cast<std::size_t>(row) * static_cast<std::size_t>(time);
+    const float* __restrict__ preatt_row = preatt + row_offset;
+    float* __restrict__ att_row = att + row_offset;
+
+    float local_max = -INFINITY;
+    for (int col = threadIdx.x; col < valid_cols; col += blockDim.x) {
+        local_max = fmaxf(local_max, preatt_row[col]);
+    }
+    const float max_val = block_max(local_max, shared);
+
+    float local_sum = 0.0f;
+    for (int col = threadIdx.x; col < valid_cols; col += blockDim.x) {
+        const float value = expf(preatt_row[col] - max_val);
+        att_row[col] = value;
+        local_sum += value;
+    }
+    const float sum = block_sum(local_sum, shared);
+    const float inv_sum = sum == 0.0f ? 0.0f : 1.0f / sum;
+
+    for (int col = threadIdx.x; col < time; col += blockDim.x) {
+        att_row[col] = col < valid_cols ? att_row[col] * inv_sum : 0.0f;
+    }
+}
+
+}  // namespace
+
+// Row-wise softmax of a rank-2 `logits` tensor into `probs`, normalising over
+// the first `valid_cols` columns only (1 <= valid_cols <= cols).
+//
+// Both tensors must pass valid_same_shape_f32() and each extent must be
+// positive and fit in `int`. Returns cudaErrorInvalidValue failures for bad
+// arguments, otherwise the result of checking cudaGetLastError() after the
+// launch on `stream`.
+Status softmax_forward(const TensorView& logits, TensorView probs, int valid_cols, cudaStream_t stream) {
+    const bool tensors_ok = valid_same_shape_f32(logits, probs) && logits.shape.rank == 2 &&
+                            fits_int(logits.shape.dims[0]) && fits_int(logits.shape.dims[1]);
+    if (!tensors_ok) {
+        return Status::failure(cudaErrorInvalidValue, "invalid softmax_forward tensors");
+    }
+
+    const int row_count = static_cast<int>(logits.shape.dims[0]);
+    const int col_count = static_cast<int>(logits.shape.dims[1]);
+    const bool valid_cols_ok = valid_cols > 0 && valid_cols <= col_count;
+    if (!valid_cols_ok) {
+        return Status::failure(cudaErrorInvalidValue, "invalid softmax_forward valid_cols");
+    }
+
+    DeviceGuard guard(logits.device_id);
+    // Dynamic shared memory: one float per warp of the block.
+    const std::size_t warps_per_block = (kSoftmaxBlockSize + kWarpSize - 1) / kWarpSize;
+    const std::size_t shared_bytes = warps_per_block * sizeof(float);
+    softmax_forward_kernel<<<row_count, kSoftmaxBlockSize, shared_bytes, stream>>>(
+        logits.data_as<const float>(),
+        probs.data_as<float>(),
+        row_count,
+        col_count,
+        valid_cols);
+    return QUADTRIX_CUDA_CHECK(cudaGetLastError());
+}
+
+// Causal softmax of a rank-4 `preatt` tensor into `att`. The last two extents
+// must be equal (time x time); each row of the last dimension is handled by
+// one kernel block.
+//
+// Both tensors must pass valid_same_shape_f32(). The first three extents must
+// each be positive and fit in `int`, and so must their product: it becomes the
+// grid size and the kernel's `rows` argument. Narrowing the product without a
+// range check would launch with a truncated or negative row count.
+//
+// Returns cudaErrorInvalidValue failures for bad arguments, otherwise the
+// result of checking cudaGetLastError() after the launch on `stream`.
+Status causal_softmax_forward(const TensorView& preatt, TensorView att, cudaStream_t stream) {
+    if (!valid_same_shape_f32(preatt, att) || preatt.shape.rank != 4 || !fits_int(preatt.shape.dims[0]) ||
+        !fits_int(preatt.shape.dims[1]) || !fits_int(preatt.shape.dims[2]) ||
+        preatt.shape.dims[2] != preatt.shape.dims[3]) {
+        return Status::failure(cudaErrorInvalidValue, "invalid causal_softmax_forward tensors");
+    }
+
+    // Multiply stepwise: each factor is at most INT_MAX, so every intermediate
+    // product stays within int64 as long as the running value is re-checked.
+    const std::int64_t batch_heads = preatt.shape.dims[0] * preatt.shape.dims[1];
+    if (!fits_int(batch_heads)) {
+        return Status::failure(cudaErrorInvalidValue, "causal_softmax_forward row count exceeds int range");
+    }
+    const std::int64_t row_count = batch_heads * preatt.shape.dims[2];
+    if (!fits_int(row_count)) {
+        return Status::failure(cudaErrorInvalidValue, "causal_softmax_forward row count exceeds int range");
+    }
+
+    const int rows = static_cast<int>(row_count);
+    const int time = static_cast<int>(preatt.shape.dims[2]);
+
+    DeviceGuard guard(preatt.device_id);
+    // Dynamic shared memory: one float per warp of the block.
+    const std::size_t shared_bytes = ((kSoftmaxBlockSize + kWarpSize - 1) / kWarpSize) * sizeof(float);
+    causal_softmax_row_kernel<<<rows, kSoftmaxBlockSize, shared_bytes, stream>>>(
+        preatt.data_as<const float>(),
+        att.data_as<float>(),
+        rows,
+        time);
+    return QUADTRIX_CUDA_CHECK(cudaGetLastError());
+}
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/cuda/includes/adamw.cuh b/cuda/includes/adamw.cuh
new file mode 100644
index 0000000..d7afae8
--- /dev/null
+++ b/cuda/includes/adamw.cuh
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "tensor.cuh"
+
+#include <cuda_runtime.h>
+
+namespace quadtrix {
+namespace cuda {
+
+struct AdamWConfig {  // Settings passed to adamw_update(); every field has an in-class default.
+    float learning_rate = 1.0e-4f;  // defaults to 1e-4
+    float beta1 = 0.9f;  // defaults to 0.9; presumably the first-moment decay rate - confirm against the kernel
+    float beta2 = 0.95f;  // defaults to 0.95; presumably the second-moment decay rate - confirm against the kernel
+    float epsilon = 1.0e-8f;  // defaults to 1e-8; presumably the denominator stabiliser - confirm against the kernel
+    float weight_decay = 0.0f;  // defaults to 0 (no decay)
+    int step = 1;  // defaults to 1; NOTE(review): looks like a 1-based step counter - confirm how the kernel uses it
+};
+
+Status adamw_update(  // Declaration only; the definition lives outside this header. Reports its outcome as a Status.
+    TensorView params,  // passed by value as a view; presumably updated in place - confirm in the definition
+    const TensorView& grads,  // read-only view of the gradients
+    TensorView first_moment,  // optimizer state; presumably updated in place - confirm
+    TensorView second_moment,  // optimizer state; presumably updated in place - confirm
+    const AdamWConfig& config,  // hyper-parameters, see AdamWConfig
+    float grad_scale = 1.0f,  // defaults to 1.0f; NOTE(review): presumably a multiplier applied to grads - confirm
+    cudaStream_t stream = nullptr);  // stream argument; defaults to nullptr
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/cuda/includes/attention.cuh b/cuda/includes/attention.cuh
new file mode 100644
index 0000000..7feac08
--- /dev/null
+++ b/cuda/includes/attention.cuh
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "tensor.cuh"
+
+#include <cuda_runtime.h>
+
+namespace quadtrix {
+namespace cuda {
+
+Status attention_forward(  // Declaration only. Shapes below follow the checks made by the definition.
+    const TensorView& input_qkv,  // rank 3: (batch, time, 3 * channels), last extent divisible by 3
+    TensorView preatt,  // rank 4: (batch, num_heads, time, time)
+    TensorView att,  // rank 4, same extents as preatt
+    TensorView output,  // rank 3: (batch, time, channels)
+    int num_heads,  // must be positive and divide channels
+    cudaStream_t stream = nullptr);  // stream for the kernel launch; defaults to nullptr
+
+Status attention_backward(  // Declaration only; definition not shown here, so the notes below are assumptions to confirm.
+    const TensorView& grad_output,  // read-only; presumably the gradient w.r.t. attention_forward's output
+    const TensorView& input_qkv,  // read-only; presumably the same packed QKV tensor given to attention_forward
+    const TensorView& att,  // read-only; presumably the att tensor produced by attention_forward
+    TensorView grad_input_qkv,  // presumably receives the gradient w.r.t. input_qkv - confirm
+    TensorView grad_preatt,  // presumably receives or stages the gradient w.r.t. preatt - confirm
+    TensorView grad_att,  // presumably receives or stages the gradient w.r.t. att - confirm
+    int num_heads,  // number of attention heads
+    cudaStream_t stream = nullptr);  // stream argument; defaults to nullptr
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/cuda/includes/common.h b/cuda/includes/common.h
index 269a789..36df155 100644
--- a/cuda/includes/common.h
+++ b/cuda/includes/common.h
@@ -1,275 +1,120 @@
-// Quadtrix CUDA Engine — common.h
-//  Tensor Runtime
-// NO kernel logic, NO device code lives here.
-// This is a pure host-side / shared header.
-
 #pragma once
 
+#include <cuda_runtime.h>
+
+#include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
-#include <cstring>
-#include <cassert>
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h> // __half
-#include <cuda_bf16.h> // __nv_bfloat16
-#include <cublas_v2.h>
-
-// Version / build tags
-
-#define QUADTRIX_VERSION_MAJOR 0
-#define QUADTRIX_VERSION_MINOR 1
-#define QUADTRIX_VERSION_PATCH 0
-// Compiler / architecture hints
-#define QX_INLINE __forceinline__
-#define QX_HOST __host__
-#define QX_DEVICE __device__
-#define QX_HOST_DEVICE __host__ __device__
-#define QX_GLOBAL __global__
-#define QX_RESTRICT __restrict__
-
-// Prevent unused-variable warnings in release builds
-#define QX_UNUSED(x) ((void)(x))
-// Integer math helpers (host + device safe)
-#define CEIL_DIV(x, y) (((x) + (y) - 1) / (y))
-#define ROUND_UP(x, y) (CEIL_DIV((x), (y)) * (y))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define CLAMP(x, lo, hi) (MIN(MAX((x), (lo)), (hi)))
-#define IS_POW2(x) (((x) & ((x) - 1)) == 0) // Power-of-two utilities
-#define NEXT_POW2(x) \
-      ([](uint32_t v) { v--; v|=v>>1; v|=v>>2; v|=v>>4; v|=v>>8; v|=v>>16; return v+1; }((uint32_t)(x)))
-#define QX_ALIGN(x) __align__(x)
-#define QX_ALIGN_16 QX_ALIGN(16)
-#define QX_ALIGN_32 QX_ALIGN(32)
-#define QX_ALIGN_128 QX_ALIGN(128) // cache-line
-
-// Memory alignment for device allocations (128 bytes = 2 cache lines,
-// ensures coalesced access on any architecture)
-static constexpr size_t QX_MEM_ALIGN = 128;
-// ---- CUDA runtime errors ----
-#define CUDA_CHECK(call)                                   \
-      do                                                   \
-      {                                                    \
-            cudaError_t _err = (call);                     \
-            if (_err != cudaSuccess)                       \
-            {                                              \
-                  fprintf(stderr,                          \
-                          "[CUDA ERROR] %s:%d  call: %s\n" \
-                          "             %s\n",             \
-                          __FILE__, __LINE__, #call,       \
-                          cudaGetErrorString(_err));       \
-                  exit(EXIT_FAILURE);                      \
-            }                                              \
-      } while (0)
-
-// Variant that returns instead of exit (use in non-fatal paths)
-#define CUDA_CHECK_RETURN(call, retval)                         \
-      do                                                        \
-      {                                                         \
-            cudaError_t _err = (call);                          \
-            if (_err != cudaSuccess)                            \
-            {                                                   \
-                  fprintf(stderr,                               \
-                          "[CUDA WARN] %s:%d  call: %s — %s\n", \
-                          __FILE__, __LINE__, #call,            \
-                          cudaGetErrorString(_err));            \
-                  return (retval);                              \
-            }                                                   \
-      } while (0)
-
-// ---- cuBLAS errors ----
-#define CUBLAS_CHECK(call)                                                \
-      do                                                                  \
-      {                                                                   \
-            cublasStatus_t _st = (call);                                  \
-            if (_st != CUBLAS_STATUS_SUCCESS)                             \
-            {                                                             \
-                  fprintf(stderr,                                         \
-                          "[cuBLAS ERROR] %s:%d  call: %s  status: %d\n", \
-                          __FILE__, __LINE__, #call, (int)_st);           \
-                  exit(EXIT_FAILURE);                                     \
-            }                                                             \
-      } while (0)
-
-// ---- NCCL errors (included only when NCCL is available) ----
-#ifdef NCCL_MAJOR
-#include <nccl.h>
-#define NCCL_CHECK(call)                                   \
-      do                                                   \
-      {                                                    \
-            ncclResult_t _r = (call);                      \
-            if (_r != ncclSuccess)                         \
-            {                                              \
-                  fprintf(stderr,                          \
-                          "[NCCL ERROR] %s:%d  call: %s\n" \
-                          "             %s\n",             \
-                          __FILE__, __LINE__, #call,       \
-                          ncclGetErrorString(_r));         \
-                  exit(EXIT_FAILURE);                      \
-            }                                              \
-      } while (0)
-#endif
-
-//  cuDNN errors
-#ifdef CUDNN_MAJOR
-#include <cudnn.h>
-#define CUDNN_CHECK(call)                                   \
-      do                                                    \
-      {                                                     \
-            cudnnStatus_t _st = (call);                     \
-            if (_st != CUDNN_STATUS_SUCCESS)                \
-            {                                               \
-                  fprintf(stderr,                           \
-                          "[cuDNN ERROR] %s:%d  call: %s\n" \
-                          "              %s\n",             \
-                          __FILE__, __LINE__, #call,        \
-                          cudnnGetErrorString(_st));        \
-                  exit(EXIT_FAILURE);                       \
-            }                                               \
-      } while (0)
-#endif // CUDNN_MAJOR
-
-// Convenience: sync + check after kernel launches
-#define CUDA_KERNEL_CHECK()                      \
-      do                                         \
-      {                                          \
-            CUDA_CHECK(cudaGetLastError());      \
-            CUDA_CHECK(cudaDeviceSynchronize()); \
-      } while (0)
-
-// Debug-only sync (compiles away in release)
-#ifdef QX_DEBUG
-#define CUDA_KERNEL_CHECK_DEBUG() CUDA_KERNEL_CHECK()
-#else
-#define CUDA_KERNEL_CHECK_DEBUG() CUDA_CHECK(cudaGetLastError())
-#endif
-// Drives runtime dtype dispatch throughout the engine.
-// Every tensor carries one of these.
-typedef enum
-{
-      DTYPE_FLOAT32 = 0,  // float       — 4 bytes — default training dtype
-      DTYPE_FLOAT16 = 1,  // __half      — 2 bytes — mixed-precision forward
-      DTYPE_BFLOAT16 = 2, // __nv_bfloat16 — 2 bytes — preferred on A100/H100
-      DTYPE_INT32 = 3,    // int32_t     — 4 bytes — indices, token IDs
-      DTYPE_INT8 = 4,     // int8_t      — 1 byte  — quantized inference
-      DTYPE_BOOL = 5,     // uint8_t     — 1 byte  — masks
-      DTYPE_UNKNOWN = 255
-} DType;
-
-// Human-readable dtype name (for logging)
-static inline const char *dtype_name(DType d)
-{
-      switch (d)
-      {
-      case DTYPE_FLOAT32:
-            return "float32";
-      case DTYPE_FLOAT16:
-            return "float16";
-      case DTYPE_BFLOAT16:
-            return "bfloat16";
-      case DTYPE_INT32:
-            return "int32";
-      case DTYPE_INT8:
-            return "int8";
-      case DTYPE_BOOL:
-            return "bool";
-      default:
-            return "unknown";
-      }
+#include <limits>
+
+namespace quadtrix {
+namespace cuda {
+
+enum class DType : std::uint8_t {  // Element type tag; labels and sizes below mirror dtype_name()/dtype_size().
+    F32,   // "f32", 4 bytes
+    F16,   // "f16", 2 bytes
+    BF16,  // "bf16", 2 bytes
+    I32,   // "i32", 4 bytes
+    U8,    // "u8", 1 byte
+};
+
+enum class DeviceKind : std::uint8_t {  // One-byte device tag; TensorView::device carries one of these.
+    CPU,
+    CUDA,
+};
+
+struct Status {  // Result of a CUDA-facing call: success flag, raw CUDA error code, and a C-string label.
+    bool ok;
+    cudaError_t cuda_error;
+    const char* message;  // stored as a raw pointer, never copied; it must outlive this Status
+
+    static Status success() {  // ok = true, cudaSuccess, message "ok"
+        return {true, cudaSuccess, "ok"};
+    }
+
+    static Status failure(cudaError_t error, const char* message) {  // ok = false; keeps both arguments as given
+        return {false, error, message};
+    }
+};
+
+inline const char* dtype_name(DType dtype) {  // Short label for `dtype`; always returns a string literal.
+    switch (dtype) {
+        case DType::F32:
+            return "f32";
+        case DType::F16:
+            return "f16";
+        case DType::BF16:
+            return "bf16";
+        case DType::I32:
+            return "i32";
+        case DType::U8:
+            return "u8";
+    }
+    return "unknown";  // reached only for a value that matches none of the enumerators above
 }
 
-// Byte size per element for a given dtype
-static inline size_t dtype_size(DType d)
-{
-      switch (d)
-      {
-      case DTYPE_FLOAT32:
+inline std::size_t dtype_size(DType dtype) {  // Bytes per element for `dtype`; aborts on a value outside the enum.
+    switch (dtype) {
+        case DType::F32:
             return 4;
-      case DTYPE_FLOAT16:
+        case DType::F16:
             return 2;
-      case DTYPE_BFLOAT16:
+        case DType::BF16:
             return 2;
-      case DTYPE_INT32:
+        case DType::I32:
             return 4;
-      case DTYPE_INT8:
-            return 1;
-      case DTYPE_BOOL:
+        case DType::U8:
             return 1;
-      default:
-            fprintf(stderr, "[QX] dtype_size: unknown dtype %d\n", (int)d);
-            exit(EXIT_FAILURE);
-      }
+    }
+
+    std::fprintf(stderr, "Unknown CUDA dtype value %u\n", static_cast<unsigned int>(dtype));  // only reached when no case matched
+    std::abort();
 }
 
-// Map DType → cublas compute / data types (used in matmul wrappers)
-static inline cudaDataType_t dtype_to_cuda(DType d)
-{
-      switch (d)
-      {
-      case DTYPE_FLOAT32:
-            return CUDA_R_32F;
-      case DTYPE_FLOAT16:
-            return CUDA_R_16F;
-      case DTYPE_BFLOAT16:
-            return CUDA_R_16BF;
-      case DTYPE_INT8:
-            return CUDA_R_8I;
-      default:
-            fprintf(stderr, "[QX] dtype_to_cuda: unsupported dtype %d\n", (int)d);
-            exit(EXIT_FAILURE);
-      }
+inline bool checked_mul(std::size_t lhs, std::size_t rhs, std::size_t* out) {  // Overflow-checked lhs * rhs; true on success.
+    const bool overflows = lhs != 0 && rhs > std::numeric_limits<std::size_t>::max() / lhs;
+    if (!overflows) {
+        *out = lhs * rhs;  // *out is written only when the product is representable
+    }
+    return !overflows;
 }
 
-static constexpr int QX_MAX_DIMS = 5;    // max tensor rank supported
-static constexpr int QX_MAX_DEVICES = 8; // max GPUs in a node
-static constexpr int QX_BLOCK_SIZE_SMALL = 128;
-static constexpr int QX_BLOCK_SIZE = 256;
-static constexpr int QX_BLOCK_SIZE_LARGE = 512;
-static constexpr int QX_BLOCK_SIZE_MAX = 1024;
-static constexpr int QX_WARP_SIZE = 32;
-static constexpr int QX_MAX_GRID_X = 65535;
-// Memory location enum
-//   Tracks where a tensor buffer actually lives.
-typedef enum
-{
-      MEM_HOST = 0,        // CPU
-      MEM_HOST_PINNED = 1, // CPU pinned — fast H2D/D2H transfers
-      MEM_DEVICE = 2,      // GPU global memory
-      MEM_MANAGED = 3,     // CUDA unified / managed memory
-      MEM_UNKNOWN = 255
-} MemLocation;
+inline Status check_cuda(cudaError_t error, const char* expression, const char* file, int line) {
+    // Non-fatal check: a failure is logged to stderr and handed back to the caller as a Status.
+    if (error != cudaSuccess) {
+        const char* reason = cudaGetErrorString(error);
+        std::fprintf(
+            stderr,
+            "CUDA error at %s:%d: %s failed with %s\n",
+            file,
+            line,
+            expression,
+            reason);
+    }
+    return error == cudaSuccess ? Status::success() : Status::failure(error, expression);
+}
+
+inline void abort_on_cuda(cudaError_t error, const char* expression, const char* file, int line) {
+    // Fatal check: cudaSuccess falls straight through; any other code is logged and the process aborts.
+    if (error != cudaSuccess) {
+        const char* reason = cudaGetErrorString(error);
+        std::fprintf(
+            stderr,
+            "Fatal CUDA error at %s:%d: %s failed with %s\n",
+            file,
+            line,
+            expression,
+            reason);
+        std::abort();
+    }
+}
+
+}  // namespace cuda
+}  // namespace quadtrix
+
+#define QUADTRIX_CUDA_CHECK(expr) \
+    ::quadtrix::cuda::check_cuda((expr), #expr, __FILE__, __LINE__)  // yields a Status; logs on failure, never aborts
 
-//
-// Training phase enum
-// Passed into forward/backward launchers so they can skip
-// gradient computation during inference.
-typedef enum
-{
-      PHASE_TRAIN = 0,
-      PHASE_EVAL = 1,
-      PHASE_INFER = 2
-} TrainingPhase;
-// Reduction operation enum
-//   Used by reduction kernels (global_norm, softmax, layernorm, etc.)
-typedef enum
-{
-      REDUCE_SUM = 0,
-      REDUCE_MAX = 1,
-      REDUCE_MIN = 2,
-      REDUCE_MEAN = 3,
-      REDUCE_PROD = 4
-} ReduceOp;
-static constexpr float QX_EPS_F32 = 1e-6f; // layernorm / division guard
-static constexpr float QX_EPS_F16 = 1e-4f; // fp16 is less precise
-static constexpr float QX_INF_F32 = 1e38f; // softmax init sentinel
-static constexpr float QX_NEG_INF_F32 = -1e38f;
-// GELU approximation constant (tanh variant)
-static constexpr float QX_GELU_COEFF = 0.044715f;
-static constexpr float QX_SQRT_2_PI = 0.7978845608f; // sqrt(2/pi)
-static constexpr float QX_ATT_SCALE_AUTO = -1.0f;
-static_assert(sizeof(float) == 4, "float must be 4 bytes");
-static_assert(sizeof(__half) == 2, "__half must be 2 bytes");
-static_assert(sizeof(__nv_bfloat16) == 2, "bfloat16 must be 2 bytes");
-static_assert(QX_WARP_SIZE == 32, "warp size invariant broken");
\ No newline at end of file
+#define QUADTRIX_CUDA_ABORT(expr) \
+    ::quadtrix::cuda::abort_on_cuda((expr), #expr, __FILE__, __LINE__)  // yields void; logs and aborts on failure
diff --git a/cuda/includes/memory.cuh b/cuda/includes/memory.cuh
index 3610519..e08fa4a 100644
--- a/cuda/includes/memory.cuh
+++ b/cuda/includes/memory.cuh
@@ -1,107 +1,120 @@
 #pragma once
+
 #include "common.h"
-#include "tensor.cuh"
-#include <cstring>
-static inline void *qx_host_alloc(size_t n)
-{
-    void *p = malloc(n);
-    if (!p && n)
-    {
-        perror("[QX] malloc");
-        exit(1);
+#include "runtime.cuh"
+
+#include <cuda_runtime.h>
+
+#include <cstddef>
+#include <utility>
+
+namespace quadtrix {
+namespace cuda {
+
+class DeviceBuffer {
+public:
+    DeviceBuffer() = default;
+
+    explicit DeviceBuffer(std::size_t bytes, int device_id = -1) {  // Allocating constructor; forwards both arguments to allocate().
+        allocate(bytes, device_id);
     }
-    return p;
-}
-static inline void qx_host_free(void *p)
-{
-    free(p);
-}
 
-static inline void *qx_pinned_alloc(size_t n)
-{
-    void *p = nullptr;
-    CUDA_CHECK(cudaMallocHost(&p, n));
-    return p;
-}
-static inline void qx_pinned_free(void *p)
-{
-    if (p)
-        CUDA_CHECK(cudaFreeHost(p));
-}
+    ~DeviceBuffer() {  // Frees the device allocation, if any, through release().
+        release();
+    }
 
-static inline void *qx_device_alloc(size_t n, int dev = 0)
-{
-    CUDA_CHECK(cudaSetDevice(dev));
-    void *p = nullptr;
-    CUDA_CHECK(cudaMalloc(&p, ROUND_UP(n, QX_MEM_ALIGN)));
-    return p;
-}
-static inline void qx_device_free(void *p)
-{
-    if (p)
-        CUDA_CHECK(cudaFree(p));
-}
-static inline void qx_device_zero(void *p, size_t n, cudaStream_t s = 0)
-{
-    if (p && n)
-        CUDA_CHECK(cudaMemsetAsync(p, 0, n, s));
-}
-// Tensor allocators
-static inline Tensor *tensor_alloc_device(const TensorShape &sh, DType dt,
-                                          int dev = 0, cudaStream_t s = 0,
-                                          const char *name = "")
-{
-    Tensor *t = (Tensor *)calloc(1, sizeof(Tensor));
-    t->shape = sh;
-    t->dtype = dt;
-    t->mem_loc = MEM_DEVICE;
-    t->owns_data = true;
-    t->device_id = dev;
-    strncpy(t->name, name, 63);
-    t->data = qx_device_alloc((size_t)sh.numel() * dtype_size(dt), dev);
-    qx_device_zero(t->data, (size_t)sh.numel() * dtype_size(dt), s);
-    return t;
-}
+    DeviceBuffer(const DeviceBuffer&) = delete;
+    DeviceBuffer& operator=(const DeviceBuffer&) = delete;
 
-static inline Tensor *tensor_alloc_host(const TensorShape &sh, DType dt,
-                                        bool pinned = false, const char *name = "")
-{
-    Tensor *t = (Tensor *)calloc(1, sizeof(Tensor));
-    t->shape = sh;
-    t->dtype = dt;
-    t->mem_loc = pinned ? MEM_HOST_PINNED : MEM_HOST;
-    t->owns_data = true;
-    t->device_id = -1;
-    strncpy(t->name, name, 63);
-    size_t nb = (size_t)sh.numel() * dtype_size(dt);
-    t->data = pinned ? qx_pinned_alloc(nb) : calloc(1, nb);
-    return t;
-}
+    DeviceBuffer(DeviceBuffer&& other) noexcept {  // Takes over other's allocation; members start at their defaults, so `other` ends up empty.
+        swap(other);
+    }
 
-static inline void tensor_free(Tensor *t)
-{
-    if (!t)
-        return;
-    if (t->owns_data && t->data)
-    {
-        if (t->mem_loc == MEM_DEVICE)
-            qx_device_free(t->data);
-        else if (t->mem_loc == MEM_HOST_PINNED)
-            qx_pinned_free(t->data);
-        else
-            free(t->data);
+    DeviceBuffer& operator=(DeviceBuffer&& other) noexcept {  // Frees the current allocation, then takes over other's.
+        if (this != &other) {
+            release();
+            swap(other);  // this object is empty after release(), so `other` receives the empty state
+        }
+        return *this;
     }
-    free(t);
+
+    void allocate(std::size_t bytes, int device_id = -1) {  // Drops any current allocation, then cudaMalloc's `bytes`; aborts on CUDA failure.
+        release();
+        if (bytes == 0) {
+            return;  // a zero-byte request leaves the buffer empty
+        }
+        if (device_id >= 0) {
+            device_id_ = device_id;
+            DeviceGuard guard(device_id);  // selects device_id for the cudaMalloc; the guard restores the previous device at scope exit
+            QUADTRIX_CUDA_ABORT(cudaMalloc(&ptr_, bytes));
+        } else {
+            device_id_ = current_device();  // a negative id means "allocate on whichever device is current"
+            QUADTRIX_CUDA_ABORT(cudaMalloc(&ptr_, bytes));
+        }
+        bytes_ = bytes;
+    }
+
+    void release() {  // Frees the allocation, if any, and resets the members; does nothing when already empty.
+        if (ptr_ != nullptr) {
+            if (device_id_ >= 0) {
+                DeviceGuard guard(device_id_);  // free with the device recorded by allocate() selected
+                cudaFree(ptr_);  // return code is not checked (release() also runs from the destructor)
+            } else {
+                cudaFree(ptr_);  // return code is not checked
+            }
+            ptr_ = nullptr;
+            bytes_ = 0;
+            device_id_ = -1;
+        }
+    }
+
+    void* data() {  // Raw pointer from cudaMalloc; nullptr while the buffer is empty.
+        return ptr_;
+    }
+
+    const void* data() const {  // Read-only overload of data(); nullptr while the buffer is empty.
+        return ptr_;
+    }
+
+    std::size_t bytes() const {  // Size passed to the last successful allocate(); 0 while empty.
+        return bytes_;
+    }
+
+    bool empty() const {  // True when there is no pointer or no bytes recorded.
+        return ptr_ == nullptr || bytes_ == 0;
+    }
+
+    int device_id() const {  // Device recorded by allocate(); -1 while the buffer is empty.
+        return device_id_;
+    }
+
+    void swap(DeviceBuffer& other) noexcept {  // Exchanges pointer, size, and device id; nothing is allocated or freed.
+        std::swap(ptr_, other.ptr_);
+        std::swap(bytes_, other.bytes_);
+        std::swap(device_id_, other.device_id_);
+    }
+
+private:
+    void* ptr_ = nullptr;
+    std::size_t bytes_ = 0;
+    int device_id_ = -1;
+};
+
+inline Status copy_h2d(void* dst_device, const void* src_host, std::size_t bytes, cudaStream_t stream = nullptr) {  // Issues a host-to-device cudaMemcpyAsync on `stream`; no synchronize call here.
+    return QUADTRIX_CUDA_CHECK(cudaMemcpyAsync(dst_device, src_host, bytes, cudaMemcpyHostToDevice, stream));
 }
-static inline void tensor_h2d(Tensor *dst, const Tensor *src, cudaStream_t s = 0)
-{
-    CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, dst->nbytes(), cudaMemcpyHostToDevice, s));
+
+inline Status copy_d2h(void* dst_host, const void* src_device, std::size_t bytes, cudaStream_t stream = nullptr) {  // Issues a device-to-host cudaMemcpyAsync on `stream`; no synchronize call here.
+    return QUADTRIX_CUDA_CHECK(cudaMemcpyAsync(dst_host, src_device, bytes, cudaMemcpyDeviceToHost, stream));
 }
-static inline void tensor_d2h(Tensor *dst, const Tensor *src, cudaStream_t s = 0)
-{
-    CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, dst->nbytes(), cudaMemcpyDeviceToHost, s));
+
+inline Status copy_d2d(void* dst_device, const void* src_device, std::size_t bytes, cudaStream_t stream = nullptr) {  // Issues a device-to-device cudaMemcpyAsync on `stream`; no synchronize call here.
+    return QUADTRIX_CUDA_CHECK(cudaMemcpyAsync(dst_device, src_device, bytes, cudaMemcpyDeviceToDevice, stream));
 }
-static inline void tensor_d2d(Tensor *dst, const Tensor *src, cudaStream_t s = 0)
-{
-    CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, dst->nbytes(), cudaMemcpyDeviceToDevice, s));
+
+inline Status memset_device(void* dst_device, int value, std::size_t bytes, cudaStream_t stream = nullptr) {  // Issues a cudaMemsetAsync on `stream`; failures come back as a Status.
+    return QUADTRIX_CUDA_CHECK(cudaMemsetAsync(dst_device, value, bytes, stream));
 }
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/cuda/includes/reduce.cuh b/cuda/includes/reduce.cuh
deleted file mode 100644
index c9649f7..0000000
--- a/cuda/includes/reduce.cuh
+++ /dev/null
@@ -1,66 +0,0 @@
-#pragma once
-#include "common.h"
-
-#ifdef __CUDACC__
-
-static constexpr unsigned FULL_MASK = 0xffffffff;
-// Warp reductions
-__device__ QX_INLINE float warpReduceSum(float v)
-{
-    v += __shfl_xor_sync(FULL_MASK, v, 16);
-    v += __shfl_xor_sync(FULL_MASK, v, 8);
-    v += __shfl_xor_sync(FULL_MASK, v, 4);
-    v += __shfl_xor_sync(FULL_MASK, v, 2);
-    v += __shfl_xor_sync(FULL_MASK, v, 1);
-    return v;
-}
-__device__ QX_INLINE float warpReduceMax(float v)
-{
-    v = fmaxf(v, __shfl_xor_sync(FULL_MASK, v, 16));
-    v = fmaxf(v, __shfl_xor_sync(FULL_MASK, v, 8));
-    v = fmaxf(v, __shfl_xor_sync(FULL_MASK, v, 4));
-    v = fmaxf(v, __shfl_xor_sync(FULL_MASK, v, 2));
-    v = fmaxf(v, __shfl_xor_sync(FULL_MASK, v, 1));
-    return v;
-}
-__device__ QX_INLINE float warpReduceMin(float v)
-{
-    v = fminf(v, __shfl_xor_sync(FULL_MASK, v, 16));
-    v = fminf(v, __shfl_xor_sync(FULL_MASK, v, 8));
-    v = fminf(v, __shfl_xor_sync(FULL_MASK, v, 4));
-    v = fminf(v, __shfl_xor_sync(FULL_MASK, v, 2));
-    v = fminf(v, __shfl_xor_sync(FULL_MASK, v, 1));
-    return v;
-}
-__device__ QX_INLINE float warpBroadcast(float v)
-{
-    return __shfl_sync(FULL_MASK, v, 0);
-}
-__device__ QX_INLINE float blockReduceSum(float v, float *smem)
-{
-    int lane = threadIdx.x % QX_WARP_SIZE;
-    int wid = threadIdx.x / QX_WARP_SIZE;
-    v = warpReduceSum(v);
-    if (lane == 0)
-        smem[wid] = v;
-    __syncthreads();
-    v = (threadIdx.x < blockDim.x / QX_WARP_SIZE) ? smem[lane] : 0.f;
-    if (wid == 0)
-        v = warpReduceSum(v);
-    return v;
-}
-__device__ QX_INLINE float blockReduceMax(float v, float *smem)
-{
-    int lane = threadIdx.x % QX_WARP_SIZE;
-    int wid = threadIdx.x / QX_WARP_SIZE;
-    v = warpReduceMax(v);
-    if (lane == 0)
-        smem[wid] = v;
-    __syncthreads();
-    v = (threadIdx.x < blockDim.x / QX_WARP_SIZE) ? smem[lane] : QX_NEG_INF_F32;
-    if (wid == 0)
-        v = warpReduceMax(v);
-    return v;
-}
-
-#endif // __CUDACC__
diff --git a/cuda/includes/runtime.cuh b/cuda/includes/runtime.cuh
new file mode 100644
index 0000000..c11ce9c
--- /dev/null
+++ b/cuda/includes/runtime.cuh
@@ -0,0 +1,93 @@
+#pragma once
+
+#include "common.h"
+
+#include <cuda_runtime.h>
+
+namespace quadtrix {
+namespace cuda {
+
+struct DeviceGuard {  // Scoped device switch: selects device_id on construction, restores the previous device on destruction.
+    int previous_device = -1;
+    bool changed = false;  // true only when the constructor actually called cudaSetDevice
+
+    explicit DeviceGuard(int device_id) {  // aborts the process if either CUDA call fails
+        QUADTRIX_CUDA_ABORT(cudaGetDevice(&previous_device));
+        if (previous_device != device_id) {
+            QUADTRIX_CUDA_ABORT(cudaSetDevice(device_id));
+            changed = true;
+        }
+    }
+
+    ~DeviceGuard() {
+        if (changed) {
+            cudaSetDevice(previous_device);  // return code is not checked
+        }
+    }
+
+    DeviceGuard(const DeviceGuard&) = delete;
+    DeviceGuard& operator=(const DeviceGuard&) = delete;
+};
+
+struct Stream {  // Move-only wrapper around a cudaStream_t that may or may not own the handle.
+    cudaStream_t handle = nullptr;
+    bool owns = false;  // when true, this object destroys `handle`
+
+    Stream() = default;
+
+    explicit Stream(cudaStream_t external_handle) : handle(external_handle), owns(false) {}  // non-owning wrap of an existing handle
+
+    static Stream create(unsigned int flags = cudaStreamNonBlocking) {  // new owned stream; aborts if creation fails
+        Stream stream;
+        QUADTRIX_CUDA_ABORT(cudaStreamCreateWithFlags(&stream.handle, flags));
+        stream.owns = true;
+        return stream;
+    }
+
+    ~Stream() {
+        if (owns && handle != nullptr) {
+            cudaStreamDestroy(handle);  // return code is not checked
+        }
+    }
+
+    Stream(const Stream&) = delete;
+    Stream& operator=(const Stream&) = delete;
+
+    Stream(Stream&& other) noexcept : handle(other.handle), owns(other.owns) {  // source is left null and non-owning
+        other.handle = nullptr;
+        other.owns = false;
+    }
+
+    Stream& operator=(Stream&& other) noexcept {
+        if (this == &other) {
+            return *this;  // self-move: nothing to do
+        }
+        if (owns && handle != nullptr) {
+            cudaStreamDestroy(handle);  // drop the stream currently owned before adopting other's
+        }
+        handle = other.handle;
+        owns = other.owns;
+        other.handle = nullptr;
+        other.owns = false;
+        return *this;
+    }
+
+    void synchronize() const {  // calls cudaStreamSynchronize(handle); aborts on a CUDA error
+        QUADTRIX_CUDA_ABORT(cudaStreamSynchronize(handle));
+    }
+};
+
+inline int current_device() {  // Device ordinal reported by cudaGetDevice; aborts the process on a CUDA error.
+    int device = 0;
+    QUADTRIX_CUDA_ABORT(cudaGetDevice(&device));
+    return device;
+}
+
+inline int device_count() {  // Count reported by cudaGetDeviceCount; aborts the process on a CUDA error.
+    int count = 0;
+    QUADTRIX_CUDA_ABORT(cudaGetDeviceCount(&count));
+    return count;
+}
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/cuda/includes/tensor.cuh b/cuda/includes/tensor.cuh
index c012893..c61d77e 100644
--- a/cuda/includes/tensor.cuh
+++ b/cuda/includes/tensor.cuh
@@ -1,100 +1,168 @@
 #pragma once
+
 #include "common.h"
-// TensorShape — dimensions + strides (row-major by default)
-struct QX_ALIGN_16 TensorShape
-{
-    int dims[QX_MAX_DIMS];
-    int strides[QX_MAX_DIMS];
-    int ndim;
-    int _pad;
-
-    QX_HOST_DEVICE QX_INLINE int64_t numel() const
-    {
-        int64_t n = 1;
-        for (int i = 0; i < ndim; i++)
-            n *= dims[i];
-        return n;
-    }
-
-    QX_HOST QX_INLINE void compute_strides()
-    {
-        strides[ndim - 1] = 1;
-        for (int i = ndim - 2; i >= 0; i--)
-            strides[i] = strides[i + 1] * dims[i + 1];
-    }
-
-    QX_HOST QX_INLINE bool is_contiguous() const
-    {
-        int expected = 1;
-        for (int i = ndim - 1; i >= 0; i--)
-        {
-            if (strides[i] != expected)
+#include "memory.cuh"
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+
+namespace quadtrix {
+namespace cuda {
+
+constexpr int kMaxTensorDims = 8;  // capacity of TensorShape::dims/strides and the highest rank contiguous() accepts
+
+struct TensorShape {  // Rank plus fixed-capacity dims/strides arrays; strides are counted in elements.
+    int rank = 0;  // 0 denotes an empty shape: numel() returns 0 for it
+    std::array<std::int64_t, kMaxTensorDims> dims{};
+    std::array<std::int64_t, kMaxTensorDims> strides{};
+
+    static TensorShape contiguous(const std::int64_t* sizes, int ndim) {  // Row-major shape from `sizes`; aborts on a bad rank or a non-positive dimension.
+        if (ndim < 1 || ndim > kMaxTensorDims) {
+            std::fprintf(stderr, "Tensor rank %d is outside supported range [1, %d]\n", ndim, kMaxTensorDims);
+            std::abort();
+        }
+
+        TensorShape shape;
+        shape.rank = ndim;
+        for (int i = 0; i < ndim; ++i) {
+            if (sizes[i] <= 0) {
+                std::fprintf(stderr, "Tensor dimension %d must be positive, got %lld\n", i, static_cast<long long>(sizes[i]));
+                std::abort();
+            }
+            shape.dims[i] = sizes[i];
+        }
+
+        std::int64_t stride = 1;  // innermost dimension gets stride 1
+        for (int i = ndim - 1; i >= 0; --i) {
+            shape.strides[i] = stride;
+            stride *= shape.dims[i];  // NOTE(review): this product is not overflow-checked, unlike numel()
+        }
+        return shape;
+    }
+
+    std::size_t numel() const {  // Element count; returns 0 for rank 0, for any non-positive dim, and when the product overflows size_t.
+        std::size_t total = 1;
+        for (int i = 0; i < rank; ++i) {
+            if (dims[i] <= 0) {
+                return 0;
+            }
+            std::size_t next = 0;
+            if (!checked_mul(total, static_cast<std::size_t>(dims[i]), &next)) {
+                return 0;  // overflow is reported as 0, indistinguishable from an empty shape
+            }
+            total = next;
+        }
+        return rank == 0 ? 0 : total;
+    }
+
+    bool is_contiguous() const {  // True when strides equal the row-major strides implied by dims.
+        std::int64_t expected = 1;
+        for (int i = rank - 1; i >= 0; --i) {
+            if (strides[i] != expected) {
                 return false;
+            }
             expected *= dims[i];
         }
         return true;
     }
 };
 
-static inline TensorShape make_shape(const int *d, int ndim)
-{
-    TensorShape s;
-    s.ndim = ndim;
-    s._pad = 0;
-    for (int i = 0; i < ndim; i++)
-        s.dims[i] = d[i];
-    for (int i = ndim; i < QX_MAX_DIMS; i++)
-    {
-        s.dims[i] = 1;
-        s.strides[i] = 1;
-    }
-    s.compute_strides();
-    return s;
-}
-static inline TensorShape make_shape1d(int a)
-{
-    int d[] = {a};
-    return make_shape(d, 1);
-}
-static inline TensorShape make_shape2d(int a, int b)
-{
-    int d[] = {a, b};
-    return make_shape(d, 2);
-}
-static inline TensorShape make_shape3d(int a, int b, int c)
-{
-    int d[] = {a, b, c};
-    return make_shape(d, 3);
-}
-static inline TensorShape make_shape4d(int a, int b, int c, int e)
-{
-    int d[] = {a, b, c, e};
-    return make_shape(d, 4);
-}
-// Tensor — primary data carrier (host struct, kernels get raw pointers)
-struct Tensor
-{
-    void *data;
+struct TensorView {  // Plain aggregate describing a tensor: raw pointer, shape, dtype, device tag, and device id.
+    void* data = nullptr;
     TensorShape shape;
-    DType dtype;
-    MemLocation mem_loc;
-    bool owns_data;
-    int device_id;
-    char name[64];
+    DType dtype = DType::F32;
+    DeviceKind device = DeviceKind::CUDA;
+    int device_id = 0;
+
+    std::size_t numel() const {  // forwards to shape.numel()
+        return shape.numel();
+    }
+
+    std::size_t bytes() const {  // numel() * dtype_size(dtype); returns 0 if that product overflows size_t
+        std::size_t out = 0;
+        if (!checked_mul(numel(), dtype_size(dtype), &out)) {
+            return 0;
+        }
+        return out;
+    }
 
     template <typename T>
-    QX_HOST_DEVICE QX_INLINE T *as()
-    {
-        return reinterpret_cast<T *>(data);
+    T* data_as() {  // plain static_cast of `data`; T is not checked against `dtype`
+        return static_cast<T*>(data);
     }
+
     template <typename T>
-    QX_HOST_DEVICE QX_INLINE const T *as() const
-    {
-        return reinterpret_cast<const T *>(data);
+    const T* data_as() const {  // const overload; same unchecked cast
+        return static_cast<const T*>(data);
     }
+};
+
+class Tensor {  // Move-only owner of one device allocation together with its shape, dtype, and device id.
+public:
+    Tensor() = default;
 
-    QX_HOST QX_INLINE size_t nbytes() const { return (size_t)shape.numel() * dtype_size(dtype); }
-    QX_HOST_DEVICE QX_INLINE int dim(int i) const { return shape.dims[i]; }
-    QX_HOST_DEVICE QX_INLINE int ndim() const { return shape.ndim; }
-    QX_HOST_DEVICE QX_INLINE int64_t numel() const { return shape.numel(); }
+    Tensor(const std::int64_t* dims, int rank, DType dtype, int device_id = 0)
+        : shape_(TensorShape::contiguous(dims, rank)), dtype_(dtype), device_id_(device_id) {
+        allocate();  // aborts on an invalid shape (inside contiguous()) or an unrepresentable byte size
+    }
+
+    Tensor(const Tensor&) = delete;
+    Tensor& operator=(const Tensor&) = delete;
+    Tensor(Tensor&&) noexcept = default;  // NOTE: a moved-from Tensor keeps its shape/dtype but its storage is empty
+    Tensor& operator=(Tensor&&) noexcept = default;
+
+    TensorView view() {  // non-owning view over this tensor's storage; valid only while the Tensor lives
+        return {storage_.data(), shape_, dtype_, DeviceKind::CUDA, device_id_};
+    }
+
+    TensorView view() const {  // same view from a const Tensor; const is cast away because TensorView::data is void*
+        return {const_cast<void*>(storage_.data()), shape_, dtype_, DeviceKind::CUDA, device_id_};
+    }
+
+    const TensorShape& shape() const {
+        return shape_;
+    }
+
+    DType dtype() const {
+        return dtype_;
+    }
+
+    int device_id() const {  // the id passed to the constructor, even if negative (storage then uses the current device)
+        return device_id_;
+    }
+
+    std::size_t numel() const {
+        return shape_.numel();
+    }
+
+    std::size_t bytes() const {  // size of the owned allocation; 0 for a default-constructed Tensor
+        return storage_.bytes();
+    }
+
+    void* data() {
+        return storage_.data();
+    }
+
+    const void* data() const {
+        return storage_.data();
+    }
+
+private:
+    void allocate() {  // sizes and allocates storage_ for shape_/dtype_; only reached after contiguous() validated the dims
+        std::size_t bytes = 0;
+        if (shape_.numel() == 0 || !checked_mul(shape_.numel(), dtype_size(dtype_), &bytes)) {  // numel() == 0 here can only mean the element count overflowed
+            std::fprintf(stderr, "Tensor allocation size overflow\n");
+            std::abort();
+        }
+        storage_.allocate(bytes, device_id_);
+    }
+
+    TensorShape shape_;
+    DType dtype_ = DType::F32;
+    int device_id_ = 0;
+    DeviceBuffer storage_;
 };
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/cuda/includes/utils.cuh b/cuda/includes/utils.cuh
index 84064ee..5595a89 100644
--- a/cuda/includes/utils.cuh
+++ b/cuda/includes/utils.cuh
@@ -1,9 +1,47 @@
 #pragma once
 
-// Aggregator — include this one header to get the full Day 1 runtime.
-// Each sub-header is small and independently loadable.
+#include "common.h"
 
-#include "common.h"   // macros, enums, error checks, dtype helpers
-#include "tensor.cuh" // TensorShape, Tensor struct
-#include "memory.cuh" // allocators, tensor_alloc_*, tensor_free, transfers
-#include "reduce.cuh" // warpReduceSum/Max/Min, blockReduceSum/Max
+#include <cuda_runtime.h>
+
+#include <cstddef>
+
+namespace quadtrix {
+namespace cuda {
+
+constexpr int kWarpSize = 32;
+constexpr int kDefaultBlockSize = 256;
+
+inline int ceil_div(int value, int divisor) {  // ceil(value / divisor) for value >= 0 and divisor > 0
+    return (value + divisor - 1) / divisor;  // the intermediate sum can overflow int for values near INT_MAX
+}
+
+inline std::size_t ceil_div(std::size_t value, std::size_t divisor) {  // ceil(value / divisor); divisor must be non-zero
+    return (value + divisor - 1) / divisor;  // unsigned sum wraps around if value + divisor - 1 exceeds SIZE_MAX
+}
+
+inline dim3 one_dim_grid(std::size_t n, int block_size = kDefaultBlockSize) {  // 1-D grid of ceil(n / block_size) blocks; n == 0 gives 0 blocks
+    return dim3(static_cast<unsigned int>(ceil_div(n, static_cast<std::size_t>(block_size))));  // block_size must be > 0; the block count is narrowed to unsigned int
+}
+
+#ifdef __CUDACC__
+template <typename T>  // Warp-level sum built from shuffle-down steps with offsets 16, 8, 4, 2, 1.
+__device__ __forceinline__ T warp_sum(T value) {  // Lane 0 ends with the sum over all 32 lanes; other lanes hold partial results.
+    for (int offset = kWarpSize / 2; offset > 0; offset >>= 1) {
+        value += __shfl_down_sync(0xffffffffu, value, offset);  // full mask: every lane of the warp must reach this call
+    }
+    return value;
+}
+
+template <typename T>  // Warp-level maximum built from shuffle-down steps with offsets 16, 8, 4, 2, 1.
+__device__ __forceinline__ T warp_max(T value) {  // Lane 0 ends with the maximum over all 32 lanes; other lanes hold partial results.
+    for (int offset = kWarpSize / 2; offset > 0; offset >>= 1) {
+        T other = __shfl_down_sync(0xffffffffu, value, offset);  // full mask: every lane of the warp must reach this call
+        value = value > other ? value : other;  // keeps value when the two compare equal
+    }
+    return value;
+}
+#endif
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/cuda/llm_train.cu b/cuda/llm_train.cu
new file mode 100644
index 0000000..244bb56
--- /dev/null
+++ b/cuda/llm_train.cu
@@ -0,0 +1,174 @@
+#include "includes/adamw.cuh"
+#include "includes/checkpoint.h"
+#include "includes/common.h"
+#include "includes/dataloader.h"
+#include "includes/global_norm.cuh"
+#include "includes/logger.h"
+#include "includes/memory.cuh"
+#include "includes/runtime.cuh"
+#include "includes/schedulers.h"
+#include "includes/tensor.cuh"
+
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <cstdint>
+
+namespace quadtrix {
+namespace cuda {
+namespace {
+
+struct TrainingConfig {  // Settings for one training run; fields annotated with a flag can be overridden from the command line.
+    int batch_size = 1;  // --batch=
+    int sequence_length = 8;  // --seq=
+    int channels = 32;  // --channels=; validate_config requires a multiple of num_heads
+    int num_heads = 4;  // --heads=
+    int vocab_size = 256;  // --vocab=
+    int total_steps = 1;  // --steps=
+    int warmup_steps = 1;  // passed to cosine_learning_rate; no CLI flag
+    float max_lr = 1.0e-4f;  // passed to cosine_learning_rate
+    float min_lr = 1.0e-5f;  // passed to cosine_learning_rate
+    float beta1 = 0.9f;  // copied into AdamWConfig
+    float beta2 = 0.95f;  // copied into AdamWConfig
+    float epsilon = 1.0e-8f;  // copied into AdamWConfig
+    float weight_decay = 0.01f;  // copied into AdamWConfig
+    float grad_clip = 1.0f;  // NOTE(review): never read anywhere in this file
+};
+
+struct TrainingBuffers {  // Tensors used by the training run; shapes are chosen in allocate_buffers().
+    Tensor params;  // F32, channels * channels elements
+    Tensor grads;  // same shape as params
+    Tensor first_moment;  // same shape as params; passed to adamw_update
+    Tensor second_moment;  // same shape as params; passed to adamw_update
+    Tensor norm_partials;  // F32, 256 elements; allocated but not otherwise used in this file
+    Tensor activations;  // F32, [batch_size, sequence_length, channels]; allocated but not otherwise used in this file
+    Tensor grad_activations;  // same shape as activations
+};
+
+bool parse_int_arg(const char* arg, const char* prefix, int* out) {  // Returns false (leaving *out untouched) when arg does not start with prefix.
+    const std::size_t prefix_len = std::strlen(prefix);
+    if (std::strncmp(arg, prefix, prefix_len) != 0) return false;
+    char* end = nullptr;  // strtol, unlike atoi, has defined overflow behavior and reports where parsing stopped
+    const long parsed = std::strtol(arg + prefix_len, &end, 10);
+    // Empty, trailing-junk, or does-not-fit-in-int values store 0, which validate_config() rejects.
+    *out = (end != arg + prefix_len && *end == '\0' && parsed == static_cast<int>(parsed)) ? static_cast<int>(parsed) : 0;
+    return true;
+}
+
+TrainingConfig parse_config(int argc, char** argv) {  // Starts from TrainingConfig defaults and applies --name=value overrides.
+    TrainingConfig config;
+    for (int i = 1; i < argc; ++i) {  // argv[0] is skipped
+        parse_int_arg(argv[i], "--steps=", &config.total_steps) ||  // || chain stops at the first prefix that matches
+            parse_int_arg(argv[i], "--batch=", &config.batch_size) ||
+            parse_int_arg(argv[i], "--seq=", &config.sequence_length) ||
+            parse_int_arg(argv[i], "--channels=", &config.channels) ||
+            parse_int_arg(argv[i], "--heads=", &config.num_heads) ||
+            parse_int_arg(argv[i], "--vocab=", &config.vocab_size);  // result discarded: an argument matching no prefix is silently ignored
+    }
+    return config;
+}
+
+bool validate_config(const TrainingConfig& config) {  // True when every size and total_steps is positive and channels divides evenly by num_heads.
+    if (config.batch_size <= 0 || config.sequence_length <= 0 || config.channels <= 0 ||
+        config.num_heads <= 0 || config.vocab_size <= 0 || config.total_steps <= 0) {
+        return false;
+    }
+    return config.channels % config.num_heads == 0;  // num_heads > 0 here, so the modulo is safe; warmup_steps and the float fields are not checked
+}
+
+TrainingBuffers allocate_buffers(const TrainingConfig& config, int device_id) {  // Allocates every F32 training tensor on device_id.
+    // Widen before multiplying: int * int overflows (undefined behavior) once channels exceeds 46340.
+    const std::int64_t param_dims[] = {static_cast<std::int64_t>(config.channels) * config.channels};
+    const std::int64_t partial_dims[] = {256};
+    const std::int64_t activation_dims[] = {config.batch_size, config.sequence_length, config.channels};
+    TrainingBuffers buffers;
+    buffers.params = Tensor(param_dims, 1, DType::F32, device_id);
+    buffers.grads = Tensor(param_dims, 1, DType::F32, device_id);
+    buffers.first_moment = Tensor(param_dims, 1, DType::F32, device_id);
+    buffers.second_moment = Tensor(param_dims, 1, DType::F32, device_id);
+    buffers.norm_partials = Tensor(partial_dims, 1, DType::F32, device_id);
+    buffers.activations = Tensor(activation_dims, 3, DType::F32, device_id);
+    buffers.grad_activations = Tensor(activation_dims, 3, DType::F32, device_id);
+    return buffers;
+}
+
+Status zero_training_buffers(TrainingBuffers& buffers, cudaStream_t stream) {  // Zero-fills params, grads and both moment tensors via memset_device on stream.
+    Status status = memset_device(buffers.params.data(), 0, buffers.params.bytes(), stream);
+    if (!status.ok) {
+        return status;  // the first failing Status is returned and later buffers are not touched
+    }
+    status = memset_device(buffers.grads.data(), 0, buffers.grads.bytes(), stream);
+    if (!status.ok) {
+        return status;
+    }
+    status = memset_device(buffers.first_moment.data(), 0, buffers.first_moment.bytes(), stream);
+    if (!status.ok) {
+        return status;
+    }
+    return memset_device(buffers.second_moment.data(), 0, buffers.second_moment.bytes(), stream);  // NOTE(review): norm_partials, activations and grad_activations are not zeroed here
+}
+
+Status run_optimizer_step(TrainingBuffers& buffers, const TrainingConfig& config, int step, cudaStream_t stream) {  // Fills an AdamWConfig for this step and returns adamw_update's Status.
+    AdamWConfig adamw;
+    adamw.learning_rate = cosine_learning_rate(step, config.warmup_steps, config.total_steps, config.max_lr, config.min_lr);
+    adamw.beta1 = config.beta1;
+    adamw.beta2 = config.beta2;
+    adamw.epsilon = config.epsilon;
+    adamw.weight_decay = config.weight_decay;
+    adamw.step = step + 1;  // the caller's step is 0-based; AdamWConfig receives a 1-based count
+
+    return adamw_update(
+        buffers.params.view(),
+        buffers.grads.view(),
+        buffers.first_moment.view(),
+        buffers.second_moment.view(),
+        adamw,
+        1.0f,  // NOTE(review): this argument's meaning is defined by adamw_update (not visible here); config.grad_clip is not used
+        stream);
+}
+
+}  // namespace
+}  // namespace cuda
+}  // namespace quadtrix
+
+int main(int argc, char** argv) {
+    using namespace quadtrix::cuda;
+    // Exit codes: 1 invalid config, 2 no CUDA device, 3 buffer zeroing failed, 4 optimizer step failed.
+    const TrainingConfig config = parse_config(argc, argv);
+    if (!validate_config(config)) {
+        log_message(LogLevel::Error, "invalid training config");
+        return 1;
+    }
+
+    const int devices = device_count();
+    if (devices <= 0) {
+        log_message(LogLevel::Error, "no CUDA devices found");
+        return 2;
+    }
+
+    DeviceGuard guard(0);
+    Stream stream = Stream::create();
+    TrainingBuffers buffers = allocate_buffers(config, 0);
+    if (!zero_training_buffers(buffers, stream.handle).ok) {
+        log_message(LogLevel::Error, "failed to zero training buffers");  // every other failure path logs; this one used to exit silently
+        return 3;
+    }
+
+    DataLoader dataloader;
+    TokenBatchView batch;
+    (void)dataloader.next(&batch);  // result deliberately discarded; the batch is not consumed below
+
+    for (int step = 0; step < config.total_steps; ++step) {
+        const float lr = cosine_learning_rate(step, config.warmup_steps, config.total_steps, config.max_lr, config.min_lr);  // recomputed only for the log line
+        log_message(LogLevel::Info, "step %d lr %.8f", step + 1, lr);
+
+        Status update_status = run_optimizer_step(buffers, config, step, stream.handle);
+        if (!update_status.ok) {
+            log_message(LogLevel::Error, "optimizer step failed");
+            return 4;
+        }
+    }
+
+    stream.synchronize();
+    log_message(LogLevel::Info, "Quadtrix CUDA training orchestrator finished dry run on %d device(s)", devices);
+    return 0;
+}
diff --git a/cuda/tests/day1_smoke.cu b/cuda/tests/day1_smoke.cu
new file mode 100644
index 0000000..a78efce
--- /dev/null
+++ b/cuda/tests/day1_smoke.cu
@@ -0,0 +1,42 @@
+#include "../includes/memory.cuh"
+#include "../includes/runtime.cuh"
+#include "../includes/tensor.cuh"
+
+#include <cstdio>
+
+int main() {
+    using namespace quadtrix::cuda;
+    // Round-trips six floats host -> device -> host. Exit codes: 1 no device, 2 H2D copy failed, 3 D2H copy failed, 4 data mismatch.
+    if (device_count() <= 0) {
+        std::fprintf(stderr, "No CUDA devices found\n");
+        return 1;
+    }
+
+    const std::int64_t dims[] = {2, 3};
+    Tensor tensor(dims, 2, DType::F32, 0);
+
+    float host_in[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+    float host_out[6] = {};
+
+    Stream stream = Stream::create();
+    if (!copy_h2d(tensor.data(), host_in, sizeof(host_in), stream.handle).ok) {
+        std::fprintf(stderr, "copy_h2d failed\n");  // previously a silent exit, unlike the other failure paths
+        return 2;
+    }
+
+    if (!copy_d2h(host_out, tensor.data(), sizeof(host_out), stream.handle).ok) {
+        std::fprintf(stderr, "copy_d2h failed\n");
+        return 3;
+    }
+    stream.synchronize();  // host_out is only read after the stream has been synchronized
+
+    for (int i = 0; i < 6; ++i) {
+        if (host_out[i] != host_in[i]) {  // exact compare is intended: the data is copied, never computed
+            std::fprintf(stderr, "Mismatch at %d: got %f expected %f\n", i, host_out[i], host_in[i]);
+            return 4;
+        }
+    }
+
+    std::printf("Day 1 CUDA runtime smoke test passed\n");
+    return 0;
+}
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..8191856
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,34 @@
+services:
+  quadtrix:
+    image: ghcr.io/eamon2009/quadtrix.cpp:latest
+    build:
+      context: .
+      dockerfile: Dockerfile
+      args:
+        # CPU base image by default. For a CUDA build use the value below instead (confirm the tag exists first):
+        # BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu24.04
+        BASE_IMAGE: ubuntu:24.04
+
+    ports:
+      - "3001:3001" # FastAPI backend
+      - "8080:8080" # React frontend
+
+    volumes:
+      # Place best_model.pt and/or best_model.bin inside ./models/ on the host
+      - ./models:/app/models
+
+    environment:
+      TORCH_CHECKPOINT_PATH: /app/models/best_model.pt
+      GPT_MODEL_PATH: /app/models/best_model.bin
+      CORS_ORIGINS: http://localhost:8080
+      LOG_LEVEL: INFO
+      MAX_SESSIONS: "1000" # environment values are strings; quoted to avoid YAML integer typing
+      SESSION_TTL_HOURS: "24"
+    restart: unless-stopped
+
+    healthcheck:
+      test: [ "CMD", "curl", "-f", "http://localhost:3001/api/health" ] # requires curl inside the image
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 20s
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
new file mode 100644
index 0000000..a9fbdfe
--- /dev/null
+++ b/docker-entrypoint.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+set -e  # abort on the first failing command
+echo ""
+echo "  ██████╗ ██╗   ██╗ █████╗ ██████╗ ████████╗██████╗ ██╗██╗  ██╗"
+echo "  ██╔═══██╗██║   ██║██╔══██╗██╔══██╗╚══██╔══╝██╔══██╗██║╚██╗██╔╝"
+echo "  ██║   ██║██║   ██║███████║██║  ██║   ██║   ██████╔╝██║ ╚███╔╝ "
+echo "  ██║▄▄ ██║██║   ██║██╔══██║██║  ██║   ██║   ██╔══██╗██║ ██╔██╗ "
+echo "  ╚██████╔╝╚██████╔╝██║  ██║██████╔╝   ██║   ██║  ██║██║██╔╝ ██╗"
+echo "   ╚══▀▀═╝  ╚═════╝ ╚═╝  ╚═╝╚═════╝    ╚═╝   ╚═╝  ╚═╝╚═╝╚═╝  ╚═╝"
+echo ""
+echo "  Starting Quadtrix.cpp..."
+echo ""
+echo "  FastAPI backend  at  http://localhost:3001"
+echo "  React frontend   at  http://localhost:8080"
+echo "  Models volume    at  /app/models"
+echo ""
+WEIGHTS_FOUND=0  # set to 1 when either checkpoint file exists
+if [ -f "/app/models/best_model.pt" ]; then
+    echo " PyTorch checkpoint found: /app/models/best_model.pt"
+    WEIGHTS_FOUND=1
+fi
+
+if [ -f "/app/models/best_model.bin" ]; then
+    echo "  C++ checkpoint found:     /app/models/best_model.bin"
+    WEIGHTS_FOUND=1
+fi
+
+if [ "$WEIGHTS_FOUND" = "0" ]; then
+    echo ""
+    echo "  WARNING: No model weights found in /app/models"
+    echo "      The backend will start but inference will fail until weights are mounted."
+    echo "      Mount your weights directory:"
+    echo "        docker run -v /path/to/your/models:/app/models ..."
+    echo ""
+fi
+exec /usr/bin/supervisord -c /etc/supervisor/supervisord.conf
\ No newline at end of file
diff --git a/engine/engine.c b/engine/engine.c
deleted file mode 100644
index 1b494fa..0000000
--- a/engine/engine.c
+++ /dev/null
@@ -1,485 +0,0 @@
-/*
- *
- * No dependencies 
- * 
- * Compile: gcc -O3 -o gpt_inference chat.c -lm
- * Usage: ./gpt_inference model.bin
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <time.h>
-//  CONFIGURATION (must match Python training config)
-
-#define VOCAB_SIZE 50257  // GPT-2 tokenizer vocabulary size
-#define N_EMBD 64
-#define N_HEAD 4
-#define N_LAYER 4
-#define BLOCK_SIZE 32
-#define MAX_TOKENS 500
-//  DATA STRUCTURES
-
-typedef struct {
-    float* data;
-    int rows;
-    int cols;
-} Matrix;
-
-typedef struct {
-    float* data;
-    int size;
-} Vector;
-
-typedef struct {
-    // Head components
-    Matrix key_weight;
-    Matrix query_weight;
-    Matrix value_weight;
-} Head;
-
-typedef struct {
-    Head* heads;
-    Matrix proj_weight;
-    float* proj_bias;
-    int num_heads;
-    int head_size;
-} MultiHeadAttention;
-
-typedef struct {
-    Matrix fc1_weight;
-    float* fc1_bias;
-    Matrix fc2_weight;
-    float* fc2_bias;
-} FeedForward;
-
-typedef struct {
-    MultiHeadAttention attn;
-    FeedForward ffwd;
-    float* ln1_weight;
-    float* ln1_bias;
-    float* ln2_weight;
-    float* ln2_bias;
-} Block;
-
-typedef struct {
-    Matrix token_embedding;
-    Matrix position_embedding;
-    Block* blocks;
-    float* ln_f_weight;
-    float* ln_f_bias;
-    Matrix lm_head_weight;
-    float* lm_head_bias;
-} GPTModel;
-
-//  MEMORY MANAGEMENT
-
-
-Matrix create_matrix(int rows, int cols) {
-    Matrix m;
-    m.rows = rows;
-    m.cols = cols;
-    m.data = (float*)calloc(rows * cols, sizeof(float));
-    return m;
-}
-
-Vector create_vector(int size) {
-    Vector v;
-    v.size = size;
-    v.data = (float*)calloc(size, sizeof(float));
-    return v;
-}
-
-void free_matrix(Matrix* m) {
-    if (m->data) free(m->data);
-    m->data = NULL;
-}
-
-void free_vector(Vector* v) {
-    if (v->data) free(v->data);
-    v->data = NULL;
-}
-
-void matmul(float* out, float* a, float* b, int m, int n, int k) {
-    // out(m,k) = a(m,n) @ b(n,k)
-    for (int i = 0; i < m; i++) {
-        for (int j = 0; j < k; j++) {
-            float sum = 0.0f;
-            for (int l = 0; l < n; l++) {
-                sum += a[i * n + l] * b[l * k + j];
-            }
-            out[i * k + j] = sum;
-        }
-    }
-}
-
-void softmax(float* x, int size) {
-    float max_val = x[0];
-    for (int i = 1; i < size; i++) {
-        if (x[i] > max_val) max_val = x[i];
-    }
-    
-    float sum = 0.0f;
-    for (int i = 0; i < size; i++) {
-        x[i] = expf(x[i] - max_val);
-        sum += x[i];
-    }
-    
-    for (int i = 0; i < size; i++) {
-        x[i] /= sum;
-    }
-}
-
-void layer_norm(float* out, float* x, float* weight, float* bias, int size) {
-    float mean = 0.0f;
-    for (int i = 0; i < size; i++) {
-        mean += x[i];
-    }
-    mean /= size;
-    
-    float variance = 0.0f;
-    for (int i = 0; i < size; i++) {
-        float diff = x[i] - mean;
-        variance += diff * diff;
-    }
-    variance /= size;
-    
-    float std = sqrtf(variance + 1e-5f);
-    
-    for (int i = 0; i < size; i++) {
-        out[i] = (x[i] - mean) / std * weight[i] + bias[i];
-    }
-}
-
-void relu(float* x, int size) {
-    for (int i = 0; i < size; i++) {
-        if (x[i] < 0) x[i] = 0;
-    }
-}
-
-// 
-//  MODEL OPERATIONS
-
-void attention_head_forward(float* out, float* x, Head* head, int T, int head_size) {
-    // Allocate buffers
-    float* q = (float*)malloc(T * head_size * sizeof(float));
-    float* k = (float*)malloc(T * head_size * sizeof(float));
-    float* v = (float*)malloc(T * head_size * sizeof(float));
-    float* scores = (float*)malloc(T * T * sizeof(float));
-    
-    // Compute Q, K, V
-    matmul(q, x, head->query_weight.data, T, N_EMBD, head_size);
-    matmul(k, x, head->key_weight.data, T, N_EMBD, head_size);
-    matmul(v, x, head->value_weight.data, T, N_EMBD, head_size);
-    
-    // Compute attention scores: Q @ K^T / sqrt(head_size)
-    float scale = 1.0f / sqrtf((float)head_size);
-    for (int i = 0; i < T; i++) {
-        for (int j = 0; j < T; j++) {
-            float sum = 0.0f;
-            for (int d = 0; d < head_size; d++) {
-                sum += q[i * head_size + d] * k[j * head_size + d];
-            }
-            scores[i * T + j] = sum * scale;
-            
-            // Causal mask
-            if (j > i) {
-                scores[i * T + j] = -INFINITY;
-            }
-        }
-    }
-    
-    // Apply softmax to each row
-    for (int i = 0; i < T; i++) {
-        softmax(&scores[i * T], T);
-    }
-    
-    // Apply attention to values
-    matmul(out, scores, v, T, T, head_size);
-    
-    free(q);
-    free(k);
-    free(v);
-    free(scores);
-}
-
-void multi_head_attention_forward(float* out, float* x, MultiHeadAttention* mha, int T) {
-    int head_size = mha->head_size;
-    float* concat = (float*)malloc(T * N_EMBD * sizeof(float));
-    float* head_out = (float*)malloc(T * head_size * sizeof(float));
-    
-    // Run each head
-    for (int h = 0; h < mha->num_heads; h++) {
-        attention_head_forward(head_out, x, &mha->heads[h], T, head_size);
-        
-        // Copy to concat buffer
-        for (int t = 0; t < T; t++) {
-            for (int d = 0; d < head_size; d++) {
-                concat[t * N_EMBD + h * head_size + d] = head_out[t * head_size + d];
-            }
-        }
-    }
-    
-    // Project concatenated heads
-    matmul(out, concat, mha->proj_weight.data, T, N_EMBD, N_EMBD);
-    
-    // Add bias
-    for (int t = 0; t < T; t++) {
-        for (int d = 0; d < N_EMBD; d++) {
-            out[t * N_EMBD + d] += mha->proj_bias[d];
-        }
-    }
-    
-    free(concat);
-    free(head_out);
-}
-
-void feedforward_forward(float* out, float* x, FeedForward* ff, int T) {
-    float* hidden = (float*)malloc(T * 4 * N_EMBD * sizeof(float));
-    
-    // First layer
-    matmul(hidden, x, ff->fc1_weight.data, T, N_EMBD, 4 * N_EMBD);
-    for (int i = 0; i < T * 4 * N_EMBD; i++) {
-        hidden[i] += ff->fc1_bias[i % (4 * N_EMBD)];
-    }
-    relu(hidden, T * 4 * N_EMBD);
-    
-    // Second layer
-    matmul(out, hidden, ff->fc2_weight.data, T, 4 * N_EMBD, N_EMBD);
-    for (int i = 0; i < T * N_EMBD; i++) {
-        out[i] += ff->fc2_bias[i % N_EMBD];
-    }
-    
-    free(hidden);
-}
-
-void block_forward(float* out, float* x, Block* block, int T) {
-    float* attn_out = (float*)malloc(T * N_EMBD * sizeof(float));
-    float* ln1_out = (float*)malloc(T * N_EMBD * sizeof(float));
-    float* ln2_out = (float*)malloc(T * N_EMBD * sizeof(float));
-    float* ff_out = (float*)malloc(T * N_EMBD * sizeof(float));
-    
-    // Layer norm 1
-    for (int t = 0; t < T; t++) {
-        layer_norm(&ln1_out[t * N_EMBD], &x[t * N_EMBD], 
-                   block->ln1_weight, block->ln1_bias, N_EMBD);
-    }
-    
-    // Attention + residual
-    multi_head_attention_forward(attn_out, ln1_out, &block->attn, T);
-    for (int i = 0; i < T * N_EMBD; i++) {
-        attn_out[i] += x[i];
-    }
-    
-    // Layer norm 2
-    for (int t = 0; t < T; t++) {
-        layer_norm(&ln2_out[t * N_EMBD], &attn_out[t * N_EMBD], 
-                   block->ln2_weight, block->ln2_bias, N_EMBD);
-    }
-    
-    // Feedforward + residual
-    feedforward_forward(ff_out, ln2_out, &block->ffwd, T);
-    for (int i = 0; i < T * N_EMBD; i++) {
-        out[i] = ff_out[i] + attn_out[i];
-    }
-    
-    free(attn_out);
-    free(ln1_out);
-    free(ln2_out);
-    free(ff_out);
-}
-
-void gpt_forward(float* logits, GPTModel* model, int* tokens, int T) {
-    float* x = (float*)malloc(T * N_EMBD * sizeof(float));
-    float* block_out = (float*)malloc(T * N_EMBD * sizeof(float));
-    
-    // Token + position embeddings
-    for (int t = 0; t < T; t++) {
-        int tok = tokens[t];
-        for (int d = 0; d < N_EMBD; d++) {
-            x[t * N_EMBD + d] = model->token_embedding.data[tok * N_EMBD + d] +
-                                 model->position_embedding.data[t * N_EMBD + d];
-        }
-    }
-    
-    // Run through blocks
-    for (int layer = 0; layer < N_LAYER; layer++) {
-        block_forward(block_out, x, &model->blocks[layer], T);
-        memcpy(x, block_out, T * N_EMBD * sizeof(float));
-    }
-    
-    // Final layer norm
-    for (int t = 0; t < T; t++) {
-        layer_norm(&block_out[t * N_EMBD], &x[t * N_EMBD], 
-                   model->ln_f_weight, model->ln_f_bias, N_EMBD);
-    }
-    
-    // LM head (only compute for last token)
-    matmul(logits, &block_out[(T-1) * N_EMBD], model->lm_head_weight.data, 
-           1, N_EMBD, VOCAB_SIZE);
-    
-    if (model->lm_head_bias) {
-        for (int i = 0; i < VOCAB_SIZE; i++) {
-            logits[i] += model->lm_head_bias[i];
-        }
-    }
-    
-    free(x);
-    free(block_out);
-}
-
-int sample_token(float* logits) {
-    softmax(logits, VOCAB_SIZE);
-    
-    float r = (float)rand() / RAND_MAX;
-    float cumsum = 0.0f;
-    
-    for (int i = 0; i < VOCAB_SIZE; i++) {
-        cumsum += logits[i];
-        if (r < cumsum) {
-            return i;
-        }
-    }
-    
-    return VOCAB_SIZE - 1;
-}
-
-
-//  MODEL LOADING
-
-
-int load_model(GPTModel* model, const char* filename) {
-    FILE* f = fopen(filename, "rb");
-    if (!f) {
-        fprintf(stderr, "Error: Cannot open model file %s\n", filename);
-        return 0;
-    }
-    
-    // Allocate model components
-    model->token_embedding = create_matrix(VOCAB_SIZE, N_EMBD);
-    model->position_embedding = create_matrix(BLOCK_SIZE, N_EMBD);
-    model->blocks = (Block*)malloc(N_LAYER * sizeof(Block));
-    model->ln_f_weight = (float*)malloc(N_EMBD * sizeof(float));
-    model->ln_f_bias = (float*)malloc(N_EMBD * sizeof(float));
-    model->lm_head_weight = create_matrix(N_EMBD, VOCAB_SIZE);
-    model->lm_head_bias = (float*)malloc(VOCAB_SIZE * sizeof(float));
-    
-    int head_size = N_EMBD / N_HEAD;
-    
-    // Load embeddings
-    fread(model->token_embedding.data, sizeof(float), VOCAB_SIZE * N_EMBD, f);
-    fread(model->position_embedding.data, sizeof(float), BLOCK_SIZE * N_EMBD, f);
-    
-    // Load blocks
-    for (int layer = 0; layer < N_LAYER; layer++) {
-        Block* block = &model->blocks[layer];
-        
-        // Multi-head attention
-        block->attn.num_heads = N_HEAD;
-        block->attn.head_size = head_size;
-        block->attn.heads = (Head*)malloc(N_HEAD * sizeof(Head));
-        
-        for (int h = 0; h < N_HEAD; h++) {
-            block->attn.heads[h].key_weight = create_matrix(N_EMBD, head_size);
-            block->attn.heads[h].query_weight = create_matrix(N_EMBD, head_size);
-            block->attn.heads[h].value_weight = create_matrix(N_EMBD, head_size);
-            
-            fread(block->attn.heads[h].key_weight.data, sizeof(float), N_EMBD * head_size, f);
-            fread(block->attn.heads[h].query_weight.data, sizeof(float), N_EMBD * head_size, f);
-            fread(block->attn.heads[h].value_weight.data, sizeof(float), N_EMBD * head_size, f);
-        }
-        
-        block->attn.proj_weight = create_matrix(N_EMBD, N_EMBD);
-        block->attn.proj_bias = (float*)malloc(N_EMBD * sizeof(float));
-        fread(block->attn.proj_weight.data, sizeof(float), N_EMBD * N_EMBD, f);
-        fread(block->attn.proj_bias, sizeof(float), N_EMBD, f);
-        
-        // Layer norms
-        block->ln1_weight = (float*)malloc(N_EMBD * sizeof(float));
-        block->ln1_bias = (float*)malloc(N_EMBD * sizeof(float));
-        block->ln2_weight = (float*)malloc(N_EMBD * sizeof(float));
-        block->ln2_bias = (float*)malloc(N_EMBD * sizeof(float));
-        fread(block->ln1_weight, sizeof(float), N_EMBD, f);
-        fread(block->ln1_bias, sizeof(float), N_EMBD, f);
-        fread(block->ln2_weight, sizeof(float), N_EMBD, f);
-        fread(block->ln2_bias, sizeof(float), N_EMBD, f);
-        
-        // Feedforward
-        block->ffwd.fc1_weight = create_matrix(N_EMBD, 4 * N_EMBD);
-        block->ffwd.fc1_bias = (float*)malloc(4 * N_EMBD * sizeof(float));
-        block->ffwd.fc2_weight = create_matrix(4 * N_EMBD, N_EMBD);
-        block->ffwd.fc2_bias = (float*)malloc(N_EMBD * sizeof(float));
-        fread(block->ffwd.fc1_weight.data, sizeof(float), N_EMBD * 4 * N_EMBD, f);
-        fread(block->ffwd.fc1_bias, sizeof(float), 4 * N_EMBD, f);
-        fread(block->ffwd.fc2_weight.data, sizeof(float), 4 * N_EMBD * N_EMBD, f);
-        fread(block->ffwd.fc2_bias, sizeof(float), N_EMBD, f);
-    }
-    
-    // Final layer norm and LM head
-    fread(model->ln_f_weight, sizeof(float), N_EMBD, f);
-    fread(model->ln_f_bias, sizeof(float), N_EMBD, f);
-    fread(model->lm_head_weight.data, sizeof(float), N_EMBD * VOCAB_SIZE, f);
-    fread(model->lm_head_bias, sizeof(float), VOCAB_SIZE, f);
-    
-    fclose(f);
-    printf(" Model loaded successfully from %s\n", filename);
-    return 1;
-}
-
-
-//  MAIN
-
-
-int main(int argc, char** argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s <model.bin>\n", argv[0]);
-        return 1;
-    }
-    
-    srand(time(NULL));
-    
-    GPTModel model;
-    if (!load_model(&model, argv[1])) {
-        return 1;
-    }
-    
-    printf("\n------------------------------------------------------------------------------\n");
-    printf("  QUADTRIX Engine (Pure C)\n");
-    printf("------------------------------------------------------------------------------\n\n");
-    
-    // Example: Simple generation (you'll need to implement proper tokenization)
-    int tokens[BLOCK_SIZE];
-    tokens[0] = 198;  // Example token (newline in GPT-2)
-    int n_tokens = 1;
-    
-    printf("Generating text (simplified - add BPE tokenizer for full functionality)...\n\n");
-    
-    float* logits = (float*)malloc(VOCAB_SIZE * sizeof(float));
-    
-    for (int i = 0; i < 50; i++) {
-        int context_len = n_tokens < BLOCK_SIZE ? n_tokens : BLOCK_SIZE;
-        int* context = &tokens[n_tokens - context_len];
-        
-        gpt_forward(logits, &model, context, context_len);
-        int next_token = sample_token(logits);
-        
-        printf("%d ", next_token);
-        fflush(stdout);
-        
-        if (n_tokens < MAX_TOKENS) {
-            tokens[n_tokens++] = next_token;
-        } else {
-            // Shift tokens left
-            memmove(tokens, tokens + 1, (MAX_TOKENS - 1) * sizeof(int));
-            tokens[MAX_TOKENS - 1] = next_token;
-        }
-    }
-    
-    printf("\n\n");
-    free(logits);
-    
-    printf("------------------------------------------------------------------------------\n");
-    
-    return 0;
-}
\ No newline at end of file
diff --git a/engine/export_weights.py b/engine/export_weights.py
deleted file mode 100644
index d8cb45c..0000000
--- a/engine/export_weights.py
+++ /dev/null
@@ -1,131 +0,0 @@
-import torch
-import sys
-import os
-import numpy as np
-
-def convert_model_to_bin(pt_path, bin_path):
-    """Convert PyTorch .pt file to binary format for C inference"""
-
-    print(f"Loading PyTorch model from {pt_path}...")
-    state_dict = torch.load(pt_path, map_location='cpu')
-
-    print("Converting to binary format...")
-
-    with open(bin_path, 'wb') as f:
-        # Write embeddings
-        print("  - Token embeddings")
-        token_emb = state_dict['token_embedding_table.weight'].numpy().flatten()
-        f.write(token_emb.tobytes())
-
-        print("  - Position embeddings")
-        pos_emb = state_dict['position_embedding_table.weight'].numpy().flatten()
-        f.write(pos_emb.tobytes())
-
-        # Model config (must match training)
-        n_layer = 4
-        n_head = 4
-        n_embd = 64
-
-        for layer in range(n_layer):
-            print(f"  - Block {layer}")
-
-            for h in range(n_head):
-                # Key
-                key_weight = state_dict[f'blocks.{layer}.sa.heads.{h}.key.weight'].numpy()
-                f.write(key_weight.tobytes())
-
-                # Query
-                query_weight = state_dict[f'blocks.{layer}.sa.heads.{h}.query.weight'].numpy()
-                f.write(query_weight.tobytes())
-
-                # Value
-                value_weight = state_dict[f'blocks.{layer}.sa.heads.{h}.value.weight'].numpy()
-                f.write(value_weight.tobytes())
-
-            # Projection
-            proj_weight = state_dict[f'blocks.{layer}.sa.proj.weight'].numpy()
-            f.write(proj_weight.tobytes())
-
-            proj_bias = state_dict[f'blocks.{layer}.sa.proj.bias'].numpy()
-            f.write(proj_bias.tobytes())
-
-            # LayerNorm 1
-            ln1_weight = state_dict[f'blocks.{layer}.ln1.weight'].numpy()
-            f.write(ln1_weight.tobytes())
-
-            ln1_bias = state_dict[f'blocks.{layer}.ln1.bias'].numpy()
-            f.write(ln1_bias.tobytes())
-
-            # LayerNorm 2
-            ln2_weight = state_dict[f'blocks.{layer}.ln2.weight'].numpy()
-            f.write(ln2_weight.tobytes())
-
-            ln2_bias = state_dict[f'blocks.{layer}.ln2.bias'].numpy()
-            f.write(ln2_bias.tobytes())
-
-            # Feedforward
-            ffwd_fc1_weight = state_dict[f'blocks.{layer}.ffwd.net.0.weight'].numpy()
-            f.write(ffwd_fc1_weight.tobytes())
-
-            ffwd_fc1_bias = state_dict[f'blocks.{layer}.ffwd.net.0.bias'].numpy()
-            f.write(ffwd_fc1_bias.tobytes())
-
-            ffwd_fc2_weight = state_dict[f'blocks.{layer}.ffwd.net.2.weight'].numpy()
-            f.write(ffwd_fc2_weight.tobytes())
-
-            ffwd_fc2_bias = state_dict[f'blocks.{layer}.ffwd.net.2.bias'].numpy()
-            f.write(ffwd_fc2_bias.tobytes())
-
-        # Final LayerNorm
-        print("  - Final layer norm")
-        ln_f_weight = state_dict['ln_f.weight'].numpy()
-        f.write(ln_f_weight.tobytes())
-
-        ln_f_bias = state_dict['ln_f.bias'].numpy()
-        f.write(ln_f_bias.tobytes())
-
-        # LM Head
-        print("  - Language model head")
-        lm_head_weight = state_dict['lm_head.weight'].numpy()
-        f.write(lm_head_weight.tobytes())
-
-        if 'lm_head.bias' in state_dict:
-            lm_head_bias = state_dict['lm_head.bias'].numpy()
-            f.write(lm_head_bias.tobytes())
-        else:
-            vocab_size = 50257
-            zeros = np.zeros(vocab_size, dtype=np.float32)
-            f.write(zeros.tobytes())
-
-    print(f"\n Conversion complete!")
-
-    # File info
-    abs_path = os.path.abspath(bin_path)
-    size_mb = os.path.getsize(bin_path) / (1024 * 1024)
-
-    print(f" Saved to: {abs_path}")
-    print(f" File size: {size_mb:.2f} MB")
-
-
-if __name__ == "__main__":
-    if len(sys.argv) != 3:
-        print("Usage: python convert_pt_to_bin.py <input.pt> <output.bin>")
-        sys.exit(1)
-
-    pt_path = sys.argv[1]
-
-    # FORCE OUTPUT DIRECTORY
-    output_dir = r"C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\GPU & CPU"
-    os.makedirs(output_dir, exist_ok=True)
-
-    # Keep filename but override location
-    output_filename = os.path.basename(sys.argv[2])
-    bin_path = os.path.join(output_dir, output_filename)
-
-    try:
-        convert_model_to_bin(pt_path, bin_path)
-    except Exception as e:
-        print(f"\n Error during conversion: {e}")
-        import traceback
-        traceback.print_exc()
-        sys.exit(1)
\ No newline at end of file
diff --git a/engine/fine-tune/chat.py b/engine/fine-tune/chat.py
deleted file mode 100644
index ce21f3c..0000000
--- a/engine/fine-tune/chat.py
+++ /dev/null
@@ -1,270 +0,0 @@
-import torch
-import torch.nn as nn
-from torch.nn import functional as F
-import tiktoken
-DEFAULT_CONFIG = {
-    'n_embd':      64,
-    'n_head':      4,
-    'n_layer':     4,
-    'block_size':  32,
-    'dropout':     0.0,   
-}
-
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
-tokenizer = tiktoken.get_encoding("gpt2")
-vocab_size = tokenizer.n_vocab
-
-class Head(nn.Module):
-    def __init__(self, head_size, block_size, dropout):
-        super().__init__()
-        self.key   = nn.Linear(DEFAULT_CONFIG['n_embd'], head_size, bias=False)
-        self.query = nn.Linear(DEFAULT_CONFIG['n_embd'], head_size, bias=False)
-        self.value = nn.Linear(DEFAULT_CONFIG['n_embd'], head_size, bias=False)
-        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
-        self.dropout = nn.Dropout(dropout)
-
-    def forward(self, x):
-        _, T, _ = x.shape
-        k = self.key(x)
-        q = self.query(x)
-        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
-        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
-        wei = F.softmax(wei, dim=-1)
-        wei = self.dropout(wei)
-        return wei @ self.value(x)
-
-
-class MultiHeadAttention(nn.Module):
-    def __init__(self, num_heads, head_size, block_size, dropout):
-        super().__init__()
-        n_embd = DEFAULT_CONFIG['n_embd']
-        self.heads   = nn.ModuleList([Head(head_size, block_size, dropout) for _ in range(num_heads)])
-        self.proj    = nn.Linear(head_size * num_heads, n_embd)
-        self.dropout = nn.Dropout(dropout)
-
-    def forward(self, x):
-        out = torch.cat([h(x) for h in self.heads], dim=-1)
-        return self.dropout(self.proj(out))
-
-
-class FeedForward(nn.Module):
-    def __init__(self, n_embd, dropout):
-        super().__init__()
-        self.net = nn.Sequential(
-            nn.Linear(n_embd, 4 * n_embd),
-            nn.ReLU(),
-            nn.Linear(4 * n_embd, n_embd),
-            nn.Dropout(dropout),
-        )
-
-    def forward(self, x):
-        return self.net(x)
-
-
-class Block(nn.Module):
-    def __init__(self, n_embd, n_head, block_size, dropout):
-        super().__init__()
-        head_size = n_embd // n_head
-        self.sa   = MultiHeadAttention(n_head, head_size, block_size, dropout)
-        self.ffwd = FeedForward(n_embd, dropout)
-        self.ln1  = nn.LayerNorm(n_embd)
-        self.ln2  = nn.LayerNorm(n_embd)
-
-    def forward(self, x):
-        x = x + self.sa(self.ln1(x))
-        x = x + self.ffwd(self.ln2(x))
-        return x
-
-
-class GPTLanguageModel(nn.Module):
-    def __init__(self, cfg):
-        super().__init__()
-        n_embd     = cfg['n_embd']
-        n_head     = cfg['n_head']
-        n_layer    = cfg['n_layer']
-        block_size = cfg['block_size']
-        dropout    = cfg['dropout']
-
-        self.block_size = block_size
-        self.token_embedding_table    = nn.Embedding(vocab_size, n_embd)
-        self.position_embedding_table = nn.Embedding(block_size, n_embd)
-        self.blocks = nn.Sequential(
-            *[Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)]
-        )
-        self.ln_f   = nn.LayerNorm(n_embd)
-        self.lm_head = nn.Linear(n_embd, vocab_size)
-
-    def forward(self, idx, targets=None):
-        B, T = idx.shape
-        tok_emb = self.token_embedding_table(idx)
-        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
-        x = tok_emb + pos_emb
-        x = self.blocks(x)
-        x = self.ln_f(x)
-        logits = self.lm_head(x)
-        loss = None
-        if targets is not None:
-            B, T, C = logits.shape
-            loss = F.cross_entropy(logits.view(B * T, C), targets.view(B * T))
-        return logits, loss
-
-    @torch.no_grad()
-    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None, top_p=None):
-        for _ in range(max_new_tokens):
-            idx_cond = idx[:, -self.block_size:]
-            logits, _ = self(idx_cond)
-            logits = logits[:, -1, :] / temperature          # (B, vocab_size)
-
-            # Top-k filtering
-            if top_k is not None:
-                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
-                logits[logits < v[:, [-1]]] = float('-inf')
-
-            # Top-p (nucleus) filtering
-            if top_p is not None:
-                sorted_logits, sorted_idx = torch.sort(logits, descending=True)
-                cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
-                remove = cum_probs - F.softmax(sorted_logits, dim=-1) > top_p
-                sorted_logits[remove] = float('-inf')
-                logits = torch.zeros_like(logits).scatter(1, sorted_idx, sorted_logits)
-
-            probs = F.softmax(logits, dim=-1)
-            idx_next = torch.multinomial(probs, num_samples=1)
-            idx = torch.cat((idx, idx_next), dim=1)
-        return idx
-def load_model(pt_path: str) -> GPTLanguageModel:
-    print(f"Loading checkpoint: {pt_path}")
-    checkpoint = torch.load(pt_path, map_location=device)
-
-    if isinstance(checkpoint, dict):
-        cfg        = checkpoint.get('config', DEFAULT_CONFIG)
-        state_dict = checkpoint.get('model', checkpoint)
-    else:
-        # Raw state dict saved directly
-        cfg        = DEFAULT_CONFIG
-        state_dict = checkpoint
-
-    # Merge missing keys with defaults
-    for k, v in DEFAULT_CONFIG.items():
-        cfg.setdefault(k, v)
-    cfg['dropout'] = 0.0          # always off at inference
-    cfg['vocab_size'] = vocab_size
-
-    # Update module-level config so layers build correctly
-    DEFAULT_CONFIG.update(cfg)
-
-    model = GPTLanguageModel(cfg).to(device)
-    model.load_state_dict(state_dict, strict=False)
-    model.eval()
-
-    total_params = sum(p.numel() for p in model.parameters())
-    print(f"Model loaded  |  params: {total_params:,}  |  device: {device}")
-    print(f"  block_size={cfg['block_size']}  n_embd={cfg['n_embd']}  "
-          f"n_head={cfg['n_head']}  n_layer={cfg['n_layer']}")
-    return model
-def generate_reply(model, prompt: str, max_new_tokens=200,
-                   temperature=0.8, top_k=50, top_p=0.95) -> str:
-    tokens = tokenizer.encode(prompt)
-    idx    = torch.tensor([tokens], dtype=torch.long, device=device)
-    out    = model.generate(idx, max_new_tokens=max_new_tokens,
-                            temperature=temperature, top_k=top_k, top_p=top_p)
-    # Return only the newly generated part
-    new_tokens = out[0][len(tokens):].tolist()
-    return tokenizer.decode(new_tokens)
-
-def chat(model):
-    print("\n" + "═" * 60)
-    print("  Quadtrix Chat  —  type 'quit' or 'exit' to stop")
-    print("  Commands: /temp <0-2>  /tokens <n>  /topk <n>  /topp <0-1>  /reset")
-    print("═" * 60 + "\n")
-
-    # Mutable settings
-    settings = {
-        'temperature': 0.8,
-        'max_new_tokens': 200,
-        'top_k': 50,
-        'top_p': 0.95,
-        'context_window': True,   # keep rolling context
-    }
-    history = ""   # rolling conversation context
-
-    while True:
-        try:
-            user_input = input("You: ").strip()
-        except (EOFError, KeyboardInterrupt):
-            print("\nGoodbye!")
-            break
-
-        if not user_input:
-            continue
-
-        if user_input.lower() in ('quit', 'exit'):
-            print("Goodbye!")
-            break
-        if user_input.startswith('/'):
-            parts = user_input.split()
-            cmd   = parts[0].lower()
-            try:
-                if cmd == '/temp'   and len(parts) == 2:
-                    settings['temperature'] = float(parts[1])
-                    print(f"   temperature set to {settings['temperature']}")
-                elif cmd == '/tokens' and len(parts) == 2:
-                    settings['max_new_tokens'] = int(parts[1])
-                    print(f"  max_new_tokens set to {settings['max_new_tokens']}")
-                elif cmd == '/topk'  and len(parts) == 2:
-                    settings['top_k'] = int(parts[1])
-                    print(f"   top_k set to {settings['top_k']}")
-                elif cmd == '/topp'  and len(parts) == 2:
-                    settings['top_p'] = float(parts[1])
-                    print(f"   top_p set to {settings['top_p']}")
-                elif cmd == '/reset':
-                    history = ""
-                    print("  conversation history cleared")
-                elif cmd == '/settings':
-                    print(f"  {settings}")
-                else:
-                    print(f"   Unknown command: {cmd}")
-            except ValueError:
-                print("  Invalid value")
-            continue
-        history += user_input + "\n"
-        prompt   = history
-        reply = generate_reply(
-            model, prompt,
-            max_new_tokens = settings['max_new_tokens'],
-            temperature    = settings['temperature'],
-            top_k          = settings['top_k'],
-            top_p          = settings['top_p'],
-        )
-
-        print(f"\nModel: {reply.strip()}\n")
-        history += reply + "\n"
-        tokens = tokenizer.encode(history)
-        if len(tokens) > model.block_size - 50:
-            history = tokenizer.decode(tokens[-(model.block_size - 50):])
-
-if __name__ == '__main__':
-    import argparse
-
-    parser = argparse.ArgumentParser(description='Chat with model')
-    parser.add_argument('--model', type=str, default='finetuned_model.pt',
-                        help='Path to the .pt checkpoint file (default: finetuned_model.pt)')
-    parser.add_argument('--max-tokens',  type=int,   default=200,  help='Max new tokens per reply')
-    parser.add_argument('--temperature', type=float, default=0.8,  help='Sampling temperature (0.1–2.0)')
-    parser.add_argument('--top-k',       type=int,   default=50,   help='Top-k sampling (0 = disabled)')
-    parser.add_argument('--top-p',       type=float, default=0.95, help='Top-p nucleus sampling')
-    parser.add_argument('--prompt',      type=str,   default=None, help='Single prompt (non-interactive)')
-    args = parser.parse_args()
-
-    model = load_model(args.model)
-
-    if args.prompt:
-        # One-shot mode
-        reply = generate_reply(model, args.prompt,
-                                max_new_tokens=args.max_tokens,
-                                temperature=args.temperature,
-                                top_k=args.top_k if args.top_k > 0 else None,
-                                top_p=args.top_p)
-        print(reply)
-    else:
-        chat(model)
\ No newline at end of file
diff --git a/engine/fine-tune/data-set.py b/engine/fine-tune/data-set.py
deleted file mode 100644
index 5374c56..0000000
--- a/engine/fine-tune/data-set.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import os
-import sys
-import argparse
-from datasets import load_dataset
-
-TARGET_MB = 30
-OUTPUT_FILE = "input.txt"
-
-DATASETS = {
-    "alpaca":      ("yahma/alpaca-cleaned",            None,                  "train"),
-    "dolly":       ("databricks/databricks-dolly-15k", None,                  "train"),
-    "tinystories": ("roneneldan/TinyStories",          None,                  "train"),
-    "wikitext":    ("Salesforce/wikitext",             "wikitext-103-raw-v1", "train"),
-    "oasst":       ("OpenAssistant/oasst1",            None,                  "train"),
-    "gsm8k":       ("openai/gsm8k",                   "main",                "train"),
-}
-
-def row_to_text(row):
-    for field in ("text", "code", "content", "response", "output"):
-        if row.get(field, "").strip():
-            return row[field].strip() + "\n\n"
-    parts = []
-    if row.get("instruction", "").strip():
-        parts.append("### Instruction:\n" + row["instruction"].strip())
-    if row.get("input", "").strip():
-        parts.append("### Input:\n" + row["input"].strip())
-    if row.get("output", "").strip():
-        parts.append("### Response:\n" + row["output"].strip())
-    if parts:
-        return "\n\n".join(parts) + "\n\n"
-    for v in row.values():
-        if isinstance(v, str) and v.strip():
-            return v.strip() + "\n\n"
-    return ""
-
-def download(dataset_key, target_mb, output_file):
-    target_bytes = target_mb * 1024 * 1024
-
-    if dataset_key not in DATASETS:
-        print(f"Unknown dataset '{dataset_key}'. Choose from: {', '.join(DATASETS)}")
-        sys.exit(1)
-
-    hf_id, config, split = DATASETS[dataset_key]
-    print(f"Downloading '{dataset_key}' → {output_file} (target: {target_mb} MB)")
-
-    load_kwargs = dict(split=split, streaming=True, trust_remote_code=True)
-    if config:
-        load_kwargs["name"] = config
-
-    ds = load_dataset(hf_id, **load_kwargs)
-
-    written = 0
-    rows = 0
-    with open(output_file, "w", encoding="utf-8") as f:
-        for row in ds:
-            text = row_to_text(row)
-            if not text:
-                continue
-            f.write(text)
-            written += len(text.encode())
-            rows += 1
-            if written >= target_bytes:
-                break
-
-    print(f"Done. {rows} rows | {written / 1024 / 1024:.2f} MB → {output_file}")
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--dataset", default="alpaca", choices=DATASETS.keys())
-    parser.add_argument("--mb", type=int, default=TARGET_MB)
-    parser.add_argument("--output", default=OUTPUT_FILE)
-    args = parser.parse_args()
-    download(args.dataset, args.mb, args.output)
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/engine/fine-tune/main.py b/engine/fine-tune/main.py
deleted file mode 100644
index 64b655c..0000000
--- a/engine/fine-tune/main.py
+++ /dev/null
@@ -1,204 +0,0 @@
-import torch
-import torch.nn as nn
-from torch.nn import functional as F
-import time
-import tiktoken
-from pathlib import Path
-
-# Training configuration
-script_dir = Path(__file__).parent
-file_path = script_dir / "input.txt"
-model_path=script_dir / 'best_model.pt'
-batch_size    = 16
-block_size    = 32
-max_iters     = 2000
-eval_interval = 100
-learning_rate = 1e-3
-device        = 'cuda' if torch.cuda.is_available() else 'cpu'
-eval_iters    = 20
-n_embd        = 64
-n_head        = 4
-n_layer       = 4
-dropout       = 0.1
-
-# Tokenizer setup
-tokenizer = tiktoken.get_encoding("gpt2")
-vocab_size = tokenizer.n_vocab
-
-# Model definition (minimal GPT)
-class Head(nn.Module):
-    def __init__(self, head_size):
-        super().__init__()
-        self.key = nn.Linear(n_embd, head_size, bias=False)
-        self.query = nn.Linear(n_embd, head_size, bias=False)
-        self.value = nn.Linear(n_embd, head_size, bias=False)
-        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
-        self.dropout = nn.Dropout(dropout)
-
-    def forward(self, x):
-        _, T, _ = x.shape
-        k = self.key(x)
-        q = self.query(x)
-        wei = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5
-        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
-        wei = F.softmax(wei, dim=-1)
-        wei = self.dropout(wei)
-        return wei @ self.value(x)
-
-class MultiHeadAttention(nn.Module):
-    def __init__(self, num_heads, head_size):
-        super().__init__()
-        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
-        self.proj = nn.Linear(head_size * num_heads, n_embd)
-        self.dropout = nn.Dropout(dropout)
-
-    def forward(self, x):
-        out = torch.cat([h(x) for h in self.heads], dim=-1)
-        return self.dropout(self.proj(out))
-
-class FeedForward(nn.Module):
-    def __init__(self, n_embd):
-        super().__init__()
-        self.net = nn.Sequential(
-            nn.Linear(n_embd, 4 * n_embd),
-            nn.ReLU(),
-            nn.Linear(4 * n_embd, n_embd),
-            nn.Dropout(dropout),
-        )
-
-    def forward(self, x):
-        return self.net(x)
-
-class Block(nn.Module):
-    def __init__(self, n_embd, n_head):
-        super().__init__()
-        head_size = n_embd // n_head
-        self.sa = MultiHeadAttention(n_head, head_size)
-        self.ffwd = FeedForward(n_embd)
-        self.ln1 = nn.LayerNorm(n_embd)
-        self.ln2 = nn.LayerNorm(n_embd)
-
-    def forward(self, x):
-        x = x + self.sa(self.ln1(x))
-        x = x + self.ffwd(self.ln2(x))
-        return x
-
-class GPTLanguageModel(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
-        self.position_embedding_table = nn.Embedding(block_size, n_embd)
-        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
-        self.ln_f = nn.LayerNorm(n_embd)
-        self.lm_head = nn.Linear(n_embd, vocab_size)
-
-    def forward(self, idx, targets=None):
-        B, T = idx.shape
-        tok_emb = self.token_embedding_table(idx)
-        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
-        x = tok_emb + pos_emb
-        x = self.blocks(x)
-        x = self.ln_f(x)
-        logits = self.lm_head(x)
-
-        if targets is None:
-            loss = None
-        else:
-            B, T, C = logits.shape
-            logits = logits.view(B*T, C)
-            targets = targets.view(B*T)
-            loss = F.cross_entropy(logits, targets)
-
-        return logits, loss
-
-# Data loading function
-def get_batch(split):
-    data = train_data if split == 'train' else val_data
-    ix = torch.randint(len(data) - block_size, (batch_size,))
-    x = torch.stack([data[i:i+block_size] for i in ix])
-    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
-    x, y = x.to(device), y.to(device)
-    return x, y
-
-@torch.no_grad()
-def estimate_loss():
-    out = {}
-    model.eval()
-    for split in ['train', 'val']:
-        losses = torch.zeros(eval_iters)
-        for k in range(eval_iters):
-            X, Y = get_batch(split)
-            logits, loss = model(X, Y)
-            losses[k] = loss.item()
-        out[split] = losses.mean()
-    model.train()
-    return out
-
-# Load your dataset - REPLACE 'input.txt' with your actual data file
-print("Loading data...")
-with open(file_path, 'r', encoding='utf-8') as f:
-    text = f.read()
-
-# Encode using tiktoken
-data = torch.tensor(tokenizer.encode(text), dtype=torch.long)
-n = int(0.9 * len(data))
-train_data = data[:n]
-val_data = data[n:]
-print(f"Dataset: {len(data):,} tokens | Train: {len(train_data):,} | Val: {len(val_data):,}")
-
-# Initialize model
-print(f"Initializing model with vocab_size={vocab_size}")
-model = GPTLanguageModel().to(device)
-
-# Load existing weights
-print("Loading weights from best_model.pt...")
-checkpoint = torch.load(model_path, map_location=device)
-if isinstance(checkpoint, dict):
-    model.load_state_dict(checkpoint['model'] if 'model' in checkpoint else checkpoint)
-else:
-    model.load_state_dict(checkpoint)
-print(f"Weights loaded successfully")
-
-# Optimizer
-optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
-
-# Training loop
-print(f"\nStarting fine-tuning on {device}")
-print(f"{'Step':<10} {'Train Loss':<12} {'Val Loss':<12} {'Time (ms)':<12} {'Tok/s':<10}")
-print("-" * 66)
-
-start_time = time.time()
-for iter in range(max_iters):
-    
-    # Evaluate
-    if iter % eval_interval == 0 or iter == max_iters - 1:
-        losses = estimate_loss()
-        elapsed = (time.time() - start_time) * 1000
-        tokens_per_sec = (batch_size * block_size * eval_interval) / ((time.time() - start_time) if iter > 0 else 1)
-        
-        print(f"{iter:<10} {losses['train']:.6f}     {losses['val']:.6f}     {elapsed:<12.2f} {tokens_per_sec:<10.0f}")
-        start_time = time.time()
-    
-    # Training step
-    xb, yb = get_batch('train')
-    logits, loss = model(xb, yb)
-    optimizer.zero_grad(set_to_none=True)
-    loss.backward()
-    optimizer.step()
-
-# Save fine-tuned model
-print("\nSaving fine-tuned model...")
-torch.save({
-    'model': model.state_dict(),
-    'optimizer': optimizer.state_dict(),
-    'iter': max_iters,
-    'config': {
-        'vocab_size': vocab_size,
-        'n_embd': n_embd,
-        'n_head': n_head,
-        'n_layer': n_layer,
-        'block_size': block_size,
-        'dropout': dropout,
-    }
-}, 'finetuned_model.pt')
-print(" Model saved to finetuned_model.pt")
\ No newline at end of file
diff --git a/engine/inference.py b/engine/inference.py
index 4f1b8d6..d4ce649 100644
--- a/engine/inference.py
+++ b/engine/inference.py
@@ -8,9 +8,6 @@
 import tiktoken
 
 
-W = 78
-DOUBLE = "=" * W
-SINGLE = "-" * W
 ARROW = "->"
 
 block_size = 32
@@ -23,11 +20,9 @@
 
 
 def header(title, subtitle=""):
-    print(f"\n{DOUBLE}")
     print(f"  {title}")
     if subtitle:
         print(f"  {subtitle}")
-    print(DOUBLE)
 
 
 def row(label, value="", unit="", note=""):
@@ -39,7 +34,7 @@ def row(label, value="", unit="", note=""):
 
 
 def rule():
-    print(f"  {SINGLE}")
+    print(f"")
 
 
 def blank():
@@ -68,7 +63,8 @@ def __init__(self, head_size):
         self.key = nn.Linear(n_embd, head_size, bias=False)
         self.query = nn.Linear(n_embd, head_size, bias=False)
         self.value = nn.Linear(n_embd, head_size, bias=False)
-        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
+        self.register_buffer("tril", torch.tril(
+            torch.ones(block_size, block_size)))
         self.dropout = nn.Dropout(dropout)
 
     def forward(self, x):
@@ -128,7 +124,8 @@ def __init__(self):
         super().__init__()
         self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
         self.position_embedding_table = nn.Embedding(block_size, n_embd)
-        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
+        self.blocks = nn.Sequential(
+            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
         self.ln_f = nn.LayerNorm(n_embd)
         self.lm_head = nn.Linear(n_embd, vocab_size)
 
@@ -163,7 +160,7 @@ def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
             probs = F.softmax(logits, dim=-1)
             idx_next = torch.multinomial(probs, num_samples=1)
             idx = torch.cat((idx, idx_next), dim=1)
-        return idx
+            yield idx_next.item()
 
 
 def default_checkpoint_path():
@@ -194,20 +191,23 @@ def load_model(checkpoint_path):
     return model
 
 
-def generate_response(model, prompt, max_new_tokens, temperature, top_k):
+def stream_response(model, prompt, max_new_tokens, temperature, top_k):
     encoded_prompt = encode(prompt, tokenizer)
     context = torch.tensor([encoded_prompt], dtype=torch.long, device=device)
 
     with torch.no_grad():
-        output_ids = model.generate(
+        for token_id in model.generate(
             context,
             max_new_tokens=max_new_tokens,
             temperature=temperature,
             top_k=top_k,
-        )
+        ):
+            word = decode([token_id], tokenizer)
+            yield word
 
-    new_tokens = output_ids[0][len(encoded_prompt):].tolist()
-    return decode(new_tokens, tokenizer).strip()
+
+def generate_response(model, prompt, max_new_tokens, temperature, top_k):
+    return "".join(stream_response(model, prompt, max_new_tokens, temperature, top_k)).strip()
 
 
 def chat(model, args):
@@ -215,7 +215,7 @@ def chat(model, args):
     blank()
 
     while True:
-        prompt = input(f"  user  {ARROW} ").strip()
+        prompt = input(f" >> ").strip()
         if prompt.lower() in ("quit", "exit", "q"):
             blank()
             print("  Session ended.")
@@ -223,27 +223,31 @@ def chat(model, args):
         if not prompt:
             continue
 
-        response = generate_response(
+        blank()
+        print(f"  ", end="", flush=True)
+        for word in stream_response(
             model,
             prompt,
             args.max_new_tokens,
             args.temperature,
             args.top_k,
-        )
+        ):
+            print(word, end="", flush=True)
         blank()
-        print(f"  Model {ARROW} {response}")
         blank()
 
 
 def parse_args():
-    parser = argparse.ArgumentParser(description="Run inference from an engine trained .pt checkpoint.")
+    parser = argparse.ArgumentParser(
+        description="Run inference from an engine trained .pt checkpoint.")
     parser.add_argument(
         "--checkpoint",
         type=Path,
         default=default_checkpoint_path(),
         help="Path to the .pt file generated by engine/main.py.",
     )
-    parser.add_argument("--prompt", type=str, default=None, help="Generate once from this prompt.")
+    parser.add_argument("--prompt", type=str, default=None,
+                        help="Generate once from this prompt.")
     parser.add_argument("--max-new-tokens", type=int, default=200)
     parser.add_argument("--temperature", type=float, default=1.0)
     parser.add_argument("--top-k", type=int, default=None)
@@ -253,33 +257,26 @@ def parse_args():
 def main():
     args = parse_args()
     start = time.time()
-
-    print(f"{'Quadtrix-v1.0':^{W}}")
     blank()
-    row("Started", time.strftime("%Y-%m-%d  %H:%M:%S"))
-    row("Device", str(device))
-    row("PyTorch", torch.__version__)
-    row("Checkpoint", args.checkpoint)
     rule()
 
     model = load_model(args.checkpoint)
 
     if args.prompt:
-        response = generate_response(
+        for word in stream_response(
             model,
             args.prompt,
             args.max_new_tokens,
             args.temperature,
             args.top_k,
-        )
+        ):
+            print(word, end="", flush=True)
         blank()
-        print(response)
     else:
         chat(model, args)
 
     blank()
     row("Total", f"{time.time() - start:.2f}s")
-    print(DOUBLE)
 
 
 if __name__ == "__main__":
diff --git a/engine/llm.py b/engine/llm.py
new file mode 100644
index 0000000..4582f4d
--- /dev/null
+++ b/engine/llm.py
@@ -0,0 +1,322 @@
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+import time
+import sys
+import os
+from pathlib import Path
+import tiktoken
+SCRIPT_DIR = Path(__file__).resolve().parent
+LOG_DIR = SCRIPT_DIR / "logs"
+LOG_DIR.mkdir(parents=True, exist_ok=True)
+LOG_PATH = LOG_DIR / f"run_{time.strftime('%Y%m%d_%H%M%S')}.txt"
+
+
+def log(msg: str = ""):
+    print(msg)
+    with open(LOG_PATH, "a", encoding="utf-8") as f:
+        f.write(msg + "\n")
+
+
+cleaned_path = Path(os.environ.get("data", SCRIPT_DIR / "input.txt"))
+train_split = 0.9
+seed = 1337
+
+batch_size = 32
+block_size = 543
+max_iters = 25000
+eval_interval = 1
+sample_interval = 100
+learning_rate = 3e-4
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+eval_iters = 200
+n_embd = 64
+n_head = 10
+n_layer = 10
+dropout = 0.0
+
+torch.manual_seed(seed)
+
+
+def get_tokenizer(encoding_name="gpt2"):
+    tokenizer = tiktoken.get_encoding(encoding_name)
+    vocab_size = tokenizer.n_vocab
+    return tokenizer, vocab_size
+
+
+def encode(text, tokenizer): return tokenizer.encode(text)
+def decode(tokens, tokenizer): return tokenizer.decode(tokens)
+
+
+with open(cleaned_path, 'r', encoding='utf-8') as f:
+    text = f.read()
+
+tokenizer, vocab_size = get_tokenizer("o200k_base")
+data = torch.tensor(encode(text, tokenizer), dtype=torch.long)
+n = int(train_split * len(data))
+train_data = data[:n]
+val_data = data[n:]
+
+
+def get_batch(split):
+    d = train_data if split == 'train' else val_data
+    ix = torch.randint(len(d) - block_size, (batch_size,))
+    x = torch.stack([d[i:i + block_size] for i in ix])
+    y = torch.stack([d[i + 1:i + block_size + 1] for i in ix])
+    return x.to(device), y.to(device)
+
+
+@torch.no_grad()
+def estimate_loss():
+    out = {}
+    model.eval()
+    for split in ['train', 'val']:
+        losses = torch.zeros(eval_iters)
+        for k in range(eval_iters):
+            X, Y = get_batch(split)
+            _, loss = model(X, Y)
+            losses[k] = loss.item()
+        out[split] = losses.mean()
+    model.train()
+    return out
+
+
+class Head(nn.Module):
+    def __init__(self, head_size):
+        super().__init__()
+        self.key = nn.Linear(n_embd, head_size, bias=False)
+        self.query = nn.Linear(n_embd, head_size, bias=False)
+        self.value = nn.Linear(n_embd, head_size, bias=False)
+        self.register_buffer('tril', torch.tril(
+            torch.ones(block_size, block_size)))
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        B, T, C = x.shape
+        k = self.key(x)
+        q = self.query(x)
+        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
+        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
+        wei = F.softmax(wei, dim=-1)
+        wei = self.dropout(wei)
+        return wei @ self.value(x)
+
+
+class MultiHeadAttention(nn.Module):
+    def __init__(self, num_heads, head_size):
+        super().__init__()
+        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
+        self.proj = nn.Linear(head_size * num_heads, n_embd)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        out = torch.cat([h(x) for h in self.heads], dim=-1)
+        return self.dropout(self.proj(out))
+
+
+class FeedForward(nn.Module):
+    def __init__(self, n_embd):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(n_embd, 4 * n_embd),
+            nn.ReLU(),
+            nn.Linear(4 * n_embd, n_embd),
+            nn.Dropout(dropout),
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+
+class Block(nn.Module):
+    def __init__(self, n_embd, n_head):
+        super().__init__()
+        head_size = n_embd // n_head
+        self.sa = MultiHeadAttention(n_head, head_size)
+        self.ffwd = FeedForward(n_embd)
+        self.ln1 = nn.LayerNorm(n_embd)
+        self.ln2 = nn.LayerNorm(n_embd)
+
+    def forward(self, x):
+        x = x + self.sa(self.ln1(x))
+        x = x + self.ffwd(self.ln2(x))
+        return x
+
+
+class GPTLanguageModel(nn.Module):
+    """Transformer LM: token + position embeddings, n_layer Blocks, final LayerNorm, linear head."""
+    def __init__(self):
+        super().__init__()
+        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
+        self.position_embedding_table = nn.Embedding(block_size, n_embd)
+        self.blocks = nn.Sequential(
+            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
+        self.ln_f = nn.LayerNorm(n_embd)
+        self.lm_head = nn.Linear(n_embd, vocab_size)
+        self.apply(self._init_weights)
+
+    def _init_weights(self, module):
+        # Linear and Embedding weights ~ N(0, 0.02); Linear biases start at zero.
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            torch.nn.init.zeros_(module.bias)
+
+    def forward(self, idx, targets=None):
+        """Return (logits, loss) for token ids idx of shape (B, T); loss is None without targets."""
+        B, T = idx.shape
+        tok_emb = self.token_embedding_table(idx)
+        # build position ids on the input's device instead of the module-level global
+        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
+        logits = self.lm_head(self.ln_f(self.blocks(tok_emb + pos_emb)))
+
+        loss = None
+        if targets is not None:
+            B, T, C = logits.shape
+            loss = F.cross_entropy(logits.view(B * T, C), targets.view(B * T))
+        return logits, loss
+
+    def generate(self, idx, max_new_tokens):
+        """Append max_new_tokens sampled ids to idx (B, T) and return the longer tensor."""
+        for _ in range(max_new_tokens):
+            idx_cond = idx[:, -block_size:]  # crop context to the last block_size tokens
+            logits, _ = self(idx_cond)
+            probs = F.softmax(logits[:, -1, :], dim=-1)
+            idx_next = torch.multinomial(probs, num_samples=1)
+            idx = torch.cat((idx, idx_next), dim=1)
+        return idx
+
+
+@torch.no_grad()
+def print_sample(step, max_new_tokens=1000):
+    """Log a sample generated from a single zero token, labelled with `step`."""
+    was_training = model.training
+    model.eval()
+    context = torch.zeros((1, 1), dtype=torch.long, device=device)
+    output_ids = model.generate(context, max_new_tokens=max_new_tokens)
+    sample = decode(output_ids[0].tolist(), tokenizer).strip()
+    for line in ("", f"sample at step {step}:", "-" * 60, sample, "-" * 60, ""):
+        log(line)
+    # Restore the mode the caller had: an unconditional train() here would turn
+    # dropout back on when sampling from a model that was already in eval mode.
+    model.train(was_training)
+
+
+model = GPTLanguageModel().to(device)
+n_params = sum(p.numel() for p in model.parameters())
+optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
+
+log("")
+log("quadtrix v1.0")
+log(f"device: {device}")
+log(f"number of parameters: {n_params:,}")
+log(f"train tokens: {len(train_data):,}  |  val tokens: {len(val_data):,}")
+log(f"batch size: {batch_size}  |  block size: {block_size}  |  lr: {learning_rate:.0e}")
+log(f"layers: {n_layer}  |  heads: {n_head}  |  embd: {n_embd}  |  dropout: {dropout}")
+log("")
+log(f"training for {max_iters} steps, eval every {eval_interval}, sample every {sample_interval}")
+log("")
+# best validation loss so far (plain float), the step it occurred at, previous train loss
+best_val_loss = float('inf')
+best_step = 0
+prev_train = None
+train_start = time.time()
+
+for iter_num in range(max_iters):
+
+    # periodic eval (always on the last iteration); checkpoint whenever val loss improves
+    if iter_num % eval_interval == 0 or iter_num == max_iters - 1:
+        losses = estimate_loss()
+        elapsed = time.time() - train_start
+
+        # global L2 norm over whatever gradients are currently stored (0.0 if none yet)
+        total_norm = sum(
+            p.grad.detach().norm(2).item() ** 2
+            for p in model.parameters() if p.grad is not None
+        ) ** 0.5
+
+        # change in train loss since the previous eval
+        delta_str = "    n/a" if prev_train is None else f"{losses['train'].item() - prev_train:+.4f}"
+        prev_train = losses['train'].item()
+
+        # throughput + ETA: only iter_num optimizer steps have finished when this
+        # eval runs, so count those (iter_num + 1 overstated tok/s and skewed the ETA)
+        steps_done = iter_num
+        tok_per_sec = int(steps_done * batch_size * block_size / elapsed) if elapsed > 0 else 0
+        steps_left = max_iters - steps_done
+        eta_sec = int(steps_left * elapsed / steps_done) if steps_done > 0 else 0
+        eta_str = f"{eta_sec // 60}m {eta_sec % 60:02d}s"
+
+        val_loss = losses['val'].item()
+        is_best = val_loss < best_val_loss
+        if is_best:
+            best_val_loss = val_loss
+            best_step = iter_num
+            torch.save(model.state_dict(), 'best_model.pt')
+
+        marker = " (best)" if is_best else ""
+        log(
+            f"step {iter_num:>5}/{max_iters} |"
+            f" train loss {losses['train'].item():.4f} |"
+            f" val loss {val_loss:.4f} |"
+            f" delta {delta_str} |"
+            f" norm {total_norm:.3f} |"
+            f" {tok_per_sec:,} tok/s |"
+            f" eta {eta_str}"
+            f"{marker}"
+        )
+        sys.stdout.flush()
+    if iter_num > 0 and iter_num % sample_interval == 0:
+        print_sample(iter_num)
+    xb, yb = get_batch('train')
+    logits, loss = model(xb, yb)
+    optimizer.zero_grad(set_to_none=True)
+    loss.backward()
+    optimizer.step()
+
+total_time = time.time() - train_start
+log("")
+log(f"training done in {int(total_time // 60)}m {int(total_time % 60):02d}s")
+log(f"best val loss {best_val_loss:.4f} at step {best_step}")
+log("checkpoint saved to best_model.pt")
+log("")
+model.load_state_dict(torch.load('best_model.pt', map_location=device, weights_only=True))
+model.eval()
+log(f"restored best_model.pt (val {best_val_loss:.4f})")
+log("")
+log("final sample from best checkpoint:")
+print_sample("final")
+model.eval()  # print_sample() may leave the model in train mode; keep dropout off for chat
+log("inference  |  type 'quit' to exit")
+log("")
+
+try:
+    while True:
+        prompt = input("you > ").strip()
+        log(f"you > {prompt}")
+
+        if prompt.lower() in ("quit", "exit", "q"):
+            log("session ended.")
+            break
+        if not prompt:
+            continue
+
+        context = torch.tensor([encode(prompt, tokenizer)],
+                               dtype=torch.long, device=device)
+        with torch.no_grad():
+            output_ids = model.generate(context, max_new_tokens=200)
+
+        new_tokens = output_ids[0][context.shape[1]:].tolist()
+        response = decode(new_tokens, tokenizer).strip()
+        log("")
+        log(f"model > {response}")
+        log("")
+
+except (KeyboardInterrupt, EOFError):  # EOFError: input() hit a closed/non-interactive stdin
+    log("")
+    log("interrupted.")
+
+log("")
+log(f"total training time: {int(total_time // 60)}m {int(total_time % 60):02d}s")
+log(f"log written to: {LOG_PATH}")
+log("")
diff --git a/engine/logs/chat.txt b/engine/logs/chat.txt
deleted file mode 100644
index 3d0ca61..0000000
--- a/engine/logs/chat.txt
+++ /dev/null
@@ -1,66 +0,0 @@
-user  -> hi
-
-  Model -> like the sunrise all of the shiny, and sparkly cars. When the elephant was happy to have come back home day!
-
-Lollarreled and chilly had a funny plan to touch her friend. She felt managed to put the picture on the spike and down. When they reached on a cliff, the oak, the flag and crashed against a butterfly in the corner.
-
-The man said, "Too powerful! Can we say it?" Chirpy replied, "I'm sorry," her grandma. "That is time to go to the zoo with you. But we have to see we can meet you."
-
-Tom smiles. He asks him, "OK, why are you doing? You are a bee on. It will get some fake clothes. I want to help you."
-
-"Yes, don't wait to be, we can hear a big boat and you will be good!" Tim said. "We love it. You have to take a
-
-  user  -> The night was cold and silent. The stars flickered above as she held the letter in her hand.
-
-  Model -> The dolphin thought it was so much that she smiled and went out of the pool. She was so excited she decided to do something new.
-
-P tires was cozy for a while and the flower went outside with her bike. The bottom was filled with beautiful. She walked around, she noticed what she was frightened and she was done playing in the park. She had found the red stick because joy like a colourful stars fish. It was white and looked around and still and saw what it sounded like this butter. It was and she started to giggle and feel warm and buzzing. George tried to push the sky, but his elephant was too late. 
-
-But the boy came box was scared. He squealed that he couldn't get the nut. He had started to the pant patterns and dipped it away, but he decided to only rain dance. So he blew the fish to bounce for some of the rain again. He was so strong and warm. He still didn't think
-
-  user  -> Once upon a time there was a king who
-
-  Model -> loved to drive in the sun. While they were on, there was a bench of trees, looking at a cloud in the rocks and the sun fly the park. The two friends were all
-One day, Leo found pictures in the park. He asked Timmy if they saw the man around the park. They got very sad. 
-
-Mia said hello to the stage. She said, "I'm glad you sparkly ones that's a pear!"
-
-So, they both decided to try how they sat in the wonderful pond. They sit the lake. People clothes with kings and its key, and dumped hands for some whales.
-
-Soon, Jack flew it to a. Suddenly, he decided to do it and didn't like he did. He rolled them up and left the hole in the park.
-
-So Tom, the jogging and his friends continued to help his friends for each other. Soon Bob chose playing with the hard junk that doggy would go
-
-  user  -> Once upon a time, there was a king who
-
-  Model -> wanted to go inside to see a big chair looking magic. He showed hide and he returned a postfly with a scene.
-
-He flew to the butterfly next day. Tommy picked him out a needle and it looked all ash. The sack was very it workers, he knew he had her pepper to snuggled and asked Dad for it in a lot. The end.
-
-One thing answered, Molly decided to have some glue and swim restored her eyes. Every day she wouldmy always get outside, pretending to share its cake with her things. 
-
-One day, Jenny hopped to her house. She saw something shiny, thermometer was cool and went down. She didn't seem wires, listening to it and she drank some houses and make any more excited knowing it would be polite than before. She hoped all the things kind adventure and be™s, and she knew it was too dangerous.
-
-Once upon a time, there was a little girl named Lily. She
-
-  user  -> The river flowed through the valley and
-
-  Model -> the shore big screen! All of the colors like he opened it and kept flying in the air.
-
-A little boy named investigate. He floated down with the carrot and covered instead of this zoom on the street. The son wanted to record it. He asked, "Oh, Lola, it's coming in the peife." He couldn't believe the water of them up until he saw lots of milk suddenly he thought about its eyes and some roots wouldn't be stuck in the street. All the faith danced, it was beautiful and secure about it at the pond.
-
-After the villagers got a new friend, tug and a big rock and said, "Someone can."
-
-Timmy agreed and promised to have some sweet lunch to promise and take them to carry them in his blocks. But Mike kept away, he kept his mother even Billy had ran to have a big wave to the pool and scared he went to explore. The village was a little girl named Lily and weak
-
-  user  -> The dog was named Bruno. Bruno had a blue collar. One day, Bruno ran into the forest and
-
-  Model -> finally looked on her boat. The man laughed and praised the carpet from the park together. The duck soon quickly sat on an grass. 
-
-The two children were friends very excited and told him that mom had to sort the toys when the squash was watching the farmer's family was come back by.
-
-But a nosy raven said, "Don't worry they did the world!"
-
-Sammy smiled and said, "Hello, I want you to see the surprise, their fingers in. Are you sure, Jimmy? Now be careful and you want to include you. You can help me. Be good to be good or don't agree write too."
-
-But when, they walked, the zipper looked up. The pebble was a naughty voice. It spilled the tiny screen became heavy. Lily heard her. She did not care of her presents. She liked to swim and see her room. She wanted to hold her toys and touch it. She wished she had
\ No newline at end of file
diff --git a/engine/logs/run_20260504_143730.txt b/engine/logs/run_20260504_143730.txt
deleted file mode 100644
index 40e2dd3..0000000
--- a/engine/logs/run_20260504_143730.txt
+++ /dev/null
@@ -1,298 +0,0 @@
-2026-05-04 14:37:30 |                                 Quadtrix-v1.0                                 
-
-2026-05-04 14:37:30 |   Started                     2026-05-04  14:37:30        
-2026-05-04 14:37:30 |   Device                      CPU                         
-2026-05-04 14:37:30 |   PyTorch                     2.4.1+cpu                   
-2026-05-04 14:37:30 |   Log file                    C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\engine\logs\run_20260504_143730.txt        
-
-2026-05-04 14:37:39 | ==============================================================================
-2026-05-04 14:37:39 |   CONFIG
-2026-05-04 14:37:39 | ==============================================================================
-2026-05-04 14:37:39 |   Seed                        1337                        
-2026-05-04 14:37:39 |   Batch size                  16                          
-2026-05-04 14:37:39 |   Block size                  32                          
-2026-05-04 14:37:39 |   Learning rate               0.001                       
-2026-05-04 14:37:39 |   Layers                      4                           
-2026-05-04 14:37:39 |   Heads                       4                           
-2026-05-04 14:37:39 |   Embedding dim               64                          
-2026-05-04 14:37:39 |   Dropout                     0.1                         
-2026-05-04 14:37:39 |   Parameters                  6,684,497                   
-2026-05-04 14:37:39 |   Train tokens                7,065,137                   
-2026-05-04 14:37:39 |   Val tokens                  785,016                     
-2026-05-04 14:37:39 |   Data file                   C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\engine\input.txt        
-
-2026-05-04 14:37:39 | ==============================================================================
-2026-05-04 14:37:39 |   TRAINING
-2026-05-04 14:37:39 |   10,000 steps | eval every 10 | checkpoint on improvement
-2026-05-04 14:37:39 | ==============================================================================
-
-2026-05-04 14:37:45 | step    0/10000 | loss 10.823271          | norm 0.0000          | lr 1.00e-03 | 6008.76 ms | 60.0% bf16 MFU | 85 tok/s
-2026-05-04 14:37:56 | step   10/10000 | loss 9.919573 (-0.90z) | norm 1.0755 (-0.90z) | lr 1.00e-03 | 17541.63 ms | 60.0% bf16 MFU | 321 tok/s
-2026-05-04 14:38:10 | step   20/10000 | loss 8.960536 (-0.96z) | norm 1.0837 (-0.96z) | lr 1.00e-03 | 30928.88 ms | 60.0% bf16 MFU | 347 tok/s
-2026-05-04 14:38:21 | step   30/10000 | loss 7.965471 (-1.00z) | norm 1.1215 (-1.00z) | lr 1.00e-03 | 42338.19 ms | 60.0% bf16 MFU | 374 tok/s
-2026-05-04 14:38:33 | step   40/10000 | loss 7.123497 (-0.84z) | norm 1.1473 (-0.84z) | lr 1.00e-03 | 54362.14 ms | 60.0% bf16 MFU | 386 tok/s
-2026-05-04 14:38:45 | step   50/10000 | loss 6.588623 (-0.53z) | norm 0.9567 (-0.53z) | lr 1.00e-03 | 66579.86 ms | 60.0% bf16 MFU | 392 tok/s
-2026-05-04 14:38:58 | step   60/10000 | loss 6.261659 (-0.33z) | norm 0.8200 (-0.33z) | lr 1.00e-03 | 79535.98 ms | 60.0% bf16 MFU | 392 tok/s
-2026-05-04 14:39:09 | step   70/10000 | loss 6.084456 (-0.18z) | norm 0.5600 (-0.18z) | lr 1.00e-03 | 90772.39 ms | 60.0% bf16 MFU | 400 tok/s
-2026-05-04 14:39:20 | step   80/10000 | loss 5.920197 (-0.16z) | norm 0.5964 (-0.16z) | lr 1.00e-03 | 101472.72 ms | 60.0% bf16 MFU | 408 tok/s
-2026-05-04 14:39:31 | step   90/10000 | loss 5.923934 (+0.00z) | norm 0.6237 (+0.00z) | lr 1.00e-03 | 112395.39 ms | 60.0% bf16 MFU | 414 tok/s
-2026-05-04 14:39:41 | step  100/10000 | loss 5.831656 (-0.09z) | norm 0.7426 (-0.09z) | lr 1.00e-03 | 122897.32 ms | 60.0% bf16 MFU | 420 tok/s
-2026-05-04 14:39:53 | step  110/10000 | loss 5.735040 (-0.10z) | norm 0.7108 (-0.10z) | lr 1.00e-03 | 134747.30 ms | 60.0% bf16 MFU | 421 tok/s
-2026-05-04 14:40:06 | step  120/10000 | loss 5.582746 (-0.15z) | norm 0.7010 (-0.15z) | lr 1.00e-03 | 147069.85 ms | 60.0% bf16 MFU | 421 tok/s
-2026-05-04 14:40:16 | step  130/10000 | loss 5.540210 (-0.04z) | norm 0.6215 (-0.04z) | lr 1.00e-03 | 157757.56 ms | 60.0% bf16 MFU | 425 tok/s
-2026-05-04 14:40:27 | step  140/10000 | loss 5.415887 (-0.12z) | norm 0.7015 (-0.12z) | lr 1.00e-03 | 168680.35 ms | 60.0% bf16 MFU | 427 tok/s
-2026-05-04 14:40:38 | step  150/10000 | loss 5.276190 (-0.14z) | norm 0.7288 (-0.14z) | lr 1.00e-03 | 179473.81 ms | 60.0% bf16 MFU | 430 tok/s
-2026-05-04 14:40:49 | step  160/10000 | loss 5.224572 (-0.05z) | norm 0.7665 (-0.05z) | lr 1.00e-03 | 190512.16 ms | 60.0% bf16 MFU | 432 tok/s
-2026-05-04 14:41:00 | step  170/10000 | loss 5.104290 (-0.12z) | norm 0.8253 (-0.12z) | lr 1.00e-03 | 201702.84 ms | 60.0% bf16 MFU | 434 tok/s
-2026-05-04 14:41:11 | step  180/10000 | loss 5.064143 (-0.04z) | norm 0.9258 (-0.04z) | lr 1.00e-03 | 212740.06 ms | 60.0% bf16 MFU | 435 tok/s
-2026-05-04 14:41:23 | step  190/10000 | loss 5.004972 (-0.06z) | norm 0.8852 (-0.06z) | lr 1.00e-03 | 224522.36 ms | 60.0% bf16 MFU | 435 tok/s
-2026-05-04 14:41:35 | step  200/10000 | loss 4.840600 (-0.16z) | norm 0.8442 (-0.16z) | lr 1.00e-03 | 236631.88 ms | 60.0% bf16 MFU | 434 tok/s
-2026-05-04 14:41:47 | step  210/10000 | loss 4.878980 (+0.04z) | norm 0.8591 (+0.04z) | lr 1.00e-03 | 248387.85 ms | 60.0% bf16 MFU | 434 tok/s
-2026-05-04 14:41:59 | step  220/10000 | loss 4.839026 (-0.04z) | norm 0.8605 (-0.04z) | lr 1.00e-03 | 260407.80 ms | 60.0% bf16 MFU | 434 tok/s
-2026-05-04 14:42:12 | step  230/10000 | loss 4.797831 (-0.04z) | norm 0.9404 (-0.04z) | lr 1.00e-03 | 273254.24 ms | 60.0% bf16 MFU | 432 tok/s
-2026-05-04 14:42:24 | step  240/10000 | loss 4.695076 (-0.10z) | norm 0.9927 (-0.10z) | lr 1.00e-03 | 285755.31 ms | 60.0% bf16 MFU | 431 tok/s
-2026-05-04 14:42:37 | step  250/10000 | loss 4.708679 (+0.01z) | norm 0.9779 (+0.01z) | lr 1.00e-03 | 298451.18 ms | 60.0% bf16 MFU | 430 tok/s
-2026-05-04 14:42:50 | step  260/10000 | loss 4.690611 (-0.02z) | norm 0.9101 (-0.02z) | lr 1.00e-03 | 311342.43 ms | 60.0% bf16 MFU | 429 tok/s
-2026-05-04 14:43:03 | step  270/10000 | loss 4.576964 (-0.11z) | norm 0.9836 (-0.11z) | lr 1.00e-03 | 324718.10 ms | 60.0% bf16 MFU | 427 tok/s
-2026-05-04 14:43:17 | step  280/10000 | loss 4.510950 (-0.07z) | norm 1.1143 (-0.07z) | lr 1.00e-03 | 338326.45 ms | 60.0% bf16 MFU | 425 tok/s
-2026-05-04 14:43:29 | step  290/10000 | loss 4.586329 (+0.08z) | norm 1.0503 (+0.08z) | lr 1.00e-03 | 350202.22 ms | 60.0% bf16 MFU | 425 tok/s
-2026-05-04 14:43:40 | step  300/10000 | loss 4.512451 (-0.07z) | norm 0.9719 (-0.07z) | lr 1.00e-03 | 361464.60 ms | 60.0% bf16 MFU | 426 tok/s
-2026-05-04 14:43:51 | step  310/10000 | loss 4.468702 (-0.04z) | norm 1.1043 (-0.04z) | lr 1.00e-03 | 372647.08 ms | 60.0% bf16 MFU | 427 tok/s
-2026-05-04 14:44:03 | step  320/10000 | loss 4.427707 (-0.04z) | norm 1.3409 (-0.04z) | lr 1.00e-03 | 384214.66 ms | 60.0% bf16 MFU | 427 tok/s
-2026-05-04 14:44:14 | step  330/10000 | loss 4.406521 (-0.02z) | norm 1.1902 (-0.02z) | lr 1.00e-03 | 395405.43 ms | 60.0% bf16 MFU | 428 tok/s
-2026-05-04 14:44:26 | step  340/10000 | loss 4.340779 (-0.07z) | norm 1.1662 (-0.07z) | lr 1.00e-03 | 407186.14 ms | 60.0% bf16 MFU | 428 tok/s
-2026-05-04 14:44:37 | step  350/10000 | loss 4.314247 (-0.03z) | norm 1.0941 (-0.03z) | lr 1.00e-03 | 418504.92 ms | 60.0% bf16 MFU | 429 tok/s
-2026-05-04 14:44:49 | step  360/10000 | loss 4.369273 (+0.06z) | norm 1.1833 (+0.06z) | lr 1.00e-03 | 430018.06 ms | 60.0% bf16 MFU | 429 tok/s
-2026-05-04 14:45:00 | step  370/10000 | loss 4.337414 (-0.03z) | norm 1.2574 (-0.03z) | lr 1.00e-03 | 441304.87 ms | 60.0% bf16 MFU | 430 tok/s
-2026-05-04 14:45:11 | step  380/10000 | loss 4.328847 (-0.01z) | norm 1.1782 (-0.01z) | lr 1.00e-03 | 452542.36 ms | 60.0% bf16 MFU | 431 tok/s
-2026-05-04 14:45:22 | step  390/10000 | loss 4.342051 (+0.01z) | norm 1.2784 (+0.01z) | lr 1.00e-03 | 463790.91 ms | 60.0% bf16 MFU | 431 tok/s
-2026-05-04 14:45:34 | step  400/10000 | loss 4.194255 (-0.15z) | norm 1.0477 (-0.15z) | lr 1.00e-03 | 475100.12 ms | 60.0% bf16 MFU | 432 tok/s
-2026-05-04 14:45:45 | step  410/10000 | loss 4.188337 (-0.01z) | norm 1.1660 (-0.01z) | lr 1.00e-03 | 486328.98 ms | 60.0% bf16 MFU | 432 tok/s
-2026-05-04 14:45:56 | step  420/10000 | loss 4.229173 (+0.04z) | norm 1.2353 (+0.04z) | lr 1.00e-03 | 497675.27 ms | 60.0% bf16 MFU | 433 tok/s
-2026-05-04 14:46:07 | step  430/10000 | loss 4.266688 (+0.04z) | norm 1.1175 (+0.04z) | lr 1.00e-03 | 508933.14 ms | 60.0% bf16 MFU | 433 tok/s
-2026-05-04 14:46:19 | step  440/10000 | loss 4.186455 (-0.08z) | norm 1.0737 (-0.08z) | lr 1.00e-03 | 520052.56 ms | 60.0% bf16 MFU | 434 tok/s
-2026-05-04 14:46:30 | step  450/10000 | loss 4.233275 (+0.05z) | norm 1.1226 (+0.05z) | lr 1.00e-03 | 531928.31 ms | 60.0% bf16 MFU | 434 tok/s
-2026-05-04 14:46:43 | step  460/10000 | loss 4.208728 (-0.02z) | norm 1.2259 (-0.02z) | lr 1.00e-03 | 544453.32 ms | 60.0% bf16 MFU | 433 tok/s
-2026-05-04 14:46:56 | step  470/10000 | loss 4.092826 (-0.12z) | norm 1.1402 (-0.12z) | lr 1.00e-03 | 556987.79 ms | 60.0% bf16 MFU | 432 tok/s
-2026-05-04 14:47:08 | step  480/10000 | loss 4.100472 (+0.01z) | norm 1.2286 (+0.01z) | lr 1.00e-03 | 569270.20 ms | 60.0% bf16 MFU | 432 tok/s
-2026-05-04 14:47:20 | step  490/10000 | loss 4.093727 (-0.01z) | norm 1.1608 (-0.01z) | lr 1.00e-03 | 581398.77 ms | 60.0% bf16 MFU | 432 tok/s
-2026-05-04 14:47:32 | step  500/10000 | loss 4.145991 (+0.05z) | norm 1.2088 (+0.05z) | lr 1.00e-03 | 593302.77 ms | 60.0% bf16 MFU | 432 tok/s
-2026-05-04 14:47:43 | step  510/10000 | loss 4.071595 (-0.07z) | norm 1.3051 (-0.07z) | lr 1.00e-03 | 604607.90 ms | 60.0% bf16 MFU | 432 tok/s
-2026-05-04 14:47:54 | step  520/10000 | loss 4.084886 (+0.01z) | norm 1.2778 (+0.01z) | lr 1.00e-03 | 615870.61 ms | 60.0% bf16 MFU | 433 tok/s
-2026-05-04 14:48:06 | step  530/10000 | loss 4.133907 (+0.05z) | norm 1.1454 (+0.05z) | lr 1.00e-03 | 627181.52 ms | 60.0% bf16 MFU | 433 tok/s
-2026-05-04 14:48:17 | step  540/10000 | loss 4.033642 (-0.10z) | norm 1.0800 (-0.10z) | lr 1.00e-03 | 638377.22 ms | 60.0% bf16 MFU | 433 tok/s
-2026-05-04 14:48:28 | step  550/10000 | loss 4.082393 (+0.05z) | norm 1.2565 (+0.05z) | lr 1.00e-03 | 649659.00 ms | 60.0% bf16 MFU | 434 tok/s
-2026-05-04 14:48:39 | step  560/10000 | loss 3.982449 (-0.10z) | norm 1.2172 (-0.10z) | lr 1.00e-03 | 660930.87 ms | 60.0% bf16 MFU | 434 tok/s
-2026-05-04 14:48:51 | step  570/10000 | loss 4.030721 (+0.05z) | norm 1.1852 (+0.05z) | lr 1.00e-03 | 672268.58 ms | 60.0% bf16 MFU | 434 tok/s
-2026-05-04 14:49:02 | step  580/10000 | loss 4.028955 (-0.00z) | norm 1.2413 (-0.00z) | lr 1.00e-03 | 683849.72 ms | 60.0% bf16 MFU | 434 tok/s
-2026-05-04 14:49:14 | step  590/10000 | loss 3.931427 (-0.10z) | norm 1.3435 (-0.10z) | lr 1.00e-03 | 695096.18 ms | 60.0% bf16 MFU | 435 tok/s
-2026-05-04 14:49:27 | step  600/10000 | loss 4.046100 (+0.11z) | norm 1.1043 (+0.11z) | lr 1.00e-03 | 708486.46 ms | 60.0% bf16 MFU | 434 tok/s
-2026-05-04 14:49:40 | step  610/10000 | loss 3.969247 (-0.08z) | norm 1.2893 (-0.08z) | lr 1.00e-03 | 721463.53 ms | 60.0% bf16 MFU | 433 tok/s
-2026-05-04 14:49:53 | step  620/10000 | loss 3.957383 (-0.01z) | norm 1.4027 (-0.01z) | lr 1.00e-03 | 734612.72 ms | 60.0% bf16 MFU | 432 tok/s
-2026-05-04 14:50:08 | step  630/10000 | loss 4.052402 (+0.10z) | norm 1.2685 (+0.10z) | lr 1.00e-03 | 749388.88 ms | 60.0% bf16 MFU | 431 tok/s
-2026-05-04 14:50:21 | step  640/10000 | loss 3.975482 (-0.08z) | norm 1.2427 (-0.08z) | lr 1.00e-03 | 762634.77 ms | 60.0% bf16 MFU | 430 tok/s
-2026-05-04 14:50:33 | step  650/10000 | loss 3.848517 (-0.13z) | norm 1.1864 (-0.13z) | lr 1.00e-03 | 774719.25 ms | 60.0% bf16 MFU | 430 tok/s
-2026-05-04 14:50:46 | step  660/10000 | loss 3.941863 (+0.09z) | norm 1.2051 (+0.09z) | lr 1.00e-03 | 787071.57 ms | 60.0% bf16 MFU | 429 tok/s
-2026-05-04 14:50:58 | step  670/10000 | loss 3.874694 (-0.07z) | norm 1.3841 (-0.07z) | lr 1.00e-03 | 799305.08 ms | 60.0% bf16 MFU | 429 tok/s
-2026-05-04 14:51:10 | step  680/10000 | loss 3.885223 (+0.01z) | norm 1.2585 (+0.01z) | lr 1.00e-03 | 811883.23 ms | 60.0% bf16 MFU | 429 tok/s
-2026-05-04 14:51:23 | step  690/10000 | loss 3.979385 (+0.09z) | norm 1.3379 (+0.09z) | lr 1.00e-03 | 824136.44 ms | 60.0% bf16 MFU | 429 tok/s
-2026-05-04 14:51:35 | step  700/10000 | loss 3.971829 (-0.01z) | norm 1.4648 (-0.01z) | lr 1.00e-03 | 836339.25 ms | 60.0% bf16 MFU | 429 tok/s
-2026-05-04 14:51:46 | step  710/10000 | loss 3.930426 (-0.04z) | norm 1.3554 (-0.04z) | lr 1.00e-03 | 847832.00 ms | 60.0% bf16 MFU | 429 tok/s
-2026-05-04 14:51:59 | step  720/10000 | loss 3.875196 (-0.06z) | norm 1.3765 (-0.06z) | lr 1.00e-03 | 860340.49 ms | 60.0% bf16 MFU | 429 tok/s
-2026-05-04 14:52:11 | step  730/10000 | loss 3.805929 (-0.07z) | norm 1.2464 (-0.07z) | lr 1.00e-03 | 872834.14 ms | 60.0% bf16 MFU | 428 tok/s
-2026-05-04 14:52:24 | step  740/10000 | loss 3.820781 (+0.01z) | norm 1.3040 (+0.01z) | lr 1.00e-03 | 885535.54 ms | 60.0% bf16 MFU | 428 tok/s
-2026-05-04 14:52:36 | step  750/10000 | loss 3.887057 (+0.07z) | norm 1.4763 (+0.07z) | lr 1.00e-03 | 897927.10 ms | 60.0% bf16 MFU | 428 tok/s
-2026-05-04 14:52:49 | step  760/10000 | loss 3.885127 (-0.00z) | norm 1.3122 (-0.00z) | lr 1.00e-03 | 910325.22 ms | 60.0% bf16 MFU | 428 tok/s
-2026-05-04 14:53:01 | step  770/10000 | loss 3.855359 (-0.03z) | norm 1.3701 (-0.03z) | lr 1.00e-03 | 922822.70 ms | 60.0% bf16 MFU | 427 tok/s
-2026-05-04 14:53:13 | step  780/10000 | loss 3.806393 (-0.05z) | norm 1.2119 (-0.05z) | lr 1.00e-03 | 934724.86 ms | 60.0% bf16 MFU | 427 tok/s
-2026-05-04 14:53:25 | step  790/10000 | loss 3.878775 (+0.07z) | norm 1.2416 (+0.07z) | lr 1.00e-03 | 946057.87 ms | 60.0% bf16 MFU | 428 tok/s
-2026-05-04 14:53:36 | step  800/10000 | loss 3.814861 (-0.06z) | norm 1.2754 (-0.06z) | lr 1.00e-03 | 957226.34 ms | 60.0% bf16 MFU | 428 tok/s
-2026-05-04 14:53:47 | step  810/10000 | loss 3.843380 (+0.03z) | norm 1.3366 (+0.03z) | lr 1.00e-03 | 968530.22 ms | 60.0% bf16 MFU | 428 tok/s
-2026-05-04 14:53:58 | step  820/10000 | loss 3.813046 (-0.03z) | norm 1.3156 (-0.03z) | lr 1.00e-03 | 979751.65 ms | 60.0% bf16 MFU | 429 tok/s
-2026-05-04 14:54:10 | step  830/10000 | loss 3.817100 (+0.00z) | norm 1.2097 (+0.00z) | lr 1.00e-03 | 991107.76 ms | 60.0% bf16 MFU | 429 tok/s
-2026-05-04 14:54:21 | step  840/10000 | loss 3.758600 (-0.06z) | norm 1.1792 (-0.06z) | lr 1.00e-03 | 1002466.55 ms | 60.0% bf16 MFU | 429 tok/s
-2026-05-04 14:54:33 | step  850/10000 | loss 3.728361 (-0.03z) | norm 1.3284 (-0.03z) | lr 1.00e-03 | 1014102.66 ms | 60.0% bf16 MFU | 429 tok/s
-2026-05-04 14:54:44 | step  860/10000 | loss 3.865416 (+0.14z) | norm 1.2994 (+0.14z) | lr 1.00e-03 | 1025333.78 ms | 60.0% bf16 MFU | 429 tok/s
-2026-05-04 14:54:56 | step  870/10000 | loss 3.845677 (-0.02z) | norm 1.4581 (-0.02z) | lr 1.00e-03 | 1037310.93 ms | 60.0% bf16 MFU | 429 tok/s
-2026-05-04 14:55:07 | step  880/10000 | loss 3.759125 (-0.09z) | norm 1.3152 (-0.09z) | lr 1.00e-03 | 1048680.43 ms | 60.0% bf16 MFU | 430 tok/s
-2026-05-04 14:55:18 | step  890/10000 | loss 3.818029 (+0.06z) | norm 1.2441 (+0.06z) | lr 1.00e-03 | 1059962.26 ms | 60.0% bf16 MFU | 430 tok/s
-2026-05-04 14:55:30 | step  900/10000 | loss 3.798479 (-0.02z) | norm 1.3269 (-0.02z) | lr 1.00e-03 | 1071388.61 ms | 60.0% bf16 MFU | 430 tok/s
-2026-05-04 14:55:41 | step  910/10000 | loss 3.785151 (-0.01z) | norm 1.5816 (-0.01z) | lr 1.00e-03 | 1082672.00 ms | 60.0% bf16 MFU | 430 tok/s
-2026-05-04 14:55:53 | step  920/10000 | loss 3.695237 (-0.09z) | norm 1.4505 (-0.09z) | lr 1.00e-03 | 1094033.71 ms | 60.0% bf16 MFU | 431 tok/s
-2026-05-04 14:56:05 | step  930/10000 | loss 3.798957 (+0.10z) | norm 1.4444 (+0.10z) | lr 1.00e-03 | 1106073.78 ms | 60.0% bf16 MFU | 430 tok/s
-2026-05-04 14:56:16 | step  940/10000 | loss 3.737733 (-0.06z) | norm 1.2665 (-0.06z) | lr 1.00e-03 | 1117396.12 ms | 60.0% bf16 MFU | 431 tok/s
-2026-05-04 14:56:27 | step  950/10000 | loss 3.741677 (+0.00z) | norm 1.3833 (+0.00z) | lr 1.00e-03 | 1128762.30 ms | 60.0% bf16 MFU | 431 tok/s
-2026-05-04 14:56:39 | step  960/10000 | loss 3.827510 (+0.09z) | norm 1.3797 (+0.09z) | lr 1.00e-03 | 1140180.91 ms | 60.0% bf16 MFU | 431 tok/s
-2026-05-04 14:56:50 | step  970/10000 | loss 3.766584 (-0.06z) | norm 1.4485 (-0.06z) | lr 1.00e-03 | 1151665.83 ms | 60.0% bf16 MFU | 431 tok/s
-2026-05-04 14:57:02 | step  980/10000 | loss 3.741104 (-0.03z) | norm 1.2647 (-0.03z) | lr 1.00e-03 | 1163232.50 ms | 60.0% bf16 MFU | 431 tok/s
-2026-05-04 14:57:14 | step  990/10000 | loss 3.694165 (-0.05z) | norm 1.3785 (-0.05z) | lr 1.00e-03 | 1175907.71 ms | 60.0% bf16 MFU | 431 tok/s
-2026-05-04 14:57:27 | step 1000/10000 | loss 3.651606 (-0.04z) | norm 1.2958 (-0.04z) | lr 1.00e-03 | 1188369.28 ms | 60.0% bf16 MFU | 431 tok/s
-2026-05-04 14:57:40 | step 1010/10000 | loss 3.781113 (+0.13z) | norm 1.3492 (+0.13z) | lr 1.00e-03 | 1201704.79 ms | 60.0% bf16 MFU | 430 tok/s
-2026-05-04 14:57:59 | step 1020/10000 | loss 3.705041 (-0.08z) | norm 1.4179 (-0.08z) | lr 1.00e-03 | 1220313.50 ms | 60.0% bf16 MFU | 428 tok/s
-2026-05-04 14:58:13 | step 1030/10000 | loss 3.704241 (-0.00z) | norm 1.3897 (-0.00z) | lr 1.00e-03 | 1234424.72 ms | 60.0% bf16 MFU | 427 tok/s
-2026-05-04 14:58:26 | step 1040/10000 | loss 3.750603 (+0.05z) | norm 1.4084 (+0.05z) | lr 1.00e-03 | 1247238.88 ms | 60.0% bf16 MFU | 427 tok/s
-2026-05-04 14:58:39 | step 1050/10000 | loss 3.779020 (+0.03z) | norm 1.4278 (+0.03z) | lr 1.00e-03 | 1260226.59 ms | 60.0% bf16 MFU | 426 tok/s
-2026-05-04 14:58:52 | step 1060/10000 | loss 3.653060 (-0.13z) | norm 1.5602 (-0.13z) | lr 1.00e-03 | 1273200.17 ms | 60.0% bf16 MFU | 426 tok/s
-2026-05-04 14:59:06 | step 1070/10000 | loss 3.720659 (+0.07z) | norm 1.3283 (+0.07z) | lr 1.00e-03 | 1287073.90 ms | 60.0% bf16 MFU | 426 tok/s
-2026-05-04 14:59:18 | step 1080/10000 | loss 3.663023 (-0.06z) | norm 1.1879 (-0.06z) | lr 1.00e-03 | 1299751.11 ms | 60.0% bf16 MFU | 425 tok/s
-2026-05-04 14:59:31 | step 1090/10000 | loss 3.656892 (-0.01z) | norm 1.4168 (-0.01z) | lr 1.00e-03 | 1312365.84 ms | 60.0% bf16 MFU | 425 tok/s
-2026-05-04 14:59:46 | step 1100/10000 | loss 3.617486 (-0.04z) | norm 1.3855 (-0.04z) | lr 1.00e-03 | 1327086.33 ms | 60.0% bf16 MFU | 424 tok/s
-2026-05-04 14:59:59 | step 1110/10000 | loss 3.671275 (+0.05z) | norm 1.4076 (+0.05z) | lr 1.00e-03 | 1340855.86 ms | 60.0% bf16 MFU | 424 tok/s
-2026-05-04 15:00:12 | step 1120/10000 | loss 3.671899 (+0.00z) | norm 1.3070 (+0.00z) | lr 1.00e-03 | 1353739.44 ms | 60.0% bf16 MFU | 423 tok/s
-2026-05-04 15:00:24 | step 1130/10000 | loss 3.640059 (-0.03z) | norm 1.3248 (-0.03z) | lr 1.00e-03 | 1365775.88 ms | 60.0% bf16 MFU | 423 tok/s
-2026-05-04 15:00:36 | step 1140/10000 | loss 3.717279 (+0.08z) | norm 1.3523 (+0.08z) | lr 1.00e-03 | 1377873.36 ms | 60.0% bf16 MFU | 423 tok/s
-2026-05-04 15:00:49 | step 1150/10000 | loss 3.663347 (-0.05z) | norm 1.3193 (-0.05z) | lr 1.00e-03 | 1390458.33 ms | 60.0% bf16 MFU | 423 tok/s
-2026-05-04 15:01:02 | step 1160/10000 | loss 3.681363 (+0.02z) | norm 1.2806 (+0.02z) | lr 1.00e-03 | 1403391.90 ms | 60.0% bf16 MFU | 423 tok/s
-2026-05-04 15:01:16 | step 1170/10000 | loss 3.641992 (-0.04z) | norm 1.3507 (-0.04z) | lr 1.00e-03 | 1417206.51 ms | 60.0% bf16 MFU | 423 tok/s
-2026-05-04 15:01:29 | step 1180/10000 | loss 3.710226 (+0.07z) | norm 1.3113 (+0.07z) | lr 1.00e-03 | 1430015.60 ms | 60.0% bf16 MFU | 422 tok/s
-2026-05-04 15:01:41 | step 1190/10000 | loss 3.662544 (-0.05z) | norm 1.3078 (-0.05z) | lr 1.00e-03 | 1442333.83 ms | 60.0% bf16 MFU | 422 tok/s
-2026-05-04 15:01:53 | step 1200/10000 | loss 3.606754 (-0.06z) | norm 1.2411 (-0.06z) | lr 1.00e-03 | 1454755.98 ms | 60.0% bf16 MFU | 422 tok/s
-2026-05-04 15:02:06 | step 1210/10000 | loss 3.602933 (-0.00z) | norm 1.4793 (-0.00z) | lr 1.00e-03 | 1467072.70 ms | 60.0% bf16 MFU | 422 tok/s
-2026-05-04 15:02:19 | step 1220/10000 | loss 3.634384 (+0.03z) | norm 1.2908 (+0.03z) | lr 1.00e-03 | 1480145.41 ms | 60.0% bf16 MFU | 422 tok/s
-2026-05-04 15:02:31 | step 1230/10000 | loss 3.621403 (-0.01z) | norm 1.4949 (-0.01z) | lr 1.00e-03 | 1492855.24 ms | 60.0% bf16 MFU | 422 tok/s
-2026-05-04 15:02:45 | step 1240/10000 | loss 3.651283 (+0.03z) | norm 1.4613 (+0.03z) | lr 1.00e-03 | 1506324.47 ms | 60.0% bf16 MFU | 421 tok/s
-2026-05-04 15:02:58 | step 1250/10000 | loss 3.650946 (-0.00z) | norm 1.4826 (-0.00z) | lr 1.00e-03 | 1519560.98 ms | 60.0% bf16 MFU | 421 tok/s
-2026-05-04 15:03:13 | step 1260/10000 | loss 3.570591 (-0.08z) | norm 1.4759 (-0.08z) | lr 1.00e-03 | 1534242.02 ms | 60.0% bf16 MFU | 420 tok/s
-2026-05-04 15:03:25 | step 1270/10000 | loss 3.596807 (+0.03z) | norm 1.2778 (+0.03z) | lr 1.00e-03 | 1546893.75 ms | 60.0% bf16 MFU | 420 tok/s
-2026-05-04 15:03:39 | step 1280/10000 | loss 3.660119 (+0.06z) | norm 1.4496 (+0.06z) | lr 1.00e-03 | 1560958.53 ms | 60.0% bf16 MFU | 420 tok/s
-2026-05-04 15:03:53 | step 1290/10000 | loss 3.613994 (-0.05z) | norm 1.4903 (-0.05z) | lr 1.00e-03 | 1574299.16 ms | 60.0% bf16 MFU | 419 tok/s
-2026-05-04 15:04:05 | step 1300/10000 | loss 3.640958 (+0.03z) | norm 1.2254 (+0.03z) | lr 1.00e-03 | 1586640.36 ms | 60.0% bf16 MFU | 419 tok/s
-2026-05-04 15:04:17 | step 1310/10000 | loss 3.610671 (-0.03z) | norm 1.3232 (-0.03z) | lr 1.00e-03 | 1598397.77 ms | 60.0% bf16 MFU | 419 tok/s
-2026-05-04 15:04:29 | step 1320/10000 | loss 3.558076 (-0.05z) | norm 1.3751 (-0.05z) | lr 1.00e-03 | 1610227.12 ms | 60.0% bf16 MFU | 420 tok/s
-2026-05-04 15:04:43 | step 1330/10000 | loss 3.590876 (+0.03z) | norm 1.3819 (+0.03z) | lr 1.00e-03 | 1623948.06 ms | 60.0% bf16 MFU | 419 tok/s
-2026-05-04 15:04:58 | step 1340/10000 | loss 3.603198 (+0.01z) | norm 1.4738 (+0.01z) | lr 1.00e-03 | 1639464.35 ms | 60.0% bf16 MFU | 418 tok/s
-2026-05-04 15:05:15 | step 1350/10000 | loss 3.527039 (-0.08z) | norm 1.4526 (-0.08z) | lr 1.00e-03 | 1656036.60 ms | 60.0% bf16 MFU | 417 tok/s
-2026-05-04 15:05:30 | step 1360/10000 | loss 3.491501 (-0.04z) | norm 1.4562 (-0.04z) | lr 1.00e-03 | 1671233.26 ms | 60.0% bf16 MFU | 416 tok/s
-2026-05-04 15:05:45 | step 1370/10000 | loss 3.606274 (+0.11z) | norm 1.5188 (+0.11z) | lr 1.00e-03 | 1686754.62 ms | 60.0% bf16 MFU | 416 tok/s
-2026-05-04 15:06:01 | step 1380/10000 | loss 3.578838 (-0.03z) | norm 1.4632 (-0.03z) | lr 1.00e-03 | 1702479.87 ms | 60.0% bf16 MFU | 415 tok/s
-2026-05-04 15:06:18 | step 1390/10000 | loss 3.509110 (-0.07z) | norm 1.5313 (-0.07z) | lr 1.00e-03 | 1719155.33 ms | 60.0% bf16 MFU | 414 tok/s
-2026-05-04 15:06:34 | step 1400/10000 | loss 3.532950 (+0.02z) | norm 1.4829 (+0.02z) | lr 1.00e-03 | 1735055.72 ms | 60.0% bf16 MFU | 413 tok/s
-2026-05-04 15:06:50 | step 1410/10000 | loss 3.592101 (+0.06z) | norm 1.3624 (+0.06z) | lr 1.00e-03 | 1751615.59 ms | 60.0% bf16 MFU | 412 tok/s
-2026-05-04 15:07:05 | step 1420/10000 | loss 3.586413 (-0.01z) | norm 1.3034 (-0.01z) | lr 1.00e-03 | 1766251.03 ms | 60.0% bf16 MFU | 411 tok/s
-2026-05-04 15:07:20 | step 1430/10000 | loss 3.592228 (+0.01z) | norm 1.3970 (+0.01z) | lr 1.00e-03 | 1781304.57 ms | 60.0% bf16 MFU | 411 tok/s
-2026-05-04 15:07:36 | step 1440/10000 | loss 3.534686 (-0.06z) | norm 1.4541 (-0.06z) | lr 1.00e-03 | 1797230.26 ms | 60.0% bf16 MFU | 410 tok/s
-2026-05-04 15:07:53 | step 1450/10000 | loss 3.447916 (-0.09z) | norm 1.4672 (-0.09z) | lr 1.00e-03 | 1814545.96 ms | 60.0% bf16 MFU | 409 tok/s
-2026-05-04 15:08:10 | step 1460/10000 | loss 3.591452 (+0.14z) | norm 1.3419 (+0.14z) | lr 1.00e-03 | 1831024.54 ms | 60.0% bf16 MFU | 408 tok/s
-2026-05-04 15:08:27 | step 1470/10000 | loss 3.530718 (-0.06z) | norm 1.3372 (-0.06z) | lr 1.00e-03 | 1848398.65 ms | 60.0% bf16 MFU | 407 tok/s
-2026-05-04 15:08:42 | step 1480/10000 | loss 3.579262 (+0.05z) | norm 1.4897 (+0.05z) | lr 1.00e-03 | 1863153.36 ms | 60.0% bf16 MFU | 406 tok/s
-2026-05-04 15:08:58 | step 1490/10000 | loss 3.562484 (-0.02z) | norm 1.3123 (-0.02z) | lr 1.00e-03 | 1879157.64 ms | 60.0% bf16 MFU | 406 tok/s
-2026-05-04 15:09:13 | step 1500/10000 | loss 3.517546 (-0.04z) | norm 1.3604 (-0.04z) | lr 1.00e-03 | 1894579.29 ms | 60.0% bf16 MFU | 405 tok/s
-2026-05-04 15:09:29 | step 1510/10000 | loss 3.543605 (+0.03z) | norm 1.4959 (+0.03z) | lr 1.00e-03 | 1909997.35 ms | 60.0% bf16 MFU | 405 tok/s
-2026-05-04 15:09:44 | step 1520/10000 | loss 3.571496 (+0.03z) | norm 1.4026 (+0.03z) | lr 1.00e-03 | 1925129.54 ms | 60.0% bf16 MFU | 404 tok/s
-2026-05-04 15:09:59 | step 1530/10000 | loss 3.541286 (-0.03z) | norm 1.4666 (-0.03z) | lr 1.00e-03 | 1940954.96 ms | 60.0% bf16 MFU | 403 tok/s
-2026-05-04 15:10:17 | step 1540/10000 | loss 3.445838 (-0.10z) | norm 1.5903 (-0.10z) | lr 1.00e-03 | 1958463.18 ms | 60.0% bf16 MFU | 402 tok/s
-2026-05-04 15:10:32 | step 1550/10000 | loss 3.461990 (+0.02z) | norm 1.4139 (+0.02z) | lr 1.00e-03 | 1973715.68 ms | 60.0% bf16 MFU | 402 tok/s
-2026-05-04 15:10:48 | step 1560/10000 | loss 3.492140 (+0.03z) | norm 1.4321 (+0.03z) | lr 1.00e-03 | 1989484.47 ms | 60.0% bf16 MFU | 401 tok/s
-2026-05-04 15:11:02 | step 1570/10000 | loss 3.484315 (-0.01z) | norm 1.4711 (-0.01z) | lr 1.00e-03 | 2003631.84 ms | 60.0% bf16 MFU | 401 tok/s
-2026-05-04 15:11:17 | step 1580/10000 | loss 3.513388 (+0.03z) | norm 1.5235 (+0.03z) | lr 1.00e-03 | 2018383.24 ms | 60.0% bf16 MFU | 401 tok/s
-2026-05-04 15:11:33 | step 1590/10000 | loss 3.473915 (-0.04z) | norm 1.3386 (-0.04z) | lr 1.00e-03 | 2034189.70 ms | 60.0% bf16 MFU | 400 tok/s
-2026-05-04 15:11:49 | step 1600/10000 | loss 3.526958 (+0.05z) | norm 1.4742 (+0.05z) | lr 1.00e-03 | 2050324.52 ms | 60.0% bf16 MFU | 399 tok/s
-2026-05-04 15:12:03 | step 1610/10000 | loss 3.504819 (-0.02z) | norm 1.5160 (-0.02z) | lr 1.00e-03 | 2064785.38 ms | 60.0% bf16 MFU | 399 tok/s
-2026-05-04 15:12:17 | step 1620/10000 | loss 3.550168 (+0.05z) | norm 1.4029 (+0.05z) | lr 1.00e-03 | 2078497.35 ms | 60.0% bf16 MFU | 399 tok/s
-2026-05-04 15:12:31 | step 1630/10000 | loss 3.523181 (-0.03z) | norm 1.4476 (-0.03z) | lr 1.00e-03 | 2092467.59 ms | 60.0% bf16 MFU | 399 tok/s
-2026-05-04 15:12:47 | step 1640/10000 | loss 3.455931 (-0.07z) | norm 1.5301 (-0.07z) | lr 1.00e-03 | 2108170.66 ms | 60.0% bf16 MFU | 398 tok/s
-2026-05-04 15:13:03 | step 1650/10000 | loss 3.569903 (+0.11z) | norm 1.5848 (+0.11z) | lr 1.00e-03 | 2124203.70 ms | 60.0% bf16 MFU | 397 tok/s
-2026-05-04 15:13:18 | step 1660/10000 | loss 3.561064 (-0.01z) | norm 1.4298 (-0.01z) | lr 1.00e-03 | 2139937.55 ms | 60.0% bf16 MFU | 397 tok/s
-2026-05-04 15:13:35 | step 1670/10000 | loss 3.510484 (-0.05z) | norm 1.4331 (-0.05z) | lr 1.00e-03 | 2156882.64 ms | 60.0% bf16 MFU | 396 tok/s
-2026-05-04 15:13:51 | step 1680/10000 | loss 3.469988 (-0.04z) | norm 1.5686 (-0.04z) | lr 1.00e-03 | 2172113.22 ms | 60.0% bf16 MFU | 396 tok/s
-2026-05-04 15:14:06 | step 1690/10000 | loss 3.470391 (+0.00z) | norm 1.5732 (+0.00z) | lr 1.00e-03 | 2187108.58 ms | 60.0% bf16 MFU | 395 tok/s
-2026-05-04 15:14:20 | step 1700/10000 | loss 3.457046 (-0.01z) | norm 1.4568 (-0.01z) | lr 1.00e-03 | 2201358.14 ms | 60.0% bf16 MFU | 395 tok/s
-2026-05-04 15:14:35 | step 1710/10000 | loss 3.501427 (+0.04z) | norm 1.4765 (+0.04z) | lr 1.00e-03 | 2216452.30 ms | 60.0% bf16 MFU | 395 tok/s
-2026-05-04 15:14:50 | step 1720/10000 | loss 3.457146 (-0.04z) | norm 1.4784 (-0.04z) | lr 1.00e-03 | 2231426.41 ms | 60.0% bf16 MFU | 394 tok/s
-2026-05-04 15:15:06 | step 1730/10000 | loss 3.479253 (+0.02z) | norm 1.5213 (+0.02z) | lr 1.00e-03 | 2247599.12 ms | 60.0% bf16 MFU | 394 tok/s
-2026-05-04 15:15:22 | step 1740/10000 | loss 3.428710 (-0.05z) | norm 1.5679 (-0.05z) | lr 1.00e-03 | 2263109.90 ms | 60.0% bf16 MFU | 393 tok/s
-2026-05-04 15:15:36 | step 1750/10000 | loss 3.488854 (+0.06z) | norm 1.6920 (+0.06z) | lr 1.00e-03 | 2277931.00 ms | 60.0% bf16 MFU | 393 tok/s
-2026-05-04 15:15:51 | step 1760/10000 | loss 3.459068 (-0.03z) | norm 1.4861 (-0.03z) | lr 1.00e-03 | 2292386.69 ms | 60.0% bf16 MFU | 393 tok/s
-2026-05-04 15:16:06 | step 1770/10000 | loss 3.425840 (-0.03z) | norm 1.4342 (-0.03z) | lr 1.00e-03 | 2307471.61 ms | 60.0% bf16 MFU | 392 tok/s
-2026-05-04 15:16:22 | step 1780/10000 | loss 3.439773 (+0.01z) | norm 1.5527 (+0.01z) | lr 1.00e-03 | 2323683.36 ms | 60.0% bf16 MFU | 392 tok/s
-2026-05-04 15:16:37 | step 1790/10000 | loss 3.505554 (+0.07z) | norm 1.3507 (+0.07z) | lr 1.00e-03 | 2338284.43 ms | 60.0% bf16 MFU | 392 tok/s
-2026-05-04 15:16:51 | step 1800/10000 | loss 3.509921 (+0.00z) | norm 1.4539 (+0.00z) | lr 1.00e-03 | 2352339.26 ms | 60.0% bf16 MFU | 391 tok/s
-2026-05-04 15:17:06 | step 1810/10000 | loss 3.463483 (-0.05z) | norm 1.5144 (-0.05z) | lr 1.00e-03 | 2367052.74 ms | 60.0% bf16 MFU | 391 tok/s
-2026-05-04 15:17:20 | step 1820/10000 | loss 3.507226 (+0.04z) | norm 1.4693 (+0.04z) | lr 1.00e-03 | 2381221.15 ms | 60.0% bf16 MFU | 391 tok/s
-2026-05-04 15:17:35 | step 1830/10000 | loss 3.396090 (-0.11z) | norm 1.5541 (-0.11z) | lr 1.00e-03 | 2396625.23 ms | 60.0% bf16 MFU | 391 tok/s
-2026-05-04 15:17:50 | step 1840/10000 | loss 3.503596 (+0.11z) | norm 1.4478 (+0.11z) | lr 1.00e-03 | 2411145.61 ms | 60.0% bf16 MFU | 390 tok/s
-2026-05-04 15:18:04 | step 1850/10000 | loss 3.377811 (-0.13z) | norm 1.5108 (-0.13z) | lr 1.00e-03 | 2425277.79 ms | 60.0% bf16 MFU | 390 tok/s
-2026-05-04 15:18:19 | step 1860/10000 | loss 3.401505 (+0.02z) | norm 1.5041 (+0.02z) | lr 1.00e-03 | 2440351.55 ms | 60.0% bf16 MFU | 390 tok/s
-2026-05-04 15:18:33 | step 1870/10000 | loss 3.407108 (+0.01z) | norm 1.5940 (+0.01z) | lr 1.00e-03 | 2454065.07 ms | 60.0% bf16 MFU | 390 tok/s
-2026-05-04 15:18:47 | step 1880/10000 | loss 3.447253 (+0.04z) | norm 1.3309 (+0.04z) | lr 1.00e-03 | 2468754.53 ms | 60.0% bf16 MFU | 390 tok/s
-2026-05-04 15:19:02 | step 1890/10000 | loss 3.453714 (+0.01z) | norm 1.3862 (+0.01z) | lr 1.00e-03 | 2483345.48 ms | 60.0% bf16 MFU | 389 tok/s
-2026-05-04 15:19:24 | step 1900/10000 | loss 3.435635 (-0.02z) | norm 1.4998 (-0.02z) | lr 1.00e-03 | 2505086.11 ms | 60.0% bf16 MFU | 388 tok/s
-2026-05-04 15:19:46 | step 1910/10000 | loss 3.407151 (-0.03z) | norm 1.5915 (-0.03z) | lr 1.00e-03 | 2527204.95 ms | 60.0% bf16 MFU | 387 tok/s
-2026-05-04 15:20:03 | step 1920/10000 | loss 3.327791 (-0.08z) | norm 1.5593 (-0.08z) | lr 1.00e-03 | 2544182.45 ms | 60.0% bf16 MFU | 386 tok/s
-2026-05-04 15:20:18 | step 1930/10000 | loss 3.421272 (+0.09z) | norm 1.4553 (+0.09z) | lr 1.00e-03 | 2559337.24 ms | 60.0% bf16 MFU | 386 tok/s
-2026-05-04 15:20:35 | step 1940/10000 | loss 3.380475 (-0.04z) | norm 1.4170 (-0.04z) | lr 1.00e-03 | 2576952.44 ms | 60.0% bf16 MFU | 385 tok/s
-2026-05-04 15:20:53 | step 1950/10000 | loss 3.366257 (-0.01z) | norm 1.5302 (-0.01z) | lr 1.00e-03 | 2594278.12 ms | 60.0% bf16 MFU | 385 tok/s
-2026-05-04 15:21:13 | step 1960/10000 | loss 3.373919 (+0.01z) | norm 1.5803 (+0.01z) | lr 1.00e-03 | 2614571.89 ms | 60.0% bf16 MFU | 384 tok/s
-2026-05-04 15:21:29 | step 1970/10000 | loss 3.441697 (+0.07z) | norm 1.5353 (+0.07z) | lr 1.00e-03 | 2630852.84 ms | 60.0% bf16 MFU | 383 tok/s
-2026-05-04 15:21:44 | step 1980/10000 | loss 3.435732 (-0.01z) | norm 1.5022 (-0.01z) | lr 1.00e-03 | 2645595.41 ms | 60.0% bf16 MFU | 383 tok/s
-2026-05-04 15:21:59 | step 1990/10000 | loss 3.509201 (+0.07z) | norm 1.4692 (+0.07z) | lr 1.00e-03 | 2660810.33 ms | 60.0% bf16 MFU | 383 tok/s
-2026-05-04 15:22:15 | step 2000/10000 | loss 3.474214 (-0.03z) | norm 1.5991 (-0.03z) | lr 1.00e-03 | 2676714.69 ms | 60.0% bf16 MFU | 382 tok/s
-2026-05-04 15:22:32 | step 2010/10000 | loss 3.392986 (-0.08z) | norm 1.6101 (-0.08z) | lr 1.00e-03 | 2693812.83 ms | 60.0% bf16 MFU | 382 tok/s
-2026-05-04 15:22:48 | step 2020/10000 | loss 3.497144 (+0.10z) | norm 1.4729 (+0.10z) | lr 1.00e-03 | 2709797.93 ms | 60.0% bf16 MFU | 381 tok/s
-2026-05-04 15:23:04 | step 2030/10000 | loss 3.431588 (-0.07z) | norm 1.3984 (-0.07z) | lr 1.00e-03 | 2725728.32 ms | 60.0% bf16 MFU | 381 tok/s
-2026-05-04 15:23:22 | step 2040/10000 | loss 3.386894 (-0.04z) | norm 1.4854 (-0.04z) | lr 1.00e-03 | 2743141.56 ms | 60.0% bf16 MFU | 380 tok/s
-2026-05-04 15:23:38 | step 2050/10000 | loss 3.425443 (+0.04z) | norm 1.4933 (+0.04z) | lr 1.00e-03 | 2759448.09 ms | 60.0% bf16 MFU | 380 tok/s
-2026-05-04 15:23:54 | step 2060/10000 | loss 3.435753 (+0.01z) | norm 1.4518 (+0.01z) | lr 1.00e-03 | 2775228.15 ms | 60.0% bf16 MFU | 380 tok/s
-2026-05-04 15:24:13 | step 2070/10000 | loss 3.330579 (-0.11z) | norm 1.4320 (-0.11z) | lr 1.00e-03 | 2794940.94 ms | 60.0% bf16 MFU | 379 tok/s
-2026-05-04 15:24:32 | step 2080/10000 | loss 3.359337 (+0.03z) | norm 1.4895 (+0.03z) | lr 1.00e-03 | 2813227.23 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:24:48 | step 2090/10000 | loss 3.402071 (+0.04z) | norm 1.4325 (+0.04z) | lr 1.00e-03 | 2829137.16 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:25:03 | step 2100/10000 | loss 3.494185 (+0.09z) | norm 1.4257 (+0.09z) | lr 1.00e-03 | 2843993.40 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:25:18 | step 2110/10000 | loss 3.451862 (-0.04z) | norm 1.7183 (-0.04z) | lr 1.00e-03 | 2859130.55 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:25:32 | step 2120/10000 | loss 3.369636 (-0.08z) | norm 1.4906 (-0.08z) | lr 1.00e-03 | 2873462.58 ms | 60.0% bf16 MFU | 377 tok/s
-2026-05-04 15:25:45 | step 2130/10000 | loss 3.417961 (+0.05z) | norm 1.4327 (+0.05z) | lr 1.00e-03 | 2886842.75 ms | 60.0% bf16 MFU | 377 tok/s
-2026-05-04 15:25:58 | step 2140/10000 | loss 3.414645 (-0.00z) | norm 1.5593 (-0.00z) | lr 1.00e-03 | 2899635.21 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:26:11 | step 2150/10000 | loss 3.399529 (-0.02z) | norm 1.4236 (-0.02z) | lr 1.00e-03 | 2912630.85 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:26:24 | step 2160/10000 | loss 3.463797 (+0.06z) | norm 1.4444 (+0.06z) | lr 1.00e-03 | 2925497.02 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:26:37 | step 2170/10000 | loss 3.484994 (+0.02z) | norm 1.4607 (+0.02z) | lr 1.00e-03 | 2938388.92 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:26:50 | step 2180/10000 | loss 3.353402 (-0.13z) | norm 1.4219 (-0.13z) | lr 1.00e-03 | 2951172.76 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:27:03 | step 2190/10000 | loss 3.294387 (-0.06z) | norm 1.3777 (-0.06z) | lr 1.00e-03 | 2964302.81 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:27:19 | step 2200/10000 | loss 3.368872 (+0.07z) | norm 1.4879 (+0.07z) | lr 1.00e-03 | 2980672.30 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:27:32 | step 2210/10000 | loss 3.394851 (+0.03z) | norm 1.6262 (+0.03z) | lr 1.00e-03 | 2993094.08 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:27:44 | step 2220/10000 | loss 3.360827 (-0.03z) | norm 1.4866 (-0.03z) | lr 1.00e-03 | 3005232.94 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:27:56 | step 2230/10000 | loss 3.446846 (+0.09z) | norm 1.4141 (+0.09z) | lr 1.00e-03 | 3017297.17 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:28:09 | step 2240/10000 | loss 3.380163 (-0.07z) | norm 1.5856 (-0.07z) | lr 1.00e-03 | 3030252.49 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:28:23 | step 2250/10000 | loss 3.424447 (+0.04z) | norm 1.5390 (+0.04z) | lr 1.00e-03 | 3044526.23 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:28:37 | step 2260/10000 | loss 3.360008 (-0.06z) | norm 1.5169 (-0.06z) | lr 1.00e-03 | 3058319.16 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:28:51 | step 2270/10000 | loss 3.392204 (+0.03z) | norm 1.5397 (+0.03z) | lr 1.00e-03 | 3072390.46 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:29:04 | step 2280/10000 | loss 3.446681 (+0.05z) | norm 1.4294 (+0.05z) | lr 1.00e-03 | 3085440.41 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:29:18 | step 2290/10000 | loss 3.336515 (-0.11z) | norm 1.2407 (-0.11z) | lr 1.00e-03 | 3099263.93 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:29:32 | step 2300/10000 | loss 3.377033 (+0.04z) | norm 1.3151 (+0.04z) | lr 1.00e-03 | 3113366.84 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:29:45 | step 2310/10000 | loss 3.404816 (+0.03z) | norm 1.5130 (+0.03z) | lr 1.00e-03 | 3126261.75 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:29:57 | step 2320/10000 | loss 3.378711 (-0.03z) | norm 1.6131 (-0.03z) | lr 1.00e-03 | 3138817.55 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:30:10 | step 2330/10000 | loss 3.291705 (-0.09z) | norm 1.5695 (-0.09z) | lr 1.00e-03 | 3151336.76 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:30:22 | step 2340/10000 | loss 3.304232 (+0.01z) | norm 1.4011 (+0.01z) | lr 1.00e-03 | 3163683.93 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:30:35 | step 2350/10000 | loss 3.401928 (+0.10z) | norm 1.5349 (+0.10z) | lr 1.00e-03 | 3176284.41 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:30:47 | step 2360/10000 | loss 3.340690 (-0.06z) | norm 1.4230 (-0.06z) | lr 1.00e-03 | 3188863.09 ms | 60.0% bf16 MFU | 379 tok/s
-2026-05-04 15:31:00 | step 2370/10000 | loss 3.313061 (-0.03z) | norm 1.3590 (-0.03z) | lr 1.00e-03 | 3201379.13 ms | 60.0% bf16 MFU | 379 tok/s
-2026-05-04 15:31:12 | step 2380/10000 | loss 3.404562 (+0.09z) | norm 1.4416 (+0.09z) | lr 1.00e-03 | 3213921.90 ms | 60.0% bf16 MFU | 379 tok/s
-2026-05-04 15:31:25 | step 2390/10000 | loss 3.286808 (-0.12z) | norm 1.4700 (-0.12z) | lr 1.00e-03 | 3226466.23 ms | 60.0% bf16 MFU | 379 tok/s
-2026-05-04 15:31:38 | step 2400/10000 | loss 3.364665 (+0.08z) | norm 1.4838 (+0.08z) | lr 1.00e-03 | 3239392.84 ms | 60.0% bf16 MFU | 379 tok/s
-2026-05-04 15:31:53 | step 2410/10000 | loss 3.422925 (+0.06z) | norm 1.4412 (+0.06z) | lr 1.00e-03 | 3254281.23 ms | 60.0% bf16 MFU | 379 tok/s
-2026-05-04 15:32:06 | step 2420/10000 | loss 3.485755 (+0.06z) | norm 1.3675 (+0.06z) | lr 1.00e-03 | 3267549.02 ms | 60.0% bf16 MFU | 379 tok/s
-2026-05-04 15:32:20 | step 2430/10000 | loss 3.401863 (-0.08z) | norm 1.5630 (-0.08z) | lr 1.00e-03 | 3281729.99 ms | 60.0% bf16 MFU | 379 tok/s
-2026-05-04 15:32:35 | step 2440/10000 | loss 3.352350 (-0.05z) | norm 1.4517 (-0.05z) | lr 1.00e-03 | 3296049.48 ms | 60.0% bf16 MFU | 379 tok/s
-2026-05-04 15:32:48 | step 2450/10000 | loss 3.336325 (-0.02z) | norm 1.3885 (-0.02z) | lr 1.00e-03 | 3309844.96 ms | 60.0% bf16 MFU | 379 tok/s
-2026-05-04 15:33:02 | step 2460/10000 | loss 3.330453 (-0.01z) | norm 1.4428 (-0.01z) | lr 1.00e-03 | 3323313.44 ms | 60.0% bf16 MFU | 379 tok/s
-2026-05-04 15:33:16 | step 2470/10000 | loss 3.367071 (+0.04z) | norm 1.4417 (+0.04z) | lr 1.00e-03 | 3337044.79 ms | 60.0% bf16 MFU | 379 tok/s
-2026-05-04 15:33:31 | step 2480/10000 | loss 3.336869 (-0.03z) | norm 1.4949 (-0.03z) | lr 1.00e-03 | 3351988.04 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:33:46 | step 2490/10000 | loss 3.306152 (-0.03z) | norm 1.4728 (-0.03z) | lr 1.00e-03 | 3366985.02 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:33:59 | step 2500/10000 | loss 3.294227 (-0.01z) | norm 1.5736 (-0.01z) | lr 1.00e-03 | 3380522.13 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:34:13 | step 2510/10000 | loss 3.251968 (-0.04z) | norm 1.5912 (-0.04z) | lr 1.00e-03 | 3394546.12 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:34:26 | step 2520/10000 | loss 3.302908 (+0.05z) | norm 1.5442 (+0.05z) | lr 1.00e-03 | 3407757.12 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:34:41 | step 2530/10000 | loss 3.300499 (-0.00z) | norm 1.4248 (-0.00z) | lr 1.00e-03 | 3422015.94 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:34:54 | step 2540/10000 | loss 3.330806 (+0.03z) | norm 1.3733 (+0.03z) | lr 1.00e-03 | 3435220.80 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:35:08 | step 2550/10000 | loss 3.368146 (+0.04z) | norm 1.4722 (+0.04z) | lr 1.00e-03 | 3449144.03 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:35:22 | step 2560/10000 | loss 3.312632 (-0.06z) | norm 1.4138 (-0.06z) | lr 1.00e-03 | 3463285.53 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:35:35 | step 2570/10000 | loss 3.331315 (+0.02z) | norm 1.5550 (+0.02z) | lr 1.00e-03 | 3476508.83 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:35:48 | step 2580/10000 | loss 3.377988 (+0.05z) | norm 1.4901 (+0.05z) | lr 1.00e-03 | 3489469.38 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:36:01 | step 2590/10000 | loss 3.383955 (+0.01z) | norm 1.4782 (+0.01z) | lr 1.00e-03 | 3502209.11 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:36:15 | step 2600/10000 | loss 3.355087 (-0.03z) | norm 1.4556 (-0.03z) | lr 1.00e-03 | 3516213.43 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:36:30 | step 2610/10000 | loss 3.341608 (-0.01z) | norm 1.4276 (-0.01z) | lr 1.00e-03 | 3531756.82 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:36:45 | step 2620/10000 | loss 3.381522 (+0.04z) | norm 1.8193 (+0.04z) | lr 1.00e-03 | 3546697.87 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:37:01 | step 2630/10000 | loss 3.305541 (-0.08z) | norm 1.5054 (-0.08z) | lr 1.00e-03 | 3562780.33 ms | 60.0% bf16 MFU | 378 tok/s
-2026-05-04 15:37:21 | step 2640/10000 | loss 3.372554 (+0.07z) | norm 1.5732 (+0.07z) | lr 1.00e-03 | 3582535.63 ms | 60.0% bf16 MFU | 377 tok/s
-2026-05-04 15:37:34 | step 2650/10000 | loss 3.327857 (-0.04z) | norm 1.3704 (-0.04z) | lr 1.00e-03 | 3595828.30 ms | 60.0% bf16 MFU | 377 tok/s
-2026-05-04 15:37:47 | step 2660/10000 | loss 3.308565 (-0.02z) | norm 1.3511 (-0.02z) | lr 1.00e-03 | 3608491.94 ms | 60.0% bf16 MFU | 377 tok/s
-2026-05-04 15:38:00 | step 2670/10000 | loss 3.363048 (+0.05z) | norm 1.4632 (+0.05z) | lr 1.00e-03 | 3621821.63 ms | 60.0% bf16 MFU | 377 tok/s
-2026-05-04 15:38:16 | step 2680/10000 | loss 3.318958 (-0.04z) | norm 1.4825 (-0.04z) | lr 1.00e-03 | 3637059.08 ms | 60.0% bf16 MFU | 377 tok/s
-2026-05-04 15:38:31 | step 2690/10000 | loss 3.284549 (-0.03z) | norm 1.3629 (-0.03z) | lr 1.00e-03 | 3652407.57 ms | 60.0% bf16 MFU | 377 tok/s
diff --git a/engine/main.py b/engine/main.py
index cffb4cb..983e893 100644
--- a/engine/main.py
+++ b/engine/main.py
@@ -10,23 +10,25 @@
 #
 #  LOGGING UTILITIES
 
-W      = 78
+W = 78
 DOUBLE = "=" * W
 SINGLE = "-" * W
-TICK   = "best"
-ARROW  = ">"
+TICK = "best"
+ARROW = ">"
 
 LOG_DIR = Path(__file__).resolve().parent / "logs"
 LOG_DIR.mkdir(parents=True, exist_ok=True)
 LOG_PATH = LOG_DIR / f"run_{time.strftime('%Y%m%d_%H%M%S')}.txt"
 SCRIPT_DIR = Path(__file__).resolve().parent
 
+
 def log(message=""):
     line = "" if message == "" else f"{time.strftime('%Y-%m-%d %H:%M:%S')} | {message}"
     print(line)
     with open(LOG_PATH, "a", encoding="utf-8") as f:
         f.write(f"{line}\n")
 
+
 def header(title, subtitle=""):
     log()
     log(DOUBLE)
@@ -35,51 +37,46 @@ def header(title, subtitle=""):
         log(f"  {subtitle}")
     log(DOUBLE)
 
+
 def row(label, value="", unit="", note=""):
     label_col = f"  {label:<28}"
     value_col = f"{str(value):<20}"
-    unit_col  = f"{unit:<8}"
-    note_col  = f"  {note}" if note else ""
+    unit_col = f"{unit:<8}"
+    note_col = f"  {note}" if note else ""
     log(f"{label_col}{value_col}{unit_col}{note_col}")
 
+
 def rule():   log(f"  {SINGLE}")
 def blank():  log()
 def info(msg):    log(f"  {ARROW}  {msg}")
 def success(msg): log(f"  ok  {msg}")
 
 
- 
 #  SESSION
 
 
-
-log(f"{'Quadtrix-v1.0':^{W}}")
+log(f"{'Quadtrix':^{W}}")
 blank()
-row("Started",  time.strftime('%Y-%m-%d  %H:%M:%S'))
-row("Device",   'CUDA' if torch.cuda.is_available() else 'CPU')
-row("PyTorch",  torch.__version__)
-row("Log file", str(LOG_PATH))
-
 start = time.time()
 
 #  CONFIGURATION
 
 
-cleaned_path  = Path(os.environ.get("data", SCRIPT_DIR / "input.txt"))
-train_split   = 0.9
-seed          = 1337
+cleaned_path = Path(os.environ.get("data", SCRIPT_DIR / "input.txt"))
+train_split = 0.9
+seed = 1337
 
-batch_size    = 16
-block_size    = 32
-max_iters     = 10000
-eval_interval = 10
+batch_size = 16
+block_size = 32
+max_iters = 6000
+eval_interval = 100
 learning_rate = 1e-3
-device        = 'cuda' if torch.cuda.is_available() else 'cpu'
-eval_iters    = 20
-n_embd        = 64
-n_head        = 4
-n_layer       = 4
-dropout       = 0.1
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+eval_iters = 20
+n_embd = 64
+n_head = 4
+n_layer = 4
+dropout = 0.1
 
 torch.manual_seed(seed)
 
@@ -87,38 +84,39 @@ def success(msg): log(f"  ok  {msg}")
 # tokenizer
 
 def get_tokenizer(encoding_name="gpt2"):
-    tokenizer  = tiktoken.get_encoding(encoding_name)
+    tokenizer = tiktoken.get_encoding(encoding_name)
     vocab_size = tokenizer.n_vocab
     return tokenizer, vocab_size
 
+
 def encode(text, tokenizer): return tokenizer.encode(text)
 def decode(tokens, tokenizer): return tokenizer.decode(tokens)
 
 
-
 #  DATA
-
 with open(cleaned_path, 'r', encoding='utf-8') as f:
     text = f.read()
 
 tokenizer, vocab_size = get_tokenizer("gpt2")
-encoded_data          = encode(text, tokenizer)
+encoded_data = encode(text, tokenizer)
 
-data       = torch.tensor(encoded_data, dtype=torch.long)
-n          = int(train_split * len(data))
+data = torch.tensor(encoded_data, dtype=torch.long)
+n = int(train_split * len(data))
 train_data = data[:n]
-val_data   = data[n:]
+val_data = data[n:]
 
 #  Batch and LOSS
 
+
 def get_batch(split):
     data_split = train_data if split == 'train' else val_data
-    ix   = torch.randint(len(data_split) - block_size, (batch_size,))
-    x    = torch.stack([data_split[i:i + block_size]         for i in ix])
-    y    = torch.stack([data_split[i + 1:i + block_size + 1] for i in ix])
+    ix = torch.randint(len(data_split) - block_size, (batch_size,))
+    x = torch.stack([data_split[i:i + block_size] for i in ix])
+    y = torch.stack([data_split[i + 1:i + block_size + 1] for i in ix])
     x, y = x.to(device), y.to(device)
     return x, y
 
+
 @torch.no_grad()
 def estimate_loss():
     out = {}
@@ -126,8 +124,8 @@ def estimate_loss():
     for split in ['train', 'val']:
         losses = torch.zeros(eval_iters)
         for k in range(eval_iters):
-            X, Y      = get_batch(split)
-            _, loss   = model(X, Y)
+            X, Y = get_batch(split)
+            _, loss = model(X, Y)
             losses[k] = loss.item()
         out[split] = losses.mean()
     model.train()
@@ -135,36 +133,40 @@ def estimate_loss():
 
 # model
 
+
 class Head(nn.Module):
     def __init__(self, head_size):
         super().__init__()
-        self.key   = nn.Linear(n_embd, head_size, bias=False)
+        self.key = nn.Linear(n_embd, head_size, bias=False)
         self.query = nn.Linear(n_embd, head_size, bias=False)
         self.value = nn.Linear(n_embd, head_size, bias=False)
-        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
+        self.register_buffer('tril', torch.tril(
+            torch.ones(block_size, block_size)))
         self.dropout = nn.Dropout(dropout)
 
     def forward(self, x):
         B, T, C = x.shape
-        k   = self.key(x)
-        q   = self.query(x)
+        k = self.key(x)
+        q = self.query(x)
         wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
         wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
         wei = F.softmax(wei, dim=-1)
         wei = self.dropout(wei)
         return wei @ self.value(x)
 
+
 class MultiHeadAttention(nn.Module):
     def __init__(self, num_heads, head_size):
         super().__init__()
-        self.heads   = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
-        self.proj    = nn.Linear(head_size * num_heads, n_embd)
+        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
+        self.proj = nn.Linear(head_size * num_heads, n_embd)
         self.dropout = nn.Dropout(dropout)
 
     def forward(self, x):
         out = torch.cat([h(x) for h in self.heads], dim=-1)
         return self.dropout(self.proj(out))
 
+
 class FeedFoward(nn.Module):
     def __init__(self, n_embd):
         super().__init__()
@@ -178,28 +180,31 @@ def __init__(self, n_embd):
     def forward(self, x):
         return self.net(x)
 
+
 class Block(nn.Module):
     def __init__(self, n_embd, n_head):
         super().__init__()
         head_size = n_embd // n_head
-        self.sa   = MultiHeadAttention(n_head, head_size)
+        self.sa = MultiHeadAttention(n_head, head_size)
         self.ffwd = FeedFoward(n_embd)
-        self.ln1  = nn.LayerNorm(n_embd)
-        self.ln2  = nn.LayerNorm(n_embd)
+        self.ln1 = nn.LayerNorm(n_embd)
+        self.ln2 = nn.LayerNorm(n_embd)
 
     def forward(self, x):
         x = x + self.sa(self.ln1(x))
         x = x + self.ffwd(self.ln2(x))
         return x
 
+
 class GPTLanguageModel(nn.Module):
     def __init__(self):
         super().__init__()
-        self.token_embedding_table    = nn.Embedding(vocab_size, n_embd)
+        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
         self.position_embedding_table = nn.Embedding(block_size, n_embd)
-        self.blocks                   = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
-        self.ln_f                     = nn.LayerNorm(n_embd)
-        self.lm_head                  = nn.Linear(n_embd, vocab_size)
+        self.blocks = nn.Sequential(
+            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
+        self.ln_f = nn.LayerNorm(n_embd)
+        self.lm_head = nn.Linear(n_embd, vocab_size)
         self.apply(self._init_weights)
 
     def _init_weights(self, module):
@@ -211,118 +216,135 @@ def _init_weights(self, module):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
 
     def forward(self, idx, targets=None):
-        B, T    = idx.shape
+        B, T = idx.shape
         tok_emb = self.token_embedding_table(idx)
         pos_emb = self.position_embedding_table(torch.arange(T, device=device))
-        x       = tok_emb + pos_emb
-        x       = self.blocks(x)
-        x       = self.ln_f(x)
-        logits  = self.lm_head(x)
+        x = tok_emb + pos_emb
+        x = self.blocks(x)
+        x = self.ln_f(x)
+        logits = self.lm_head(x)
 
         if targets is None:
             loss = None
         else:
             B, T, C = logits.shape
-            logits  = logits.view(B * T, C)
+            logits = logits.view(B * T, C)
             targets = targets.view(B * T)
-            loss    = F.cross_entropy(logits, targets)
+            loss = F.cross_entropy(logits, targets)
         return logits, loss
 
     def generate(self, idx, max_new_tokens):
         for _ in range(max_new_tokens):
-            idx_cond  = idx[:, -block_size:]
+            idx_cond = idx[:, -block_size:]
             logits, _ = self(idx_cond)
-            logits    = logits[:, -1, :]
-            probs     = F.softmax(logits, dim=-1)
-            idx_next  = torch.multinomial(probs, num_samples=1)
-            idx       = torch.cat((idx, idx_next), dim=1)
+            logits = logits[:, -1, :]
+            probs = F.softmax(logits, dim=-1)
+            idx_next = torch.multinomial(probs, num_samples=1)
+            idx = torch.cat((idx, idx_next), dim=1)
         return idx
 
 
-
 #  INITIALISE
-
-model     = GPTLanguageModel().to(device)
-n_params  = sum(p.numel() for p in model.parameters())
+model = GPTLanguageModel().to(device)
+n_params = sum(p.numel() for p in model.parameters())
 optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
 
-header("CONFIG")
-row("Seed", seed)
+# ── num_activations: real count via forward hooks on a dummy batch ───────────
+
+
+def count_activations(m, bs, seq_len, dev):
+    total = 0
+    hooks = []
+
+    def _hook(module, inp, out):
+        nonlocal total
+        if isinstance(out, torch.Tensor):
+            total += out.numel()
+    for mod in m.modules():
+        hooks.append(mod.register_forward_hook(_hook))
+    dummy = torch.zeros(bs, seq_len, dtype=torch.long, device=dev)
+    with torch.no_grad():
+        m(dummy)
+    for h in hooks:
+        h.remove()
+    return total
+
+
+_num_activations = count_activations(model, batch_size, block_size, device)
+
+# ── dataset batch counts ─────────────────────────────────────────────────────
+_train_batches = len(train_data) // (batch_size * block_size)
+_val_batches = len(val_data) // (batch_size * block_size)
+
 row("Batch size", batch_size)
 row("Block size", block_size)
-row("Learning rate", learning_rate)
-row("Layers", n_layer)
-row("Heads", n_head)
-row("Embedding dim", n_embd)
-row("Dropout", dropout)
 row("Parameters", f"{n_params:,}")
-row("Train tokens", f"{len(train_data):,}")
-row("Val tokens", f"{len(val_data):,}")
-row("Data file", str(cleaned_path))
-
+blank()
 
-#training 
-header("TRAINING", f"{max_iters:,} steps | eval every {eval_interval} | checkpoint on improvement")
+# training
+header("TRAINING",
+       f"{max_iters:,} steps | eval every {eval_interval} | checkpoint on improvement")
 blank()
 
 best_val_loss = float('inf')
-train_start   = time.time()
-prev_loss = None
+train_start = time.time()
 
 for iter in range(max_iters):
 
+    # ── val + checkpoint at eval_interval ────────────────────────────────────
     if iter % eval_interval == 0 or iter == max_iters - 1:
-        losses      = estimate_loss()
-        elapsed     = time.time() - train_start
-        
-        # Calculate gradient norm
-        total_norm = 0
-        for p in model.parameters():
-            if p.grad is not None:
-                param_norm = p.grad.detach().data.norm(2)
-                total_norm += param_norm.item() ** 2
-        total_norm = total_norm ** 0.5
-        
-        # Calculate loss change
-        loss_change = ""
-        norm_change = ""
-        if prev_loss is not None:
-            delta = losses['train'].item() - prev_loss
-            loss_change = f"({delta:+.2f}z)"
-            norm_change = f"({delta:+.2f}z)"  # Using same delta for norm as placeholder
-        prev_loss = losses['train'].item()
-        
-        # Calculate tokens per second
-        tokens_per_sec = (iter + 1) * batch_size * block_size / elapsed if elapsed > 0 else 0
-        
-        # MFU calculation (simplified)
-        mfu_pct = 60.0  # Placeholder
-        
+        losses = estimate_loss()
         is_best = losses['val'] < best_val_loss
-
         if is_best:
             best_val_loss = losses['val']
             torch.save(model.state_dict(), 'best_model.pt')
-            # log(f"checkpoint saved: path=best_model.pt val_loss={best_val_loss:.4f} step={iter}")
-
-        log_line = (
-            f"step {iter:>4}/{max_iters:<5} | "
-            f"loss {losses['train']:.6f} {loss_change:<8} | "
-            f"norm {total_norm:.4f} {norm_change:<8} | "
-            f"lr {learning_rate:.2e} | "
-            f"{elapsed*1000:.2f} ms | "
-            f"{mfu_pct:.1f}% bf16 MFU | "
-            f"{int(tokens_per_sec)} tok/s"
-        )
-        log(log_line)
+        log(f"  val loss {losses['val']:.6f}")
         sys.stdout.flush()
 
-    xb, yb       = get_batch('train')
+    # ── forward + backward ───────────────────────────────────────────────────
+    step_start = time.time()
+
+    xb, yb = get_batch('train')
     logits, loss = model(xb, yb)
     optimizer.zero_grad(set_to_none=True)
     loss.backward()
+
+    # real grad norm (after backward, before step)
+    grad_norm = 0.0
+    for p in model.parameters():
+        if p.grad is not None:
+            grad_norm += p.grad.detach().data.norm(2).item() ** 2
+    grad_norm = grad_norm ** 0.5
+
     optimizer.step()
 
+    # real per-step dt and tok/sec
+    step_dt_ms = (time.time() - step_start) * 1000
+    tok_per_sec = (batch_size * block_size) / (step_dt_ms / 1000.0)
+
+    # real current lr from optimizer state
+    cur_lr = optimizer.param_groups[0]['lr']
+
+    line = (f"step {iter} | loss: {loss.item():.6f} | lr {cur_lr:.4e} | norm: {grad_norm:.4f} | dt: {step_dt_ms:.2f}ms | tok/sec: {tok_per_sec:.2f}")
+    print(line)
+    with open(LOG_PATH, "a", encoding="utf-8") as f:
+        f.write(line + "\n")
+
+    # ── sample 100 tokens every 50 steps ─────────────────────────────────────
+    if (iter + 1) % 50 == 0:
+        model.eval()
+        context = torch.zeros((1, 1), dtype=torch.long, device=device)
+        with torch.no_grad():
+            sample_ids = model.generate(context, max_new_tokens=100)
+        sample_text = decode(sample_ids[0].tolist(), tokenizer)
+        model.train()
+        blank()
+        log(f"  [sample @ step {iter + 1}]")
+        log(f"  {'-' * 60}")
+        log(f"  {sample_text.strip()}")
+        log(f"  {'-' * 60}")
+        blank()
+
 total_time = time.time() - train_start
 blank()
 rule()
@@ -332,10 +354,10 @@ def generate(self, idx, max_new_tokens):
 rule()
 
 
-
 #  RESTORE CHECKPOINT
 blank()
-model.load_state_dict(torch.load('best_model.pt', map_location=device, weights_only=True))
+model.load_state_dict(torch.load(
+    'best_model.pt', map_location=device, weights_only=True))
 model.eval()
 success(f"Restored best_model.pt | val loss {best_val_loss:.4f}")
 
@@ -359,13 +381,14 @@ def generate(self, idx, max_new_tokens):
             continue
 
         encoded_prompt = encode(prompt, tokenizer)
-        context        = torch.tensor([encoded_prompt], dtype=torch.long, device=device)
+        context = torch.tensor(
+            [encoded_prompt], dtype=torch.long, device=device)
 
         with torch.no_grad():
             output_ids = model.generate(context, max_new_tokens=200)
 
         new_tokens = output_ids[0][len(encoded_prompt):].tolist()
-        response   = decode(new_tokens, tokenizer).strip()
+        response = decode(new_tokens, tokenizer).strip()
 
         blank()
         log(f"  Model {ARROW} {response}")
@@ -376,13 +399,14 @@ def generate(self, idx, max_new_tokens):
     success("Interrupted.")
 
 
-end        = time.time()
+end = time.time()
 wall_clock = end - start
 
 blank()
 rule()
 row("Training",     f"{int(total_time // 60)}m {int(total_time % 60):02d}s")
-row("Total",        f"{int(wall_clock // 60)}m {int(wall_clock % 60):02d}s", "", TICK)
+row("Total",
+    f"{int(wall_clock // 60)}m {int(wall_clock % 60):02d}s", "", TICK)
 rule()
 blank()
 log(f"{DOUBLE}\n")
diff --git a/frontend/package.json b/frontend/package.json
index 0b8dacb..176aed6 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -4,8 +4,8 @@
   "private": true,
   "type": "module",
   "scripts": {
-    "dev": "vite --host 0.0.0.0",
-    "build": "tsc && vite build",
+    "dev": "vite --host 0.0.0.0 --configLoader runner",
+    "build": "tsc && vite build --configLoader runner",
     "preview": "vite preview"
   },
   "dependencies": {
@@ -26,4 +26,4 @@
     "typescript": "^5.7.2",
     "vite": "^6.0.7"
   }
-}
\ No newline at end of file
+}
diff --git a/frontend/src/components/layout/Topbar.tsx b/frontend/src/components/layout/Topbar.tsx
index a6ed610..59e42d2 100644
--- a/frontend/src/components/layout/Topbar.tsx
+++ b/frontend/src/components/layout/Topbar.tsx
@@ -19,7 +19,7 @@ export function Topbar() {
   const tooltip =
     modelBackend === "cpp"
       ? "C++ server - run: ./Quadtrix.exe --server --port 8080"
-      : "PyTorch checkpoint - engine/best_model .pt";
+      : "PyTorch checkpoint - engine/best_model.pt";
   return (
     <header className="border-b border-[var(--border-subtle)] bg-base/95 px-4 py-3 backdrop-blur md:px-6">
       <div className="mx-auto flex max-w-6xl items-center justify-between gap-4">
diff --git a/frontend/src/store/settingsStore.ts b/frontend/src/store/settingsStore.ts
index ecbb628..6c49eb2 100644
--- a/frontend/src/store/settingsStore.ts
+++ b/frontend/src/store/settingsStore.ts
@@ -28,8 +28,8 @@ export const useSettingsStore = create<SettingsState>()(
       apiBaseUrl: defaultApiBaseUrl,
       maxTokens: 200,
       temperature: 1.0,
-      modelLabel: "quadtrix-v1.0 - cpu",
-      modelBackend: "cpp",
+      modelLabel: "quadtrix-v1.0 - PyTorch",
+      modelBackend: "torch",
       settingsOpen: false,
       statsOpen: false,
       setApiBaseUrl: (value) => set({ apiBaseUrl: value }),
@@ -40,6 +40,14 @@ export const useSettingsStore = create<SettingsState>()(
       setSettingsOpen: (value) => set({ settingsOpen: value }),
       setStatsOpen: (value) => set({ statsOpen: value }),
     }),
-    { name: "quadtrix-settings" },
+    {
+      name: "quadtrix-settings",
+      version: 1,
+      migrate: (persistedState) => ({
+        ...(persistedState as Partial<SettingsState>),
+        modelLabel: "quadtrix-v1.0 - PyTorch",
+        modelBackend: "torch",
+      }),
+    },
   ),
 );
diff --git a/model/export_tokenizer.py b/model/export_tokenizer.py
deleted file mode 100644
index 206359f..0000000
--- a/model/export_tokenizer.py
+++ /dev/null
@@ -1,116 +0,0 @@
-"""
-Export tiktoken GPT-2 vocabulary for C++ inference
-
-This script creates vocabulary files that can be used with C++ tokenizers
-like sentencepiece or BPE implementations.
-"""
-
-import tiktoken
-import json
-
-print("-"*60)
-print("             Tokenizer Vocabulary Exporter for C++ Inference              ")
-print("-"*60)
-print()
-
-# Initialize GPT-2 tokenizer
-enc = tiktoken.get_encoding("gpt2")
-vocab_size = enc.n_vocab
-
-print(f"Tokenizer: GPT-2 (tiktoken)")
-print(f"Vocabulary size: {vocab_size:,}")
-print()
-#  Export 1: Token to ID mapping (JSON)
-
-
-print("  → Exporting token-to-id mapping...")
-
-# Get the encoder's vocabulary
-# tiktoken doesn't expose the full vocabulary directly, so we reconstruct it
-vocab_dict = {}
-
-# Try to decode each token ID to get its string representation
-for token_id in range(vocab_size):
-    try:
-        token_bytes = enc.decode_single_token_bytes(token_id)
-        # Store as both hex and string representation
-        vocab_dict[token_id] = {
-            "hex": token_bytes.hex(),
-            "str": token_bytes.decode('utf-8', errors='replace')
-        }
-    except Exception as e:
-        vocab_dict[token_id] = {"hex": "", "str": f"<ERROR_{token_id}>"}
-
-with open('gpt2_vocab.json', 'w', encoding='utf-8') as f:
-    json.dump(vocab_dict, f, indent=2, ensure_ascii=False)
-
-print(f"Saved: gpt2_vocab.json ({len(vocab_dict)} tokens)")
-
-
-#  Export 2: Merges file (for BPE)
-
-print("Exporting BPE merges...")
-
-# tiktoken uses a different approach than traditional BPE, but we can try
-# to approximate by analyzing common patterns
-try:
-    # This is a simplified version - for production use, you'd need
-    # to extract the actual BPE merge rules from tiktoken
-    
-    # For now, just create a placeholder
-    with open('gpt2_merges.txt', 'w', encoding='utf-8') as f:
-        f.write("# GPT-2 BPE Merges (approximation)\n")
-        f.write("# For full compatibility, use tiktoken's actual merge rules\n")
-    
-    print(f"  ⚠ Saved: gpt2_merges.txt (placeholder)")
-    print(f"     Note: Full BPE merge extraction requires tiktoken internals")
-except Exception as e:
-    print(f"  ✗ Could not export merges: {e}")
-
-#  Export 3: Simple byte-pair mapping
-
-print("  Creating token lookup table...")
-
-token_strings = []
-for token_id in range(vocab_size):
-    try:
-        token_bytes = enc.decode_single_token_bytes(token_id)
-        token_strings.append(token_bytes.decode('utf-8', errors='replace'))
-    except:
-        token_strings.append(f"<UNK_{token_id}>")
-
-with open('gpt2_tokens.txt', 'w', encoding='utf-8') as f:
-    for token in token_strings:
-        # Escape special characters
-        escaped = token.replace('\\', '\\\\').replace('\n', '\\n').replace('\t', '\\t')
-        f.write(f"{escaped}\n")
-
-print(f" Saved: gpt2_tokens.txt ({len(token_strings)} tokens)")
-
-
-#  Example encoding/decoding
-
-print()
-print("  Testing tokenization...")
-test_text = "Hello, how are you?"
-test_tokens = enc.encode(test_text)
-decoded = enc.decode(test_tokens)
-
-print(f"     Text:    '{test_text}'")
-print(f"     Tokens:  {test_tokens}")
-print(f"     Decoded: '{decoded}'")
-
-
-#  Summary
-print()
-print("Export Complete")
-print()
-print("Files created:")
-print("  gpt2_vocab.json   - Full vocabulary with hex representations")
-print(" gpt2_tokens.txt   - Simple token list (one per line)")
-print("gpt2_merges.txt   - BPE merges (placeholder)")
-print()
-print("for C++ integration:")
-print("Use sentencepiece with gpt2_vocab.json")
-print("Or implement a custom tokenizer using gpt2_tokens.txt")
-print("Or use a library like tokenizers (HuggingFace) with C++ bindings")
diff --git a/src/torch_example.cpp b/model/src/torch_example.cpp
similarity index 100%
rename from src/torch_example.cpp
rename to model/src/torch_example.cpp
diff --git a/src/torch_main.cpp b/model/src/torch_main.cpp
similarity index 100%
rename from src/torch_main.cpp
rename to model/src/torch_main.cpp
diff --git a/package-lock.json b/package-lock.json
deleted file mode 100644
index 28be669..0000000
--- a/package-lock.json
+++ /dev/null
@@ -1,19 +0,0 @@
-{
-  "name": "quadtrix",
-  "version": "1.0.0",
-  "lockfileVersion": 3,
-  "requires": true,
-  "packages": {
-    "": {
-      "name": "quadtrix",
-      "version": "1.0.0",
-      "license": "MIT",
-      "bin": {
-        "quadtrix": "bin/quadtrix.js"
-      },
-      "engines": {
-        "node": ">=18"
-      }
-    }
-  }
-}
diff --git a/package.json b/package.json
deleted file mode 100644
index a4a24c5..0000000
--- a/package.json
+++ /dev/null
@@ -1,54 +0,0 @@
-{
-  "name": "quadtrix",
-  "version": "1.0.2",
-  "description": "CLI for running Quadtrix.cpp chat and local training.",
-  "license": "MIT",
-  "author": "Eamon",
-  "repository": {
-    "type": "git",
-    "url": "https://github.com/Eamon2009/Quadtrix.cpp.git"
-  },
-  "bugs": {
-    "url": "https://github.com/Eamon2009/Quadtrix.cpp/issues"
-  },
-  "homepage": "https://github.com/Eamon2009/Quadtrix.cpp#readme",
-  "bin": {
-    "quadtrix": "bin/quadtrix.js"
-  },
-  "files": [
-    "bin/",
-    "backend/**/*.py",
-    "backend/requirements.txt",
-    "backend/README.md",
-    "backend/.env.example",
-    "config/",
-    "data/data_set.py",
-    "engine/*.py",
-    "engine/engine.c",
-    "frontend/dist/",
-    "iGPU/*.py",
-    "include/",
-    "src/",
-    "main.cpp",
-    "LICENSE",
-    "README.md",
-    "PUBLISHING.md",
-    "run.md"
-  ],
-  "scripts": {
-    "build:frontend": "npm --prefix frontend run build",
-    "prepack": "npm run build:frontend"
-  },
-  "keywords": [
-    "quadtrix",
-    "llm",
-    "transformer",
-    "chat",
-    "machine-learning",
-    "cpp",
-    "pytorch"
-  ],
-  "engines": {
-    "node": ">=18"
-  }
-}
\ No newline at end of file
diff --git a/supervisord.conf b/supervisord.conf
new file mode 100644
index 0000000..3e13710
--- /dev/null
+++ b/supervisord.conf
@@ -0,0 +1,36 @@
+; Runs the FastAPI backend and the static frontend server under one
+; foreground supervisord process.
+[supervisord]
+; Foreground mode: supervisord does not daemonize.
+nodaemon=true
+logfile=/var/log/supervisor/supervisord.log
+pidfile=/var/run/supervisord.pid
+loglevel=info
+
+; Backend API: uvicorn serving main:app from /app/backend on port 3001.
+; TORCH_CHECKPOINT_PATH, GPT_MODEL_PATH, CORS_ORIGINS, LOG_LEVEL,
+; MAX_SESSIONS and SESSION_TTL_HOURS are deliberately not re-declared in an
+; environment= entry: child processes inherit supervisord's own environment,
+; and an ENV_-style expansion makes supervisord refuse to start whenever the
+; referenced variable is unset.
+[program:fastapi]
+command=/venv/bin/uvicorn main:app --host 0.0.0.0 --port 3001
+directory=/app/backend
+autostart=true
+autorestart=true
+startretries=5
+stdout_logfile=/var/log/supervisor/fastapi.stdout.log
+stderr_logfile=/var/log/supervisor/fastapi.stderr.log
+stdout_logfile_maxbytes=10MB
+stderr_logfile_maxbytes=10MB
+
+; Frontend: `serve` hosts the built bundle from /app/frontend/dist on port 8080.
+[program:frontend]
+command=serve -s /app/frontend/dist -l 8080
+autostart=true
+autorestart=true
+startretries=5
+stdout_logfile=/var/log/supervisor/frontend.stdout.log
+stderr_logfile=/var/log/supervisor/frontend.stderr.log
+stdout_logfile_maxbytes=5MB
+stderr_logfile_maxbytes=5MB