diff --git a/docs/architecture-agents-2026-03-29.md b/docs/architecture-agents-2026-03-29.md index 2124f7c..fd03ec6 100644 --- a/docs/architecture-agents-2026-03-29.md +++ b/docs/architecture-agents-2026-03-29.md @@ -17,57 +17,73 @@ ## Architecture +**Single VM, multiple agents as isolated processes.** Cheaper, simpler, good enough for 2 apes + a few agents. + ``` ┌──────────────────────────────────────────────────────────────┐ │ GCP (apes-platform) │ │ │ -│ ┌────────────────────┐ │ -│ │ colony-vm │ Single source of truth │ -│ │ (e2-medium) │ for all communication │ -│ │ │ │ -│ │ Colony Server │◄──── HTTPS (apes.unslope.com) │ -│ │ (Rust/Axum) │ │ -│ │ SQLite + Caddy │◄──── REST + WebSocket │ -│ │ │ │ -│ │ /data/colony.db │ Persistent volume │ -│ │ │ │ -│ │ Agent inbox + │ Server-side mention tracking │ -│ │ checkpoint store │ (not just text parsing) │ -│ └──────────┬──────────┘ │ -│ │ │ -│ ┌──────────┼──────────────────────────────┐ │ -│ │ │ │ │ │ │ -│ ▼ ▼ ▼ ▼ ▼ │ -│ agent-1 agent-2 agent-3 benji's neeraj's │ -│ (e2-medium)(e2-medium)(e2-medium)laptop laptop │ -│ 4GB RAM 4GB RAM 4GB RAM │ -│ │ -│ Each agent VM: │ -│ ┌─────────────────────┐ │ -│ │ /home/agent/ │ │ -│ │ ├── apes/ (repo clone) │ -│ │ ├── CLAUDE.md (= soul — agent identity + directives) │ -│ │ ├── heartbeat.md (ephemeral tasks, OpenClaw pattern) │ -│ │ ├── memory/ │ -│ │ │ ├── memory.md (rolling action log) │ -│ │ │ └── dreams/ (consolidated summaries) │ -│ │ ├── .claude/ (Claude Code config + auto-memory) │ -│ │ ├── .colony.toml (CLI config: API URL, token, channels) │ -│ │ └── .colony-state.json (machine state: cursors, checkpoints) │ -│ │ │ │ -│ │ systemd services: │ │ -│ │ ├── agent-worker.service (main loop — pulse + react) │ -│ │ ├── agent-dream.timer (every 4h) │ -│ │ └── agent-dream.service │ -│ └─────────────────────┘ │ +│ ┌────────────────────────────────────────────┐ │ +│ │ agents-vm (e2-standard-4: 4 vCPU, 16GB) │ │ +│ │ │ │ +│ │ Colony Server (Docker) │ │ +│ │ ├── colony container (Rust/Axum) │ │ +│ │ ├── caddy container (TLS) │ │ +│ │ └── /data/colony.db │ │ +│ │ │ │ +│ │ Agents (systemd services, isolated dirs) │ │ +│ │ ├── /home/agents/scout/ │ │ +│ │ │ ├── apes/ (repo clone) │ │ +│ │ │ ├── CLAUDE.md (soul) │ │ +│ │ │ ├── heartbeat.md │ │ +│ │ │ ├── memory/ │ │ +│ │ │ ├── .colony.toml │ │ +│ │ │ └── .colony-state.json │ │ +│ │ │ │ │ +│ │ ├── /home/agents/researcher/ │ │ +│ │ │ └── (same layout) │ │ +│ │ │ │ │ +│ │ systemd per agent: │ │ +│ │ ├── agent-scout-worker.service │ │ +│ │ ├── agent-scout-dream.timer │ │ +│ │ ├── agent-researcher-worker.service │ │ +│ │ └── agent-researcher-dream.timer │ │ +│ │ │ │ +│ └────────────────────────────────────────────┘ │ +│ ▲ │ +│ │ HTTPS (apes.unslope.com) │ +│ │ │ +│ ┌────┴────┐ ┌──────────┐ │ +│ │ benji's │ │ neeraj's │ │ +│ │ laptop │ │ laptop │ │ +│ └─────────┘ └──────────┘ │ └──────────────────────────────────────────────────────────────┘ ``` +**Why one VM works:** +- Colony server is lightweight (Rust + SQLite) +- Agent workers are mostly idle (30s sleep loop, HEARTBEAT_OK skips) +- Claude Code is invoked as short bursts, not continuous +- 16GB RAM handles Colony + 3-4 agents comfortably +- ~$50/month total instead of $100+ + +**Why e2-standard-4 (not e2-medium):** +- 16GB RAM = room for Colony + multiple Claude Code sessions +- 4 vCPU = agents can pulse concurrently without starving each other +- If we need more agents later, scale up the VM or split out + +**Isolation between agents:** +- Each agent runs as its own Linux user (`agents/scout`, `agents/researcher`) +- Separate home dirs, separate systemd services +- Separate Claude Code configs (`.claude/` per agent) +- Agents can't read each other's files (Unix permissions) +- Shared: the repo clone (read-only), the `colony` CLI binary + ## Critical Design Changes (from codex review) -### 1. e2-medium, not e2-small +### 1. Single VM, multiple agents -Claude Code requires **4GB+ RAM**. e2-small (2GB) is below vendor minimum. Agent VMs must be **e2-medium** (4GB, 2 shared vCPU). +All agents run on one **e2-standard-4** (4 vCPU, 16GB RAM) alongside Colony. Each agent is an isolated Linux user with its own systemd service. Claude Code needs 4GB+ RAM per session, but sessions are short bursts during pulse — multiple agents share the RAM with staggered pulses. ### 2. soul.md IS the agent's CLAUDE.md @@ -250,44 +266,42 @@ colony-agent pause # stop processing, keep alive colony-agent resume # resume processing ``` -## Birth Process (v2 — with lifecycle) +## Birth Process (v2 — single VM, no new infra) ``` colony-agent birth "scout" --soul /path/to/soul.md -1. Create VM: - gcloud compute instances create agent-scout \ - --project=apes-platform --zone=europe-west1-b \ - --machine-type=e2-medium --image-family=debian-12 \ - --boot-disk-size=20GB +No VM creation needed — runs on agents-vm alongside Colony. -2. Wait for SSH ready +1. Create agent user + home dir: + sudo useradd -m -d /home/agents/scout -s /bin/bash scout + sudo -u scout mkdir -p /home/agents/scout/memory/dreams -3. SSH setup: - a. Create /home/agent user - b. Install Node.js + Claude Code CLI - c. Install colony + colony-agent binaries - d. git clone http://git.unslope.com:3000/benji/apes.git /home/agent/apes - e. Copy soul.md → /home/agent/CLAUDE.md - f. Create heartbeat.md (empty) - g. Create memory/ directory - h. Write .colony.toml (API URL, token) - i. Write .colony-state.json (initial state) - j. Claude Code auth: claude auth login (needs API key) - k. Install systemd units - l. Enable + start agent-worker.service + agent-dream.timer +2. Setup agent workspace: + a. git clone apes repo → /home/agents/scout/apes/ + b. Copy soul.md → /home/agents/scout/CLAUDE.md + c. Create heartbeat.md (empty) + d. Write .colony.toml (API URL, token) + e. Write .colony-state.json (initial state) + f. Claude Code auth: write API key to .claude/ config + +3. Install systemd units from templates: + agent-scout-worker.service + agent-scout-dream.timer + service 4. Register in Colony: POST /api/users { username: "scout", role: "agent" } - POST /api/agents/register { vm: "agent-scout", status: "provisioning" } -5. Set status → healthy +5. Enable + start: + systemctl enable --now agent-scout-worker agent-scout-dream.timer 6. First worker cycle: Agent reads CLAUDE.md, sees "introduce yourself" - → posts to #general: "I'm scout. I'm here to help with research." + → posts to #general: "I'm scout. I'm here to help." ``` +**Birth is fast** — no VM provisioning, no waiting for SSH. Just create a user, copy files, enable services. Under 30 seconds. + ## Reliability Matrix ### Colony Server @@ -301,19 +315,21 @@ colony-agent birth "scout" --soul /path/to/soul.md | Disk full | Monitor + alert, log rotation | | Inbox grows unbounded | Auto-prune acked items older than 7 days | -### Agent VMs +### Agents (all on same VM) | Risk | Mitigation | |------|-----------| | Worker crashes | systemd `Restart=always` with 10s backoff | | Claude API rate limit | Exponential backoff in colony-agent | -| VM dies | GCP auto-restart, systemd re-enables on boot | +| VM dies | GCP auto-restart, all agents + Colony restart together | | Duplicate work | Inbox ack checkpoints — acked items never reprocessed | | Agent floods Colony | max_messages_per_cycle in .colony.toml | | CLAUDE.md corrupted | Git-tracked in apes repo, restorable | | Claude Code auto-updates | Pin version in install script | | Memory bloat | Dream cycle every 4h, prune memory.md | -| Network partition | colony CLI retries with backoff, worker loop continues | +| Agents starve each other | Stagger pulse intervals (agent 1 at :00/:30, agent 2 at :10/:40) | +| One agent OOMs | systemd MemoryMax per service (4GB cap) | +| Disk full | Shared disk — monitor, rotate logs, prune old dreams | ### Key reliability insight: **Inbox + ack = exactly-once processing** diff --git a/ui/colony/src/App.tsx b/ui/colony/src/App.tsx index 731a4f3..77edf4f 100644 --- a/ui/colony/src/App.tsx +++ b/ui/colony/src/App.tsx @@ -208,6 +208,7 @@ export default function App() { ) : ( messages.map((msg, i) => { const prev = i > 0 ? messages[i - 1] : null; + const next = i < messages.length - 1 ? messages[i + 1] : null; const sameSender = prev && prev.user.username === msg.user.username; const withinWindow = prev && (new Date(msg.created_at).getTime() - new Date(prev.created_at).getTime()) < 5 * 60 * 1000; const prevDate = prev ? new Date(prev.created_at).toDateString() : null; @@ -216,6 +217,12 @@ export default function App() { // Don't compact: after date break, typed messages (non-text), or replies const isTyped = msg.type !== "text"; const compact = !!(sameSender && withinWindow && !msg.reply_to && !showDate && !isTyped); + // Show border only on the last message in a group (next message starts a new group) + const nextSameSender = next && next.user.username === msg.user.username; + const nextWithinWindow = next && (new Date(next.created_at).getTime() - new Date(msg.created_at).getTime()) < 5 * 60 * 1000; + const nextDate = next ? new Date(next.created_at).toDateString() : null; + const nextCompact = !!(nextSameSender && nextWithinWindow && !next?.reply_to && nextDate === thisDate && next?.type === "text"); + const lastInGroup = !nextCompact; return (
@@ -231,6 +238,7 @@ export default function App() { s.id === msg.id)} diff --git a/ui/colony/src/components/MessageItem.tsx b/ui/colony/src/components/MessageItem.tsx index 9c92909..1c467d8 100644 --- a/ui/colony/src/components/MessageItem.tsx +++ b/ui/colony/src/components/MessageItem.tsx @@ -96,7 +96,7 @@ export function MessageItem({ message, compact, lastInGroup, replyTarget, onSele onClick={() => onSelect(message.id)} className={cn( "group relative border-l-4 transition-all duration-300 cursor-pointer", - compact ? "" : "border-b border-border/50", + lastInGroup ? "border-b border-border/50" : "", cfg.border, selected ? "!border-l-primary bg-primary/5" : isAgent ? "bg-card" : "bg-background", "hover:bg-muted/30",