Page Capture

Enable visual understanding with page screenshots and DOM extraction.

Allow the Agent to "see" the current page by capturing page state, including Markdown content, screenshots, and actionable elements.

Overview

Page capture extracts the following page state:

Markdown: Page text content in Markdown format
Screenshot: Visual representation of the current viewport (base64-encoded JPEG data URL)
Actionable elements: Structured list of all interactive elements on the page

Install Dependency

Terminal

bun add html2canvas

Full Source Code

Save the following code as src/lib/page-capture.ts:

src/lib/page-capture.ts

/**
 * Page Capture Utilities
 * Captures page information for Lens OS Agent context
 */

import type { PageState, ActionableElement } from "@lens-os/sdk";

// ==================== DOM to Markdown ====================

/**
 * Converts the page's main content into Markdown-like text for the Agent.
 *
 * The conversion root is the first match of `<main>`, `<article>` or
 * `[role="main"]`, falling back to `document.body`.
 *
 * @param maxLength - Maximum number of UTF-16 code units kept before the
 *   truncation marker is appended.
 * @returns The trimmed text, or `""` when there is no DOM (e.g. during SSR)
 *   or no root element.
 */
export function extractPageMarkdown(maxLength: number = 8000): string {
  if (typeof document === "undefined") return "";

  const lines: string[] = [];

  const mainContent =
    document.querySelector("main") ||
    document.querySelector("article") ||
    document.querySelector('[role="main"]') ||
    document.body;

  if (!mainContent) return "";

  processNode(mainContent, lines);

  let result = lines.join("\n").trim();
  if (result.length > maxLength) {
    let cut = Math.max(0, Math.floor(maxLength));
    // Never cut between the two halves of a surrogate pair: a lone high
    // surrogate is malformed Unicode and can break serialization downstream.
    if (cut > 0) {
      const lastUnit = result.charCodeAt(cut - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1;
    }
    result = result.substring(0, cut) + "\n\n[Content truncated...]";
  }

  return result;
}

/** Tags whose subtree never contributes readable text to the Markdown output. */
const MARKDOWN_SKIP_TAGS: ReadonlySet<string> = new Set([
  "SCRIPT", "STYLE", "SVG", "NOSCRIPT", "IFRAME", "CANVAS", "VIDEO", "AUDIO",
]);

/** Returns the element's text with every whitespace run collapsed to a single space. */
function toInlineText(el: Element): string {
  return (el.textContent ?? "").replace(/\s+/g, " ").trim();
}

/**
 * Recursively appends a Markdown rendering of `node` to `lines`.
 *
 * Text nodes are pushed trimmed. Headings, paragraphs, links, images, lists,
 * tables, buttons and inputs are rendered as a unit (their children are not
 * visited again); any other HTML element is traversed child by child.
 * Elements hidden via `display: none` / `visibility: hidden`, tags listed in
 * `MARKDOWN_SKIP_TAGS`, and nodes that are neither text nor `HTMLElement`
 * produce no output.
 *
 * Constructs that must stay on one Markdown line (headings, link labels, list
 * items, table cells, button labels) have their whitespace collapsed, and `|`
 * inside table cells is escaped so it cannot split a column.
 *
 * @param node - Node to render.
 * @param lines - Output buffer; mutated in place.
 * @param depth - Current recursion depth; only forwarded to child calls.
 */
function processNode(node: Node, lines: string[], depth: number = 0): void {
  if (node.nodeType === Node.TEXT_NODE) {
    const text = node.textContent?.trim();
    if (text) lines.push(text);
    return;
  }

  if (!(node instanceof HTMLElement)) return;

  const tag = node.tagName;
  if (MARKDOWN_SKIP_TAGS.has(tag)) return;

  const style = window.getComputedStyle(node);
  if (style.display === "none" || style.visibility === "hidden") return;

  const heading = /^H([1-6])$/.exec(tag);
  if (heading) {
    const text = toInlineText(node);
    if (text) lines.push("", `${"#".repeat(Number(heading[1]))} ${text}`, "");
    return;
  }

  if (tag === "P") {
    const text = node.textContent?.trim();
    if (text) lines.push("", text, "");
    return;
  }

  if (tag === "A") {
    const href = node.getAttribute("href");
    const text = toInlineText(node);
    if (text && href) lines.push(`[${text}](${href})`);
    else if (text) lines.push(text);
    return;
  }

  if (tag === "IMG") {
    const alt = node.getAttribute("alt");
    const src = node.getAttribute("src");
    if (alt || src) lines.push(`![${alt || "image"}](${src || ""})`);
    return;
  }

  if (tag === "UL" || tag === "OL") {
    lines.push("");
    node.querySelectorAll(":scope > li").forEach((li, index) => {
      const text = toInlineText(li);
      if (text) lines.push(`${tag === "OL" ? `${index + 1}.` : "-"} ${text}`);
    });
    lines.push("");
    return;
  }

  if (tag === "TABLE") {
    lines.push("", "[Table]");
    node.querySelectorAll("tr").forEach((row, rowIndex) => {
      const cells = Array.from(row.querySelectorAll("th, td"), (cell) =>
        // A literal pipe would otherwise be read as a column separator.
        toInlineText(cell).replace(/\|/g, "\\|")
      );
      if (cells.some((cellText) => cellText)) {
        lines.push(`| ${cells.join(" | ")} |`);
        // A header row needs the delimiter row right after it.
        if (rowIndex === 0 && row.querySelector("th"))
          lines.push(`| ${cells.map(() => "---").join(" | ")} |`);
      }
    });
    lines.push("");
    return;
  }

  if (tag === "BUTTON") {
    const text = toInlineText(node);
    if (text) lines.push(`[Button: ${text}]`);
    return;
  }

  if (tag === "INPUT") {
    const type = node.getAttribute("type") || "text";
    const label =
      node.getAttribute("aria-label") ||
      node.getAttribute("placeholder") ||
      node.getAttribute("name");
    lines.push(`[Input ${type}: ${label || "unnamed"}]`);
    return;
  }

  // Generic container: render each child in document order.
  for (const child of Array.from(node.childNodes)) {
    processNode(child, lines, depth + 1);
  }
}

// ==================== Screenshot Capture ====================

/**
 * Renders the currently visible viewport to a half-scale JPEG data URL.
 *
 * html2canvas is imported on demand so it is only loaded when a screenshot is
 * actually requested.
 *
 * @returns A `data:image/jpeg;base64,...` URL, or `""` when there is no
 *   `window` (e.g. during SSR) or the capture fails. Failures are logged, never
 *   thrown.
 */
export async function captureScreenshot(): Promise<string> {
  if (typeof window === "undefined") return "";

  try {
    const html2canvas = (await import("html2canvas")).default;

    const canvas = await html2canvas(document.body, {
      logging: false,
      useCORS: true,
      // Must stay false: a tainted canvas makes toDataURL() throw a
      // SecurityError, which would turn every page containing a non-CORS
      // cross-origin image into an empty screenshot.
      allowTaint: false,
      scale: 0.5,
      // Crop at the current scroll offset so the visible viewport is captured
      // rather than the top-left corner of the document.
      x: window.scrollX,
      y: window.scrollY,
      width: window.innerWidth,
      height: window.innerHeight,
      windowWidth: window.innerWidth,
      windowHeight: window.innerHeight,
    });

    return canvas.toDataURL("image/jpeg", 0.7);
  } catch (error) {
    console.error("[PageCapture] Screenshot failed:", error);
    return "";
  }
}

// ==================== Capture Page State ====================

/**
 * Collects everything the Agent needs to know about the current page.
 *
 * The Markdown text and the actionable-element list are computed
 * synchronously; the screenshot (when requested) is the only awaited step.
 *
 * @param options - `includeScreenshot` (default `true`), `maxMarkdownLength`
 *   (default `8000`) and `maxElements` (default `50`).
 * @returns The page state with URL, title, Markdown, screenshot (`""` when
 *   skipped), actionable elements and a capture timestamp.
 */
export async function capturePageState(
  options: {
    includeScreenshot?: boolean;
    maxMarkdownLength?: number;
    maxElements?: number;
  } = {}
): Promise<PageState> {
  const { includeScreenshot = true, maxMarkdownLength = 8000, maxElements = 50 } = options;

  // Same evaluation order as a Promise.all argument list: Markdown first, then
  // the screenshot is started, then the element scan runs while it is pending.
  const markdown = extractPageMarkdown(maxMarkdownLength);
  const pendingScreenshot: Promise<string> = includeScreenshot
    ? captureScreenshot()
    : Promise.resolve("");
  const actionableElements = extractActionableElements(maxElements);
  const screenshot = await pendingScreenshot;

  const hasWindow = typeof window !== "undefined";
  const hasDocument = typeof document !== "undefined";

  return {
    url: hasWindow ? window.location.href : "",
    title: hasDocument ? document.title : "",
    markdown,
    screenshot,
    actionableElements,
    timestamp: new Date(),
  };
}

// ==================== Actionable Elements ====================

/**
 * Lists the interactive elements currently rendered on the page.
 *
 * Candidates are matched by tag, ARIA role, `onclick` and `tabindex="0"`.
 * Elements that are hidden, have a zero-sized box, or whose generated selector
 * was already emitted are skipped.
 *
 * @param maxElements - Upper bound on the number of returned entries.
 * @returns Entries in document order with sequential ids `el-0`, `el-1`, …;
 *   an empty array when there is no DOM (e.g. during SSR).
 */
export function extractActionableElements(maxElements: number = 50): ActionableElement[] {
  if (typeof document === "undefined") return [];

  const query = [
    "button:not([disabled])",
    "a[href]",
    'input:not([type="hidden"]):not([disabled])',
    "select:not([disabled])",
    "textarea:not([disabled])",
    '[role="button"]:not([disabled])',
    '[role="link"]',
    '[role="menuitem"]',
    "[onclick]",
    '[tabindex="0"]',
  ].join(", ");

  // Style is inspected first; layout is only measured for visible elements.
  const isRendered = (candidate: HTMLElement): boolean => {
    const { display, visibility } = window.getComputedStyle(candidate);
    if (display === "none" || visibility === "hidden") return false;
    const box = candidate.getBoundingClientRect();
    return !(box.width === 0 || box.height === 0);
  };

  const collected: ActionableElement[] = [];
  const emittedSelectors = new Set<string>();

  for (const candidate of Array.from(document.querySelectorAll<HTMLElement>(query))) {
    if (collected.length >= maxElements) break;
    if (!isRendered(candidate)) continue;

    const selector = generateSelector(candidate);
    if (emittedSelectors.has(selector)) continue;
    emittedSelectors.add(selector);

    const type = getElementType(candidate);
    const label = getElementText(candidate);

    collected.push({
      id: `el-${collected.length}`,
      type,
      selector,
      text: label || undefined,
      placeholder: candidate.getAttribute("placeholder") || undefined,
      description: generateDescription(candidate, type, label),
    });
  }

  return collected;
}

/**
 * Classifies an element for the Agent.
 *
 * A button tag or role wins over a link tag or role; form controls map to
 * their own tag name; anything else is treated as a button.
 */
function getElementType(el: HTMLElement): ActionableElement["type"] {
  const tagName = el.tagName.toLowerCase();
  const role = el.getAttribute("role");

  if (tagName === "button" || role === "button") return "button";
  if (tagName === "a" || role === "link") return "link";

  switch (tagName) {
    case "input":
      return "input";
    case "select":
      return "select";
    case "textarea":
      return "textarea";
    default:
      return "button";
  }
}

/**
 * Picks a short human-readable label for an element.
 *
 * Priority: `aria-label`, then `title`, then text content, then (for inputs)
 * the current value or placeholder. Whitespace-only attributes are ignored and
 * every candidate is capped at 100 characters plus `"..."`.
 *
 * The value of a password input is never returned, so typed secrets cannot end
 * up in the Agent context; only its placeholder is used.
 *
 * @returns The label, or `""` when nothing suitable exists.
 */
function getElementText(el: HTMLElement): string {
  const clip = (value: string): string =>
    value.length <= 100 ? value : value.substring(0, 100) + "...";

  const ariaLabel = el.getAttribute("aria-label")?.trim();
  if (ariaLabel) return ariaLabel;

  const title = el.getAttribute("title")?.trim();
  if (title) return title;

  const text = el.textContent?.trim() || "";
  if (text) return clip(text);

  if (el instanceof HTMLInputElement) {
    if (el.type === "password") return el.placeholder || "";
    return clip(el.value || el.placeholder || "");
  }

  return "";
}

/**
 * Builds a one-line, human-readable description of an actionable element.
 *
 * Link targets are shortened to 50 characters plus `"..."` whether or not the
 * link has visible text, so a text-less link with a very long href (for
 * example a `data:` URL) cannot bloat the description.
 *
 * @param el - Element being described; only its attributes and tag are read.
 * @param type - Classification of the element.
 * @param text - Label previously extracted for the element (may be empty).
 */
function generateDescription(el: HTMLElement, type: ActionableElement["type"], text: string): string {
  const nameOrId = (): string =>
    el.getAttribute("name") || el.getAttribute("id") || "unnamed";

  switch (type) {
    case "button":
      return text ? `Button: "${text}"` : `Button (${el.tagName.toLowerCase()})`;
    case "link": {
      const href = el.getAttribute("href") || "";
      const shortHref = href.length > 50 ? href.slice(0, 50) + "..." : href;
      return text ? `Link: "${text}" → ${shortHref}` : `Link → ${shortHref}`;
    }
    case "input": {
      const inputType = el.getAttribute("type") || "text";
      const name =
        el.getAttribute("name") ||
        el.getAttribute("id") ||
        el.getAttribute("placeholder") ||
        "unnamed";
      return `Input (${inputType}): ${name}`;
    }
    case "select":
      return `Dropdown: ${nameOrId()}`;
    case "textarea":
      return `Text area: ${nameOrId()}`;
    default:
      return text || "Interactive element";
  }
}

/**
 * Builds a CSS selector that can be used to find `el` again.
 *
 * Preference order:
 * 1. `#id` when the element has an id.
 * 2. `[data-testid="…"]` or `[data-cy="…"]`, using whichever attribute the
 *    element actually carries (`data-testid` wins when both are present).
 * 3. A `>`-joined path of at most three levels (element, parent, grandparent)
 *    built from tag names, up to two stable class names per level, and
 *    `:nth-of-type()` where same-tag siblings exist.
 *
 * The path form is not guaranteed to be unique within the document.
 */
function generateSelector(el: HTMLElement): string {
  if (el.id) return `#${CSS.escape(el.id)}`;

  // The selector must name the attribute that is really on the element;
  // otherwise it would match nothing.
  for (const attribute of ["data-testid", "data-cy"]) {
    const value = el.getAttribute(attribute);
    if (value) return `[${attribute}="${CSS.escape(value)}"]`;
  }

  const parts: string[] = [];
  let current: HTMLElement | null = el;

  for (let depth = 0; current && depth < 3; depth++) {
    const node: HTMLElement = current;

    // Skip generated (css-*), BEM-modifier (--) and very long class names,
    // which tend to be unstable between renders.
    const classes = Array.from(node.classList)
      .filter((c) => !c.startsWith("css-") && !c.includes("--") && c.length < 30)
      .slice(0, 2);

    let part = node.tagName.toLowerCase();
    if (classes.length > 0) part += "." + classes.map((c) => CSS.escape(c)).join(".");

    const parent = node.parentElement;
    if (parent) {
      const sameTagSiblings = Array.from(parent.children).filter((s) => s.tagName === node.tagName);
      if (sameTagSiblings.length > 1) part += `:nth-of-type(${sameTagSiblings.indexOf(node) + 1})`;
    }

    parts.unshift(part);
    current = parent;
  }

  return parts.join(" > ");
}

Section Breakdown

1. extractPageMarkdown — DOM to Markdown

Extracts page text from the DOM and converts it to Markdown format so the LLM can better understand page structure. Prioritizes semantic elements like main, article, and [role="main"], recursively traverses child nodes, converts HTML tags to corresponding Markdown syntax (headings, paragraphs, links, lists, tables), and skips non-content nodes like SCRIPT, STYLE, and hidden elements.

2. captureScreenshot — Visual Screenshot

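Dynamically imports html2canvas (so the library is loaded lazily, only when a screenshot is actually requested) to capture the current viewport as a base64 JPEG data URL. Uses scale: 0.5 and a JPEG quality of 0.7 to reduce image size, lowering transfer time and token cost. If the capture fails, the error is logged and an empty string is returned.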

3. capturePageState — Main Entry Point

Runs Markdown extraction, screenshot capture, and actionable element extraction in a single call (the two extractions are synchronous; only the screenshot is awaited), then returns a complete PageState object including URL, title, and timestamp for the Agent.

4. extractActionableElements — Interactive Element List

Queries all interactive elements on the page (buttons, links, inputs, dropdowns, etc.), filters out hidden or zero-size elements, and generates a CSS selector and description for each element — giving the Agent a clear map of what UI can be interacted with. Selectors are not guaranteed to be unique: an element whose selector duplicates an earlier one is skipped.

Integrating into the Chat Component

In handleSend, call capturePageState before sending the message and pass the result via the pageState parameter to sendMessage:

Integration example

// handleSend: optionally snapshot the page, then send the message with it.
const handleSend = async () => {
  let snapshot: PageState | undefined;

  if (captureEnabled) {
    setIsCapturing(true);
    try {
      snapshot = await capturePageState({
        includeScreenshot: true,
        maxMarkdownLength: 6000,
        maxElements: 30,
      });
    } catch (err) {
      // A failed capture must not block sending; the message goes out without page state.
      console.error("[PageCapture] Capture failed:", err);
    } finally {
      setIsCapturing(false);
    }
  }

  const currentUrl = typeof window === "undefined" ? "" : window.location.href;

  await sendMessage(message, {
    currentUrl,
    pageState: snapshot,
  });
};

Performance Tips

Tip

Reduce screenshot resolution: Use scale: 0.5 to shrink image size, reducing transfer time and token cost.

Tip

Limit Markdown length: Set maxMarkdownLength: 6000 to avoid overly long content degrading LLM performance.

Tip

Limit element count: Set maxElements: 30 to capture only the most important interactive elements.