Source: pipeline/harCapture.js

/**
 * @module pipeline/harCapture
 * @description Captures API traffic during crawl/exploration and produces a
 * structured summary of discovered endpoints for API test generation.
 *
 * Attaches Playwright request/response listeners to a `BrowserContext` and
 * records every same-origin fetch/XHR call. After crawling completes, the
 * captured entries are deduplicated, grouped by endpoint pattern, and
 * summarised into an `ApiEndpoint[]` array that the AI prompt can consume.
 *
 * ### What is captured
 * - Method, URL path, query params, request headers (safe subset)
 * - Request body (raw post data of any content type, truncated to 2,048 characters)
 * - Response status, content-type, body (JSON only, truncated to 2 KB)
 * - Timing (duration ms)
 *
 * ### What is filtered out
 * - Static assets (images, fonts, CSS, JS bundles, sourcemaps)
 * - Third-party origins (analytics, CDNs, ads)
 * - Duplicate endpoint+method combinations (collapsed into one summary that keeps up to two distinct example URLs)
 *
 * ### Exports
 * - {@link createHarCapture} — attach to a BrowserContext, returns collector
 * - {@link summariseApiEndpoints} — deduplicate + group captured entries
 */

// URL patterns that mark a request as a static asset rather than an API call.
// isStaticAsset() reports a URL as static as soon as any one of these matches.
const STATIC_PATTERNS = [
  // Known asset file extension, followed by a query string or the end of the URL
  /\.(js|mjs|css|png|jpe?g|gif|svg|ico|woff2?|ttf|eot|map|webp|avif)(\?|$)/i,
  // Directory segments conventionally used for bundled/static files
  /\/(fonts|images|assets|static|_next\/static|__webpack)\//i,
  // Inline data: URLs
  /^data:/,
  // Any URL containing "/favicon"
  /\/favicon/i,
  // A URL ending in "/manifest.json"
  /\/manifest\.json$/i,
];

// Allow-list of header names that safeHeaders() copies into captured entries.
// Every header not listed here (auth tokens, cookies, ...) is dropped.
// Names are lower-case because safeHeaders() lower-cases each key before the lookup.
const SAFE_HEADERS = new Set([
  "content-type", "accept", "x-requested-with", "origin", "referer",
]);

// Default limit used by truncate() for captured request/response bodies
// (keeps prompt tokens bounded). The limit counts string characters, not
// bytes; the "[truncated …]" marker is appended on top of it.
const MAX_BODY_CHARS = 2048;

/**
 * Decide whether a request URL points at a static asset rather than an API call.
 *
 * Only the part of the URL before the first `?` or `#` is matched against
 * STATIC_PATTERNS. Matching the whole URL would misclassify API calls whose
 * query values merely look like asset paths (e.g. `/api/search?file=logo.png`)
 * and would miss assets such as `/manifest.json?v=2`.
 *
 * @param {string} url - Absolute request URL (or a `data:` URL).
 * @returns {boolean} True when any static-asset pattern matches.
 */
function isStaticAsset(url) {
  // A limit of 1 keeps just the leading piece, i.e. scheme + host + path.
  const [locationPart] = String(url).split(/[?#]/, 1);
  return STATIC_PATTERNS.some((re) => re.test(locationPart));
}

/**
 * Cap a string at `max` characters.
 *
 * Falsy input yields an empty string. Strings within the limit are returned
 * untouched; longer ones are cut at `max` and suffixed with a marker that
 * records the original length.
 *
 * @param {string|null|undefined} str - Text to cap.
 * @param {number} [max=MAX_BODY_CHARS] - Maximum number of characters to keep.
 * @returns {string}
 */
function truncate(str, max = MAX_BODY_CHARS) {
  if (!str) return "";
  if (str.length <= max) return str;

  const kept = str.slice(0, max);
  return `${kept}… [truncated, ${str.length} total chars]`;
}

/**
 * Reduce a header map to the allow-listed names in SAFE_HEADERS.
 *
 * Header names are lower-cased both for the lookup and in the returned
 * object. A missing header map yields an empty object.
 *
 * @param {Object<string, string>|null|undefined} headers
 * @returns {Object<string, string>} Lower-cased allow-listed headers only.
 */
function safeHeaders(headers) {
  const allowed = Object.entries(headers || {})
    .map(([name, value]) => [name.toLowerCase(), value])
    .filter(([name]) => SAFE_HEADERS.has(name));
  return Object.fromEntries(allowed);
}

/**
 * Turn a concrete URL path into an endpoint pattern by swapping
 * identifier-like segments for the placeholder `:id`, so that
 * `/api/users/123` and `/api/users/456` both become `/api/users/:id`.
 *
 * A segment counts as an identifier when it is all digits, a run of eight
 * or more hex characters, or starts like a UUID (8 hex, dash, 4 hex, dash).
 *
 * @param {string} pathname
 * @returns {string}
 */
function normalisePathPattern(pathname) {
  const looksLikeId = (segment) =>
    /^\d+$/.test(segment) ||
    /^[0-9a-f]{8,}$/i.test(segment) ||
    /^[0-9a-f]{8}-[0-9a-f]{4}-/i.test(segment);

  const normalised = [];
  for (const segment of pathname.split("/")) {
    normalised.push(looksLikeId(segment) ? ":id" : segment);
  }
  return normalised.join("/");
}

/**
 * Pull the GraphQL operation name out of a JSON request body.
 *
 * An explicit non-empty `operationName` string wins. Otherwise the name is
 * read from the start of the `query` text (e.g. "query GetUser { ... }").
 * Anything that is not JSON, or carries no operation name, yields null.
 *
 * @param {string|null} body - Raw request body string.
 * @returns {string|null} Operation name, or null.
 */
function extractGraphQLOperationName(body) {
  if (!body) return null;

  let payload;
  try {
    payload = JSON.parse(body);
  } catch {
    return null; // not JSON
  }
  // JSON.parse can legitimately produce null (body "null"), which has no
  // properties to inspect.
  if (payload === null) return null;

  const { operationName, query } = payload;
  if (typeof operationName === "string" && operationName !== "") {
    return operationName;
  }
  if (typeof query !== "string") return null;

  const named = /^\s*(?:query|mutation|subscription)\s+(\w+)/.exec(query);
  return named ? named[1] : null;
}

/**
 * Tell whether a URL path contains a `/graphql` segment (case-insensitive,
 * ending on a word boundary so `/graphqlx` does not qualify).
 *
 * @param {string} pathname
 * @returns {boolean}
 */
function isGraphQLPath(pathname) {
  const graphqlSegment = /\/graphql\b/i;
  return graphqlSegment.test(pathname);
}

/**
 * Start recording API traffic on a Playwright BrowserContext.
 *
 * Listeners are registered on the context itself, so every page opened in it
 * is covered. A request is recorded only when it is not a static asset, its
 * origin equals the app origin, and its resource type is `fetch` or `xhr`.
 * The matching response later fills in status, timing, safe headers and
 * (for JSON content types) the truncated body. A request that never gets a
 * response stays in the list with `status: null`.
 *
 * Call `detach()` to stop listening and `getEntries()` to read the recorded
 * calls. Failures inside the listeners are swallowed on purpose so that
 * capture can never break the crawl.
 *
 * @param {Object} context — Playwright BrowserContext instance
 * @param {string} appOrigin — the project URL (only same-origin calls captured)
 * @returns {Object} `{ detach(), getEntries() }` — collector handle
 */
export function createHarCapture(context, appOrigin) {
  const recorded = [];
  // Keyed by the Playwright request object itself, which gives an exact 1:1
  // request–response pairing even when several concurrent calls hit the same
  // endpoint (a `method:url:timestamp` string key would mispair those).
  const awaitingResponse = new WeakMap();
  const apiResourceTypes = new Set(["fetch", "xhr"]);

  // Reduce the configured URL to its origin; if it does not parse, compare
  // against the raw value as given.
  let expectedOrigin = appOrigin;
  try {
    expectedOrigin = new URL(appOrigin).origin;
  } catch { /* keep the raw value */ }

  const handleRequest = (request) => {
    try {
      const requestUrl = request.url();
      if (isStaticAsset(requestUrl)) return;

      let location;
      try {
        location = new URL(requestUrl);
      } catch {
        return;
      }
      if (location.origin !== expectedOrigin) return;

      const httpMethod = request.method();
      // Documents, stylesheets, images, etc. are not API calls.
      if (!apiResourceTypes.has(request.resourceType())) return;

      let fullBody = null;
      let cappedBody = null;
      try {
        fullBody = request.postData();
        cappedBody = fullBody ? truncate(fullBody) : null;
      } catch { /* no body */ }

      // The operation name is read from the untruncated body: large GraphQL
      // payloads often exceed the cap, and a truncated body is no longer
      // valid JSON.
      let operationName = null;
      if (httpMethod === "POST" && isGraphQLPath(location.pathname)) {
        operationName = extractGraphQLOperationName(fullBody);
      }

      const record = {
        method: httpMethod,
        url: requestUrl,
        pathname: location.pathname,
        query: location.search || "",
        requestHeaders: safeHeaders(request.headers()),
        requestBody: cappedBody,
        graphqlOperation: operationName,
        status: null,
        responseHeaders: {},
        responseBody: null,
        contentType: null,
        durationMs: null,
        startTime: Date.now(),
        pageUrl: null,
      };

      // Best effort: remember which page issued the call.
      try {
        const issuingFrame = request.frame();
        if (issuingFrame) record.pageUrl = issuingFrame.url();
      } catch { /* frame may be detached */ }

      awaitingResponse.set(request, record);
      recorded.push(record);
    } catch { /* swallow — never break the crawl */ }
  };

  const handleResponse = async (response) => {
    try {
      const request = response.request();
      const record = awaitingResponse.get(request);
      if (!record) return; // request was filtered out or already answered
      awaitingResponse.delete(request);

      record.status = response.status();
      record.durationMs = Date.now() - record.startTime;

      const headers = response.headers();
      const mediaType = (headers["content-type"] || "").toLowerCase();
      record.contentType = mediaType;
      record.responseHeaders = safeHeaders(headers);

      // HTML/binary bodies are noise for API tests — keep JSON only.
      if (!mediaType.includes("json")) return;
      try {
        const text = await response.text().catch(() => "");
        record.responseBody = truncate(text);
      } catch { /* body unavailable */ }
    } catch { /* swallow */ }
  };

  context.on("request", handleRequest);
  context.on("response", handleResponse);

  return {
    detach() {
      context.removeListener("request", handleRequest);
      context.removeListener("response", handleResponse);
    },
    getEntries() {
      return recorded;
    },
  };
}

/**
 * Deduplicate and summarise captured HAR entries into API endpoint descriptors.
 *
 * Entries are grouped by `METHOD + normalised path pattern`, plus the GraphQL
 * operation name when one was detected, so distinct operations on the same
 * `/graphql` path stay separate. Each group keeps up to two distinct example
 * URLs and the first non-empty request/response body seen. The result is
 * sorted by call count (most-used first) and capped at 30 endpoints to keep
 * the prompt bounded.
 *
 * @param {HarEntry[]} entries — raw entries from createHarCapture
 * @returns {ApiEndpoint[]} — deduplicated endpoint summaries (`[]` for null/empty input)
 *
 * @typedef {Object} ApiEndpoint
 * @property {string}   method       — HTTP method (GET, POST, etc.)
 * @property {string}   pathPattern  — normalised path (e.g. `/api/users/:id`);
 *                                     GraphQL groups append ` (OperationName)`
 * @property {string|null} graphqlOperation — GraphQL operation name, or null
 * @property {string[]} exampleUrls  — 1–2 concrete URLs observed
 * @property {number[]} statuses     — unique status codes observed, ascending
 * @property {string}   contentType  — first non-empty response content-type seen
 * @property {string|null} requestBodyExample — first observed request body
 * @property {string|null} responseBodyExample — first observed response body (JSON)
 * @property {number}   callCount    — how many times this endpoint was hit
 * @property {number}   avgDurationMs — rounded mean over calls that got a response (0 if none)
 * @property {string[]} pageUrls     — up to 3 pages that triggered this endpoint
 */
export function summariseApiEndpoints(entries) {
  if (!entries || entries.length === 0) return [];

  const MAX_ENDPOINTS = 30;
  const MAX_EXAMPLE_URLS = 2;
  const MAX_PAGE_URLS = 3;

  // Group by METHOD + normalised path (+ GraphQL operation name)
  const groups = new Map();

  for (const e of entries) {
    const pattern = normalisePathPattern(e.pathname);
    const gqlOp = e.graphqlOperation || null;
    const key = gqlOp ? `${e.method} ${pattern} [${gqlOp}]` : `${e.method} ${pattern}`;

    let g = groups.get(key);
    if (!g) {
      g = {
        method: e.method,
        pathPattern: gqlOp ? `${pattern} (${gqlOp})` : pattern,
        graphqlOperation: gqlOp,
        exampleUrls: [],
        statuses: new Set(),
        contentType: e.contentType || "",
        requestBodyExample: null,
        responseBodyExample: null,
        durations: [],
        pageUrls: new Set(),
        callCount: 0,
      };
      groups.set(key, g);
    }

    g.callCount++;
    if (g.exampleUrls.length < MAX_EXAMPLE_URLS && !g.exampleUrls.includes(e.url)) {
      g.exampleUrls.push(e.url);
    }
    if (e.status) g.statuses.add(e.status);
    // durationMs stays null until a response arrives. A measured 0 ms is a
    // real sample (sub-millisecond response) and must count toward the mean,
    // so test for a number rather than for truthiness.
    if (Number.isFinite(e.durationMs)) g.durations.push(e.durationMs);
    if (e.pageUrl) g.pageUrls.add(e.pageUrl);
    if (!g.requestBodyExample && e.requestBody) g.requestBodyExample = e.requestBody;
    if (!g.responseBodyExample && e.responseBody) g.responseBodyExample = e.responseBody;
    if (!g.contentType && e.contentType) g.contentType = e.contentType;
  }

  return Array.from(groups.values())
    .map((g) => ({
      method: g.method,
      pathPattern: g.pathPattern,
      graphqlOperation: g.graphqlOperation || null,
      exampleUrls: g.exampleUrls,
      // Explicit comparator: the default sort compares values as strings.
      statuses: [...g.statuses].sort((a, b) => a - b),
      contentType: g.contentType,
      requestBodyExample: g.requestBodyExample,
      responseBodyExample: g.responseBodyExample,
      callCount: g.callCount,
      avgDurationMs: g.durations.length
        ? Math.round(g.durations.reduce((sum, d) => sum + d, 0) / g.durations.length)
        : 0,
      pageUrls: [...g.pageUrls].slice(0, MAX_PAGE_URLS),
    }))
    .sort((a, b) => b.callCount - a.callCount)
    .slice(0, MAX_ENDPOINTS);
}