Source: testRunner.js

/**
 * @module testRunner
 * @description Thin orchestrator for Playwright test execution with parallel
 * worker support.
 *
 * Owns the browser lifecycle, per-test loop (sequential or parallel), trace
 * management, and final status transition. Delegates heavy sub-tasks to
 * focused modules:
 *
 * | Module                          | Responsibility                        |
 * |---------------------------------|---------------------------------------|
 * | `runner/config.js`              | Env constants, artifact dir setup     |
 * | `runner/codeParsing.js`         | `extractTestBody` (hasCode check)     |
 * | `runner/executeTest.js`         | Single-test execution                 |
 * | `runner/feedbackIntegration.js` | Post-run AI feedback loop             |
 *
 * ### Parallel execution
 * When `parallelWorkers > 1`, tests run in concurrent browser contexts within
 * a single Chromium instance. Each worker picks the next queued test, executes
 * it in its own isolated `BrowserContext`, and reports back. The shared browser
 * process keeps memory usage lower than launching N separate browsers.
 *
 * Concurrency is controlled by:
 * 1. `PARALLEL_WORKERS` env var (default for all runs)
 * 2. Per-run override via `options.parallelWorkers` (from Test Dials / API)
 *
 * ### Exports
 * - {@link runTests} — Execute an array of approved tests against a project.
 */

import { extractTestBody, isApiTest } from "./runner/codeParsing.js";
import { executeTest, executeTestIterations } from "./runner/executeTest.js";
import { runFeedbackLoop } from "./runner/feedbackIntegration.js";
import { isSmokeTest } from "./pipeline/riskScorer.js";
import { clusterFailures } from "./pipeline/failureClusterer.js";
import { TRACES_DIR, DEFAULT_PARALLEL_WORKERS, MAX_TEST_RETRIES, launchBrowser, resolveBrowser, BROWSER_HEADLESS } from "./runner/config.js";
import { executeWithRetries } from "./runner/retry.js";
import { finalizeRunIfNotAborted, isRunAborted } from "./utils/abortHelper.js";
import { trackTelemetry } from "./utils/telemetry.js";
import { emitRunEvent, log, logWarn, logError, logSuccess } from "./utils/runLogger.js";
import { classifyError } from "./utils/errorClassifier.js";
import { structuredLog, formatLogLine } from "./utils/logFormatter.js";
import * as testRepo from "./database/repositories/testRepo.js";
import * as runRepo from "./database/repositories/runRepo.js";
import * as testFixtureRepo from "./database/repositories/testFixtureRepo.js";
import { signRunArtifacts, signArtifactUrl } from "./middleware/appSetup.js";
import { writeArtifactBuffer } from "./utils/objectStorage.js";
import fs from "fs";
import { recordMetric } from "./utils/recordMetric.js";
import { isNonExecutedSkip } from "./utils/skipReasons.js";


/**
 * Evaluate a project's quality gates against a finished (or in-flight) run.
 *
 * Supported gate keys (each is only enforced when its value is a finite number):
 * - `minPassRate` — minimum pass percentage over *executed* tests.
 * - `maxFlakyPct` — maximum percentage of executed tests that needed a retry.
 * - `maxFailures` — maximum absolute number of failed results.
 *
 * @param {Object|null|undefined} gates - Gate configuration. Anything that is
 *   not a non-empty plain object (null, primitives, arrays, `{}`) is treated as
 *   "no gates configured".
 * @param {Object} run - Run record; reads `total`, `passed`, `failed` and,
 *   when it is an array, `results`.
 * @returns {{ passed: boolean, violations: Object[] }|null} `null` when no
 *   gates are configured, otherwise the verdict plus one
 *   `{ rule, threshold, actual }` entry per breached gate.
 */
function evaluateQualityGates(gates, run) {
  // Defense-in-depth: route-level validation should already reject empty gate
  // payloads, but a corrupted row could still surface `{}` (or a non-object)
  // here. Returning null keeps the "unconfigured" shape instead of silently
  // reporting `{ passed: true }` for a misconfigured project.
  const isGateObject = Boolean(gates) && typeof gates === "object" && !Array.isArray(gates);
  if (!isGateObject || Object.keys(gates).length === 0) {
    return null;
  }

  // AUTO-001 / AUTO-004: `run.total` counts every approved test, but tests
  // skipped without executing (over budget / no impact) must not dilute the
  // denominator. Keep in sync with the frontend's `passRateDenominator` in
  // `frontend/src/pages/RunDetail.jsx`.
  const reportedTotal = Number(run.total || 0);
  const resultRows = Array.isArray(run.results) ? run.results : null;
  const neverExecuted = resultRows ? resultRows.filter(isNonExecutedSkip).length : 0;
  const executedTotal = Math.max(0, reportedTotal - neverExecuted);
  const failedCount = Number(run.failed || 0);
  const passedCount = Number(run.passed || 0);

  // An empty denominator counts as a perfect pass rate.
  let passRate = 100;
  if (executedTotal > 0) {
    passRate = (passedCount / executedTotal) * 100;
  }

  // Flakiness is the share of *tests* that needed at least one retry — not the
  // sum of retries — so a single test retried several times cannot push the
  // figure past 100%. Without a results array (e.g. not populated yet) it is 0.
  const retriedTests = resultRows
    ? resultRows.reduce((count, row) => count + (Number(row?.retryCount || 0) > 0 ? 1 : 0), 0)
    : 0;
  let flakyPct = 0;
  if (executedTotal > 0) {
    flakyPct = (retriedTests / executedTotal) * 100;
  }

  // [gate key, "is breached" predicate, reported actual value] — evaluated in
  // this order so the violations list keeps a stable shape.
  const rules = [
    ["minPassRate", (limit) => passRate < limit, () => Number(passRate.toFixed(2))],
    ["maxFlakyPct", (limit) => flakyPct > limit, () => Number(flakyPct.toFixed(2))],
    ["maxFailures", (limit) => failedCount > limit, () => failedCount],
  ];

  const violations = [];
  for (const [rule, isBreached, readActual] of rules) {
    const threshold = gates[rule];
    if (Number.isFinite(threshold) && isBreached(threshold)) {
      violations.push({ rule, threshold, actual: readActual() });
    }
  }

  return { passed: violations.length === 0, violations };
}


/**
 * Compare each result's captured web-vitals against the configured budgets.
 *
 * @param {Object|null|undefined} budgets - Per-metric ceilings keyed by
 *   `lcp` / `cls` / `inp` / `ttfb`. Non-objects, arrays and `{}` mean
 *   "no budgets configured".
 * @param {Object} run - Run record; only `run.results[].webVitals`, `testId`
 *   and `testName` are read.
 * @returns {{ passed: boolean, violations: Object[] }|null} `null` when no
 *   budgets are configured or when not a single metric could be compared;
 *   otherwise the verdict plus one
 *   `{ rule, threshold, actual, testId, testName }` entry per exceeded budget.
 */
function evaluateWebVitalsBudgets(budgets, run) {
  const isConfigured =
    Boolean(budgets) &&
    typeof budgets === "object" &&
    !Array.isArray(budgets) &&
    Object.keys(budgets).length > 0;
  if (!isConfigured) return null;

  const resultRows = Array.isArray(run.results) ? run.results : [];

  // Only metrics with a finite budget can ever be compared; resolve them once
  // instead of re-checking the budget for every result row.
  const limits = ["lcp", "cls", "inp", "ttfb"]
    .map((metric) => [metric, budgets[metric]])
    .filter(([, limit]) => Number.isFinite(limit));

  // Count real comparisons so "nothing was measured" (e.g. every capture came
  // back with null metrics) is reported as null rather than as a false
  // `{ passed: true }`.
  let comparisons = 0;
  const violations = [];

  for (const row of resultRows) {
    const vitals = row?.webVitals;
    if (!vitals || typeof vitals !== "object") continue;

    for (const [metric, limit] of limits) {
      const measured = vitals[metric];
      if (!Number.isFinite(measured)) continue;

      comparisons += 1;
      if (measured > limit) {
        violations.push({
          rule: metric,
          threshold: limit,
          actual: measured,
          testId: row.testId,
          testName: row.testName || null,
        });
      }
    }
  }

  return comparisons === 0 ? null : { passed: violations.length === 0, violations };
}

// Test-only exports: both evaluators are re-exported under `__…ForTest`
// aliases so integration tests can exercise them without pulling in the full
// runner surface. Not part of the public module contract — callers outside
// tests should rely on run.gateResult.
export { evaluateQualityGates as __evaluateQualityGatesForTest, evaluateWebVitalsBudgets as __evaluateWebVitalsBudgetsForTest };

// ── Sharding & concurrency helpers ────────────────────────────────────────────
// The shard-partition helpers come first. `poolMap`, defined after them, is a
// lightweight promise pool — no external dependencies. It runs `fn` for each
// item in `items` with at most `concurrency` in-flight at once. Results are
// returned in the original item order.

/**
 * CAP-002 — Compute per-shard sizes using Playwright's `--shard=N/M`
 * algorithm: the first `total % shardCount` shards receive one extra item,
 * so shard sizes differ by at most one. Pure function — no I/O and no
 * mutation of its inputs. The single source of truth for the partition
 * shape; both `partitionTestsIntoShards` (stamps `_shardIndex` on test
 * objects) and `partitionTestIdsForShards` (slices plain-string arrays)
 * derive their output from this.
 *
 * Inputs are normalised defensively so a malformed value can never crash the
 * partition or produce sizes that do not add up:
 * - `shardCount` is coerced to a number and truncated to an integer; anything
 *   that is not a finite value >= 1 (NaN, 0, negatives, ±Infinity) falls back
 *   to a single shard.
 * - `total` is coerced and truncated the same way; anything that is not a
 *   finite positive value is treated as 0.
 *
 * @param {number} total       - Total item count.
 * @param {number} shardCount  - 1..MAX_WORKERS (caller is responsible for clamp).
 * @returns {number[]} `sizes[shardIndex]` — per-shard item count, summing to
 *   the normalised `total`. Empty trailing shards (possible when
 *   `shardCount > total`) yield 0.
 */
export function computeShardSizes(total, shardCount) {
  // Truncate before validating: a fractional count (e.g. 2.5 from a corrupted
  // row) would otherwise be used as an array length and throw a RangeError.
  const wholeShards = Math.floor(Number(shardCount));
  const count = Number.isFinite(wholeShards) && wholeShards >= 1 ? wholeShards : 1;

  // A fractional total would make `%` return a fractional remainder and hand
  // out more items than exist, so it is truncated as well.
  const wholeTotal = Math.floor(Number(total));
  const safeTotal = Number.isFinite(wholeTotal) && wholeTotal > 0 ? wholeTotal : 0;

  const baseSize = Math.floor(safeTotal / count);
  const remainder = safeTotal % count;
  // The first `remainder` shards absorb the leftover items, one each.
  return Array.from({ length: count }, (_, shard) => baseSize + (shard < remainder ? 1 : 0));
}

/**
 * CAP-002 — Split `tests` into `shardCount` contiguous slices whose sizes come
 * from {@link computeShardSizes}, stamping every test with the 0-based
 * `_shardIndex` of the slice it falls into.
 *
 * The stamp is written onto the caller's test objects in place — callers rely
 * on it to attribute results to the correct shard at completion time. Apart
 * from that stamp the function does no I/O and touches no DB, so the partition
 * contract can be exercised in isolation by
 * `backend/tests/run-sharding.test.js`. The `sizes[]` array is returned so the
 * runner can detect "last test in shard S has finished" without re-deriving
 * the partition.
 *
 * @param {Object[]} tests       - Tests in dispatch order (post-smoke-pin).
 * @param {number}   shardCount  - 1..MAX_WORKERS (caller is responsible for clamp).
 * @returns {{ sizes: number[] }} per-shard test counts.
 */
export function partitionTestsIntoShards(tests, shardCount) {
  const sizes = computeShardSizes(tests.length, shardCount);

  // Walk the shards in order and hand each one the next `size` tests; empty
  // shards simply consume nothing.
  let position = 0;
  sizes.forEach((size, shardIndex) => {
    const sliceEnd = position + size;
    while (position < sliceEnd) {
      tests[position]._shardIndex = shardIndex;
      position += 1;
    }
  });

  return { sizes };
}

/**
 * CAP-002 Phase 2 — Split a list of test IDs into `shardCount` contiguous
 * slices whose sizes come from {@link computeShardSizes}. The route layer
 * calls this at enqueue time to pre-compute each BullMQ shard job's `testIds`
 * payload — the coordinator is the single source of truth for the split;
 * workers never re-derive the partition (avoids drift if a future test sort
 * changes the approved-test order between enqueue and worker pickup).
 *
 * The input array is sliced directly (no tagged copies), and because the
 * sizes come from the same helper as `partitionTestsIntoShards`, the two
 * partitions cannot drift apart.
 *
 * @param {string[]} testIds    - Approved test IDs in dispatch order. A
 *   non-array value is treated as an empty list.
 * @param {number}   shardCount - 1..MAX_WORKERS (caller is responsible for clamp).
 * @returns {string[][]} `slices[shardIndex]` is the array of test IDs for
 *   that shard. Empty shards (possible when `shardCount > testIds.length`)
 *   yield `[]` at their slot.
 */
export function partitionTestIdsForShards(testIds, shardCount) {
  const source = Array.isArray(testIds) ? testIds : [];

  // Running offset into `source`; each shard takes the next `size` IDs.
  let offset = 0;
  return computeShardSizes(source.length, shardCount).map((size) => {
    const start = offset;
    offset += size;
    return source.slice(start, offset);
  });
}

/**
 * CAP-002 Phase 2 (Prerequisite #2) — Build the public artifact URL for a
 * run's trace zip. The URL mirrors the on-disk layout in `TRACES_DIR` so
 * `signArtifactUrl` and the trace-viewer static-file mount can resolve nested
 * paths without special-casing.
 *
 * A nullish `shardIndex` (legacy / single-shard runs) yields the flat
 * `<runId>.zip` URL, so existing consumers of `run.tracePath` see no change.
 * Pure: no I/O, no DB — exported so `backend/tests/run-sharding.test.js` can
 * assert the contract directly.
 *
 * @param {string}       runId
 * @param {number|null}  shardIndex - 0-based shard index, or null for legacy single-path runs.
 * @returns {string} `/artifacts/traces/<runId>.zip` or `/artifacts/traces/<runId>/shard-<idx>.zip`
 */
export function shardTraceArtifactPath(runId, shardIndex) {
  // Shard 0 is a real shard — only null/undefined select the legacy layout.
  const isLegacyLayout = shardIndex === null || shardIndex === undefined;
  return isLegacyLayout
    ? `/artifacts/traces/${runId}.zip`
    : `/artifacts/traces/${runId}/shard-${shardIndex}.zip`;
}

/**
 * Lightweight promise pool — no external dependencies. Runs `fn` for each
 * entry of `items` with at most `concurrency` calls in flight at once.
 *
 * `concurrency` is normalised to an integer >= 1. A value of 0, NaN or a
 * negative number would otherwise start zero workers and silently skip every
 * item, so those fall back to sequential execution (1 worker). Fractions are
 * truncated; `Infinity` means "one worker per item".
 *
 * @param {Array}        items       - Work items, processed in index order.
 * @param {number}       concurrency - Maximum number of in-flight `fn` calls.
 * @param {Function}     fn          - `(item, index) => Promise<result>`.
 * @param {AbortSignal}  [signal]    - When aborted, no further items are
 *   started; calls already in flight are left to finish.
 * @returns {Promise<Array>} Results in the original item order. Slots for
 *   items that were never started (abort) are left unset. Rejects with the
 *   first error thrown by `fn`.
 */
async function poolMap(items, concurrency, fn, signal) {
  const results = new Array(items.length);

  const requested = Number(concurrency);
  const limit = Number.isNaN(requested) || requested < 1 ? 1 : Math.floor(requested);
  const workerCount = Math.min(limit, items.length);

  // Shared cursor: claiming an index is synchronous (no await between the
  // bounds check and the increment), so two workers can never take the same
  // item.
  let nextIndex = 0;

  async function worker() {
    while (nextIndex < items.length && !signal?.aborted) {
      const idx = nextIndex++;
      results[idx] = await fn(items[idx], idx);
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}


/**
 * Execute an array of approved tests against a project using Playwright.
 * Launches the requested browser (Chromium by default; the launch is skipped
 * entirely when every test is API-only), runs each test with self-healing
 * (optionally in parallel), collects results, saves traces/videos, runs the
 * AI feedback loop, and finalises the run.
 *
 * @param {Object}      project                   - The project `{ id, name, url }`.
 * @param {Object[]}    tests                     - Array of test objects to execute.
 * @param {Object}      run                       - The run record (mutated in place).
 * @param {Object}      [options]
 * @param {number}      [options.parallelWorkers]  - Concurrent browser contexts (1–10). Overrides env default.
 * @param {string}      [options.browser]          - `"chromium" | "firefox" | "webkit"` (DIF-002). Defaults to chromium.
 * @param {string}      [options.device]           - Playwright device preset name (DIF-003).
 * @param {string}      [options.locale]           - BCP 47 locale (AUTO-007).
 * @param {string}      [options.timezoneId]       - IANA timezone (AUTO-007).
 * @param {Object}      [options.geolocation]      - `{ latitude, longitude }` (AUTO-007).
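 * @param {*}           [options.networkCondition] - Network condition setting, forwarded unchanged to `executeTest`.
 * @param {AbortSignal} [options.signal]           - Abort signal for cancellation.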
 * @param {number|null} [options.shardIndex]       - CAP-002 Phase 2: when set,
 *   the cross-process shard worker passes its 0-based shard index so trace
 *   artifacts land at `${TRACES_DIR}/${runId}/shard-${shardIndex}.zip`
 *   instead of the single-path layout. `null` (default) preserves the
 *   pre-shard zero-regression path — same filename, same `run.tracePath`,
 *   no `tracePaths[]` JSON column populated. See migration 026.
 * @returns {Promise<void>}
 */
export async function runTests(project, tests, run, { parallelWorkers, browser: browserName, device, locale, timezoneId, geolocation, networkCondition, signal, shardIndex = null } = {}) {
  const runId = run.id;
  // CAP-002 Phase 2 (Prerequisite #2) — shard-mode trace artifacts live in a
  // per-run subdirectory keyed by shard index so N concurrent shard workers
  // can write side-by-side without colliding on a single `${runId}.zip`.
  // Single-shard runs (`shardIndex == null`) preserve the legacy single-path
  // layout for bit-for-bit zero regression. The route-relative artifact path
  // mirrors the on-disk shape — `signArtifactUrl` and the trace-viewer
  // static-file route already handle nested paths via `req.params[0]`.
  const isShardMode = shardIndex != null;
  const tracePath = isShardMode
    ? `${TRACES_DIR}/${runId}/shard-${shardIndex}.zip`
    : `${TRACES_DIR}/${runId}.zip`;

  // AUTO-001: smoke tests always dispatch first regardless of caller order.
  // This is a runner-level invariant — any callsite of runTests (route layer,
  // BullMQ worker, single-test execute, future schedulers) gets the same
  // pin-smoke-to-front guarantee without duplicating the rule. Risk-based
  // ordering of the non-smoke tail is established at the route layer (where
  // run history + changedPages are available); the runner stays history-free
  // and only enforces the smoke pin to preserve auditability of the saved
  // run.testQueue order. Stable sort: tests retain their input order within
  // the smoke / non-smoke partitions.
  tests = [
    ...tests.filter((t) => isSmokeTest(t)),
    ...tests.filter((t) => !isSmokeTest(t)),
  ];

  // Resolve concurrency: per-run override → env default → 1 (sequential)
  const workers = Math.max(1, Math.min(10, parallelWorkers || DEFAULT_PARALLEL_WORKERS));

  // CAP-002 — partition the dispatch queue into `run.shardCount` contiguous
  // slices and tag each test with its shard index. Today the partition runs
  // in-process via `poolMap`; the follow-up cross-process PR (see
  // ROADMAP CAP-002 — coordinator + BullMQ shard jobs + Redis pub/sub
  // abort) will lift this same partition algorithm into the queue layer
  // so `partitionTestsIntoShards` is the single source of truth for the
  // split. `run.shardCount` defaults to 1 (the route layer only writes >1
  // when the caller explicitly passed `shards: N`), so fixture-less runs
  // and runs that only set `dialsConfig.parallelWorkers` go through the
  // single-shard zero-regression path.
  const shardCount = Math.max(1, Number(run.shardCount) || 1);
  // CAP-002 Phase 2 — when invoked by a shard worker (`shardIndex != null`),
  // the caller has already pre-partitioned the suite at enqueue time
  // (`partitionTestIdsForShards` in routes/runs.js), so this single call
  // owns ONE shard's slice. Stamp every test with the assigned shardIndex
  // so `processResult` attributes results to the right shard; skip the
  // in-process partition + shard-progress tracking (the worker drives
  // shard completion externally via `runRepo.incrementShardsCompleted`
  // after this function returns). Legacy single-process / non-sharded
  // callers (`isShardMode === false`) keep using the in-process partition
  // bit-for-bit — zero regression.
  let shardSizes;
  let shardRemaining;
  if (isShardMode) {
    // Pre-partitioned slice — every test belongs to this shard.
    for (const t of tests) { t._shardIndex = shardIndex; }
    shardSizes = [tests.length];
    shardRemaining = [tests.length];
  } else {
    ({ sizes: shardSizes } = partitionTestsIntoShards(tests, shardCount));
    // Track shard completion via a remaining-tests counter so we increment
    // `run.shardsCompleted` exactly when a shard's *last* test reports back —
    // poolMap may interleave shards (workers > shardCount, or shards of
    // different sizes), so a naive "increment per shard at boundary" would
    // miscount under concurrent dispatch.
    shardRemaining = [...shardSizes];
    if (run.shardsCompleted == null) run.shardsCompleted = 0;
    // CAP-002 — empty shards (size 0, possible when shardCount > tests.length)
    // have no tests to drain via `recordTestShardComplete`, so the counter
    // would never advance for them. Without pre-crediting, the badge would
    // surface "Shards M/N" with M < N during execution and only reconcile at
    // the very end of `finalizeRunIfNotAborted` — and if the run is aborted
    // before finalization, the badge would be permanently stuck at the
    // partial value. SSE snapshots fire on every result, so the badge needs
    // to reflect "no work to do here" the moment the partition is computed.
    const emptyShards = shardSizes.filter((s) => s === 0).length;
    if (emptyShards > 0) {
      run.shardsCompleted = Math.min(shardCount, run.shardsCompleted + emptyShards);
    }
  }

  // Classify each test once upfront and cache the result on the test object.
  // This avoids re-parsing the code body via isApiTest() multiple times per
  // test (previously called 4× each: allApiOnly, apiCount, logging, executeTest).
  // executeTest reads test._isApi instead of re-calling isApiTest().
  for (const t of tests) {
    t._isApi = !!(t.playwrightCode && isApiTest(t.playwrightCode));
    // Persist the classification on the test object so the frontend can read
    // test.isApiTest directly without reimplementing the detection heuristic.
    testRepo.update(t.id, { isApiTest: t._isApi });
  }

  // If every test is API-only, skip the entire browser launch + trace context
  // to save ~100-200MB of RAM.
  const allApiOnly = tests.every(t => t._isApi);

  let browser = null;
  let traceContext = null;

  // DIF-002: resolve the requested browser once so we can log + persist a
  // canonical name (invalid / unknown values fall back to chromium).
  const { name: resolvedBrowser } = resolveBrowser(browserName);
  run.browser = resolvedBrowser;

  structuredLog("run.start", { runId, projectId: project.id, tests: tests.length, workers, allApiOnly, browser: resolvedBrowser });

  if (!allApiOnly) {
    try {
      browser = await launchBrowser({ browser: resolvedBrowser });
    } catch (launchErr) {
      const classified = classifyError(launchErr, "run");
      run.status = "failed";
      run.error = classified.message;
      run.errorCategory = classified.category;
      run.finishedAt = new Date().toISOString();
      // CAP-002 — no tests will execute on this run, so no shard will drain
      // naturally via processResult. Mark every shard as "completed" so the
      // UI badge reads `N/N` rather than `0/N` after a hard launch failure.
      run.shardsCompleted = shardCount;
      logError(run, classified.message);
      structuredLog("browser.launch_failed", { runId, error: classified.message });
      throw launchErr;
    }
    structuredLog("browser.launched", { runId });

    // Shared tracing context (separate from per-test video contexts)
    try {
      traceContext = await browser.newContext({
        userAgent: "Mozilla/5.0 (compatible; AutonomousQA/1.0)",
        viewport: { width: 1280, height: 720 },
      });
      await traceContext.tracing.start({ screenshots: true, snapshots: true, sources: false });
    } catch (ctxErr) {
      await browser.close().catch(() => {});
      const classified = classifyError(ctxErr, "run");
      run.status = "failed";
      run.error = classified.message;
      run.errorCategory = classified.category;
      run.finishedAt = new Date().toISOString();
      // CAP-002 — same rationale as the browser.launch_failed branch above:
      // no tests run, so flush shardsCompleted to shardCount for UI clarity.
      run.shardsCompleted = shardCount;
      logError(run, classified.message);
      throw ctxErr;
    }
  }

  const apiCount = tests.filter(t => t._isApi).length;
  const modeLabel = workers > 1 ? `${workers} parallel workers` : "sequential";
  log(run, `🚀 Starting test run: ${tests.length} tests (${modeLabel})`);
  log(run, `⚙️ Run config:`);
  log(run, `Execution mode: ${workers > 1 ? `⚡ Parallel (${workers} workers)` : "▶ Sequential (1 worker)"}`);
  log(run, `Tests queued: ${tests.length}${apiCount > 0 ? ` (${apiCount} API, ${tests.length - apiCount} browser)` : ""}`);
  log(run, `Project URL: ${project.url}`);
  log(run, allApiOnly
    ? `Browser: ⏭️ Skipped (all tests are API-only)`
    : `Browser: ${resolvedBrowser} (${BROWSER_HEADLESS ? "headless" : "headed"})`);

  const runStart = Date.now();
  const allVideoSegments = [];
  // CAP-002 Phase 2 — per-shard stat accumulators. The worker composes the
  // parent `runs` row's totals from each shard's returned delta via
  // `runRepo.incrementRunStats`. For legacy single-shard runs these stay
  // aligned with `run.passed` / `run.failed` and are simply returned for
  // completeness — the caller may ignore them. `shardTotalDelta` captures
  // data-driven tests' iteration overflow (N fixture rows expand to N
  // iteration results — the original slice size already counted 1).
  let shardPassed = 0;
  let shardFailed = 0;
  let shardTotalDelta = 0;

  // CAP-002 — advance shard progress once per *test* (not per iteration
  // result). Data-driven tests call processResult N times for a single
  // test, so decrementing inside processResult would drain the shard
  // counter mid-test and surface a premature "Shards M/N" badge. The
  // poolMap callback below calls this helper exactly once after a test
  // fully resolves (success or crash), so each shard's counter reaches
  // zero precisely when its last test reports back. `_shardIndex` is
  // stamped by `partitionTestsIntoShards`; defensive `?? 0` covers
  // tests that bypass the partition (single-shard runs still attribute
  // to shard 0 — same effective behaviour).
  //
  // In `isShardMode` (cross-process shard worker), the BullMQ worker
  // drives `run.shardsCompleted` externally via
  // `runRepo.incrementShardsCompleted` after this function returns —
  // this helper must NOT touch the counter, or the boundary-crossing
  // finalizer detection in `runWorker.js` would double-count. The
  // previous implementation no-op'd by accident: tests are stamped
  // with the worker's `shardIndex` (e.g. 2) while `shardRemaining` is
  // sized `[tests.length]` (length 1), so `shardRemaining[2]` was
  // `undefined` and the `!= null` guard silently bailed. Make the
  // intent explicit so a future refactor of `shardRemaining` sizing
  // can't silently start incrementing here and break the worker's
  // external counter management.
  function recordTestShardComplete(test) {
    if (isShardMode) return;
    const shardIdx = test?._shardIndex ?? 0;
    if (shardRemaining[shardIdx] != null) {
      shardRemaining[shardIdx] -= 1;
      if (shardRemaining[shardIdx] === 0) {
        run.shardsCompleted = Math.min(shardCount, (run.shardsCompleted || 0) + 1);
      }
    }
  }

  // ── Process a single test result — shared by the pool worker callback ────
  function processResult(test, result) {
    // CAP-002 Phase 2 (Prerequisite #3) — stamp the result with its parent
    // test's shard index so the retry-reset path in `runWorker.js` can
    // identify which results to wipe (this shard's) vs. which to preserve
    // (sibling shards that already completed). Single-shard runs stamp
    // `_shardIndex: 0` uniformly — the retry filter is a no-op in that case.
    // `?? 0` covers the pre-partition path (crash-synth before
    // partitionTestsIntoShards stamps the test); shard 0 is the right
    // default for a single-shard run.
    result._shardIndex = test?._shardIndex ?? 0;
    run.results.push(result);

    if (result.videoPath) allVideoSegments.push(result.videoPath);

    // CAP-002 Phase 2 — count locally so we can return the shard's stats
    // delta to the worker, AND mirror to `run.passed` / `run.failed` for
    // legacy single-shard callers. In shard mode the worker composes the
    // parent run's totals from each shard's returned delta via
    // `runRepo.incrementRunStats`, so we don't bump the parent here.
    if (result.status === "passed") {
      if (!isShardMode) run.passed++;
      shardPassed++;
      logSuccess(run, `PASSED (${result.durationMs}ms)`);
    } else if (result.status === "warning") {
      if (!isShardMode) run.passed++;
      shardPassed++;
      logWarn(run, `WARNING: ${result.error}`);
    } else {
      if (!isShardMode) run.failed++;
      shardFailed++;
      logError(run, `FAILED: ${result.error}`);
    }

    // Emit result event (without the heavy base64 screenshot)
    const { screenshot: _ss, ...resultLean } = result;
    const signedResult = { ...resultLean };
    if (signedResult.screenshotPath) signedResult.screenshotPath = signArtifactUrl(signedResult.screenshotPath);
    if (signedResult.videoPath) signedResult.videoPath = signArtifactUrl(signedResult.videoPath);
    emitRunEvent(run.id, "result", { result: signedResult });
    if (result.screenshotPath) {
      emitRunEvent(run.id, "screenshot", {
        testId: test.id,
        screenshotPath: signArtifactUrl(result.screenshotPath),
      });
    }

    testRepo.update(test.id, {
      lastResult: result.status,
      lastRunAt: new Date().toISOString(),
    });

    // CAP-002 Phase 2 — flush this single result via the atomic primitive
    // in shard mode so N concurrent shard workers don't last-write-wins
    // each other's results. Legacy single-process runs continue to use
    // `runRepo.save(run)` (the full-snapshot path) for bit-for-bit zero
    // regression. The atomic primitive splices in a single SQL statement
    // (row-locked by SQLite/Postgres) — Prerequisite #1's contract,
    // verified end-to-end by `run-storage-concurrency.test.js`.
    if (isShardMode) {
      runRepo.appendRunResults(run.id, [result]);
    } else {
      // Flush run state to SQLite after each result so a crash mid-run
      // doesn't lose all results collected so far. SQLite writes are
      // synchronous (~1ms) so this adds negligible overhead per test.
      runRepo.save(run);
    }

    // Broadcast a snapshot after each result so the frontend progress bar
    // updates in real time (especially important during parallel execution
    // where multiple results arrive in quick succession). In shard mode the
    // snapshot is re-read from the DB so sibling-shard results that just
    // landed are reflected — the local `run` object only carries this
    // shard's slice.
    if (!isRunAborted(run, signal)) {
      const snapshotRun = isShardMode ? (runRepo.getById(run.id) || run) : run;
      emitRunEvent(run.id, "snapshot", { run: signRunArtifacts(snapshotRun) });
    }
  }

  try {
    await poolMap(tests, workers, async (test, i) => {
      if (signal?.aborted) return;

      const hasCode = !!(test.playwrightCode && extractTestBody(test.playwrightCode));
      const workerTag = workers > 1 ? ` [w${(i % workers) + 1}]` : "";
      const typeTag = test._isApi ? "🌐 API" : hasCode ? "executing generated code" : "fallback smoke test";
      structuredLog("test.start", { runId, testId: test.id, index: i + 1, total: tests.length, isApi: !!test._isApi });
      log(run, `▶ [${i + 1}/${tests.length}]${workerTag} ${test.name} (${typeTag})`);

      try {
        // The retry callback returns the *array* of iteration results (not a
        // single result) so processResult fires exactly once per iteration of
        // the final attempt — calling processResult inside the callback would
        // double-count `run.passed` / `run.failed` on every retry and push
        // `run.results.length` past `run.total`, corrupting quality-gate
        // evaluation, the SSE progress bar, and the `run.retryCount` /
        // `run.failedAfterRetry` aggregations below.
        const { result: iterResults, retryCount } = await executeWithRetries(async (attempt) => {
          if (attempt > 0) {
            logWarn(run, `↻ Retrying ${test.name} (attempt ${attempt + 1}/${MAX_TEST_RETRIES + 1})`);
          }
          // CAP-001: pull the fixture row-set keyed to this test's current
          // codeVersion. Falsy → fixture-less single-iteration path (zero
          // regression). On retries we re-resolve the fixture so a freshly
          // uploaded fixture between attempts is picked up; the cost is one
          // indexed SELECT per attempt (1-3 max).
          const fixture = testFixtureRepo.getFixture(test.id, Number(test.codeVersion || 1));
          const fixtureRows = fixture?.rows;
          const attemptResults = await executeTestIterations(
            test,
            fixtureRows,
            (iterTest) => executeTest(iterTest, browser, runId, i, runStart, { browser: resolvedBrowser, device, locale, timezoneId, geolocation, networkCondition }),
          );
          const lastResult = attemptResults[attemptResults.length - 1] || null;
          // Retry semantics:
          //  - Fixture-less tests (single iteration): preserve the original
          //    contract — throw on failure so executeWithRetries reruns the
          //    test up to MAX_TEST_RETRIES times. Earlier attempts are
          //    discarded; only the final attempt's results are surfaced.
          //  - Data-driven tests (≥1 fixture row): never retry. Retrying
          //    would re-execute every row (including the passes) on every
          //    failure, multiplying browser work. A visible log line keeps
          //    the suppression explicit — otherwise a failed data-driven
          //    test looks identical to a fixture-less retry exhaustion in
          //    the run timeline, masking the design choice from reviewers.
          // Log the retry-skip when *any* iteration failed, not just the
          // last one — an intermediate row failure with a passing tail row
          // would otherwise silently drop the message and break QA
          // acceptance criterion #18 in `QA.md`.
          const failedRows = attemptResults.filter((r) => r?.status === "failed").length;
          if (fixtureRows?.length && failedRows > 0) {
            logWarn(run, `↻ Skipping retry for ${test.name} — ${failedRows}/${attemptResults.length} fixture iteration(s) failed (data-driven tests don't retry)`);
          }
          if (!fixtureRows?.length && lastResult?.status === "failed") {
            const retryErr = new Error(lastResult.error || "Test failed");
            retryErr.result = lastResult;
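            // Stash the attempt's results on the error so the failure is
            // surfaced exactly once, by the catch block below, after retries
            // are exhausted (avoids double-emit). That catch block reads
            // only `result`; on this fixture-less path `attemptResults` is
            // expected to hold that same single iteration.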
            retryErr.attemptResults = attemptResults;
            throw retryErr;
          }
          return attemptResults;
        }, MAX_TEST_RETRIES);
        // CAP-001: `run.total` is initialised at the route layer to
        // `tests.length` (`backend/src/routes/runs.js:209`) — i.e. one slot
        // per *test*. Data-driven tests fan out into N iteration results
        // and each iteration increments `run.passed` / `run.failed` via
        // `processResult` below, so without adjusting `run.total` here the
        // pass-rate denominator in `evaluateQualityGates` (and the matching
        // `flakyPct` denominator) is too small, producing pass rates >100%
        // and making `minPassRate` / `maxFailures` gates unreachable. The
        // RunDetail progress bar would also render "5 / 1 test cases
        // executed". Bump the total by (N - 1) so the dispatched-iteration
        // count and the run aggregator stay in lock-step. Fixture-less
        // tests yield exactly one iteration → no adjustment, zero
        // regression.
        if (iterResults.length > 1) {
          const overflow = iterResults.length - 1;
          if (isShardMode) {
            // Shard mode: the parent run row's `total` is composed by the
            // worker via `incrementRunStats(totalDelta)` after this function
            // returns. Track the delta locally and surface it on return —
            // bumping `run.total` here would not persist (we don't save the
            // full snapshot in shard mode) and would mislead local SSE
            // snapshots that re-read from the DB anyway.
            shardTotalDelta += overflow;
          } else {
            run.total += overflow;
          }
        }
        // Surface every iteration from the final attempt to the run
        // aggregator — exactly once, after retries have fully resolved.
        for (const iterResult of iterResults) {
          iterResult.retryCount = retryCount;
          iterResult.failedAfterRetry = false;
          processResult(test, iterResult);
        }
        // CAP-002 — drain the shard counter once per test, after every
        // iteration result has been recorded. See `recordTestShardComplete`.
        recordTestShardComplete(test);
        const finalResult = iterResults[iterResults.length - 1];
        if (finalResult) {
          structuredLog("test.result", { runId, testId: test.id, status: finalResult.status, durationMs: finalResult.durationMs });
        }
      } catch (err) {
        // Build a synthetic result and route through processResult so SSE
        // `result` and `snapshot` events are emitted — otherwise the
        // frontend progress bar stalls during parallel execution.
        structuredLog("test.crash", { runId, testId: test.id, error: err.message?.slice(0, 200) });
        const errorResult = err.result || {
          testId: test.id, testName: test.name,
          status: "failed", error: err.message,
          durationMs: 0, network: [], consoleLogs: [],
        };
        // Prefer the actual retry count surfaced by executeWithRetries —
        // synchronous crashes from executeTest may exhaust fewer attempts
        // than MAX_TEST_RETRIES suggests.
        errorResult.retryCount = typeof err.retryCount === "number"
          ? err.retryCount
          : MAX_TEST_RETRIES;
        errorResult.failedAfterRetry = true;
        // processResult fires exactly once on exhaustion — earlier attempts
        // were discarded inside the retry loop so we never double-count.
        processResult(test, errorResult);
        // CAP-002 — crash path also resolves the test exactly once, so
        // drain the shard counter here too (matches the success path).
        recordTestShardComplete(test);
      }
    }, signal);
  } finally {
    // Always clean up browser resources — even if the loop threw unexpectedly.
    // browser/traceContext are null when all tests are API-only.
    if (traceContext) {
      try {
        await traceContext.tracing.stop({ path: tracePath });
        // Route through the storage adapter so S3-mode deployments upload
        // the trace zip; in local mode this is effectively a no-op rewrite
        // of the same file Playwright just produced.
        // CAP-002 Phase 2 (Prerequisite #2) — single source of truth for the
        // trace artifact URL. Mirrors the on-disk shard layout so the
        // trace-viewer route resolves nested paths verbatim. Single-shard
        // runs keep the legacy `/artifacts/traces/${runId}.zip` URL for
        // bit-for-bit zero regression with `run.tracePath` consumers
        // (RunDetail link, GitHub Check summary, signed-URL middleware).
        const traceArtifactPath = shardTraceArtifactPath(runId, isShardMode ? shardIndex : null);
        try {
          await writeArtifactBuffer({
            artifactPath: traceArtifactPath,
            absolutePath: tracePath,
            buffer: fs.readFileSync(tracePath),
            contentType: "application/zip",
          });
          run.tracePath = traceArtifactPath;
          // In shard mode also record the per-shard URL in the new
          // `tracePaths` JSON column (migration 026) so `RunDetail.jsx`
          // renders a dropdown when `shardCount > 1`. Use the atomic
          // primitive — N shard workers writing different slots on the
          // same `runs` row must compose, not last-write-wins. The
          // primitive's transaction wrapper serializes the read+write so
          // a concurrent sibling-shard update to a different slot can't
          // be lost (Prerequisite #1 contract for the trace-paths column).
          if (isShardMode) {
            if (!Array.isArray(run.tracePaths)) run.tracePaths = [];
            run.tracePaths[shardIndex] = traceArtifactPath;
            runRepo.setShardTracePath(runId, shardIndex, traceArtifactPath);
          }
        } catch (uploadErr) {
          logWarn(run, `Trace upload failed: ${uploadErr.message}`);
          run.tracePath = traceArtifactPath;
        }
        log(run, `📊 Trace saved`);
      } catch (e) {
        logWarn(run, `Trace save failed: ${e.message}`);
      }
      await traceContext.close().catch(() => {});
    }
    if (browser) {
      await browser.close().catch((err) => {
        console.warn(formatLogLine("warn", null, `[testRunner] browser.close() failed: ${err.message}`));
      });
    }
  }

  if (allVideoSegments.length > 0) {
    run.videoPath = allVideoSegments[0];
    run.videoSegments = allVideoSegments;
    log(run, `  🎬 ${allVideoSegments.length} video segment(s) saved`);
  }

  // CAP-002 Phase 2 — in shard mode, hand off to the worker for finalization.
  // The shard owns its slice's execution + trace flush; the worker composes
  // the parent run's totals via `incrementRunStats`, increments
  // `shardsCompleted`, and the boundary-crossing shard runs the feedback
  // loop + finalize + `done` event exactly once. Returning here avoids:
  //   - Running the feedback loop N times (once per shard) — wasteful AI calls
  //     and N redundant `testRepo.update(... reviewStatus: "draft")` writes.
  //   - N shards racing on `run.status = "completed"` via
  //     `finalizeRunIfNotAborted` — last-shard-wins semantics belong in the
  //     worker, gated on the atomic `incrementShardsCompleted` boundary.
  //   - N `done` SSE events for a single logical run.
  // Returns the shard's stats delta so the worker can compose them onto the
  // parent `runs` row atomically.
  if (isShardMode) {
    const elapsed = ((Date.now() - runStart) / 1000).toFixed(1);
    structuredLog("run.shard_execution_done", {
      runId, shardIndex,
      passed: shardPassed, failed: shardFailed,
      totalDelta: shardTotalDelta, elapsedSec: parseFloat(elapsed),
    });
    return {
      passed: shardPassed,
      failed: shardFailed,
      totalDelta: shardTotalDelta,
      tracePath: run.tracePath || null,
    };
  }

  // AUTO-005: aggregate per-result retry telemetry onto the run record so
  // the dedicated `runs.retryCount` / `runs.failedAfterRetry` columns
  // (migration 011) are populated for run-level analytics queries.
  run.retryCount = run.results.reduce((sum, r) => sum + (r.retryCount || 0), 0);
  run.failedAfterRetry = run.results.filter(r => r.failedAfterRetry).length;

  run.gateResult = evaluateQualityGates(project.qualityGates, run);
  run.webVitalsResult = evaluateWebVitalsBudgets(project.webVitalsBudgets, run);
  run.rootCauses = clusterFailures({ results: run.results });

  // AUTO-017.3: persist per-run Web Vitals samples for MET-001 trend charts.
  // Best-effort only — telemetry failures must never fail the run.
  try {
    const rows = Array.isArray(run.results) ? run.results : [];
    const keys = ["lcp", "cls", "inp", "ttfb"];
    for (const key of keys) {
      const values = rows
        .map((r) => Number(r?.webVitals?.[key]))
        .filter((v) => Number.isFinite(v));
      if (values.length === 0) continue;
      const avg = values.reduce((sum, v) => sum + v, 0) / values.length;
      const threshold = Number(project?.webVitalsBudgets?.[key]);
      recordMetric(project.id, `webVitals.${key}`, avg, { runId, source: "run", metric: key }, runStart);
      if (Number.isFinite(threshold)) {
        recordMetric(project.id, `webVitals.${key}.budget`, threshold, { runId, source: "budget", metric: key }, runStart);
      }
    }
  } catch (metricErr) {
    logWarn(run, `Web Vitals trend metric write failed: ${metricErr.message}`);
  }

  // NOTE: We intentionally keep run.status === "running" here so that:
  //   1. The abort endpoint (POST /api/runs/:id/abort) still works during the
  //      feedback loop — it checks run.status === "running".
  //   2. SSE reconnections don't prematurely close — the /events endpoint sends
  //      an immediate "done" + res.end() when run.status !== "running", which
  //      would cut off the client while the feedback loop is still active.
  // The status is set to "completed" only after the feedback loop finishes.
  const elapsed = ((Date.now() - runStart) / 1000).toFixed(1);
  structuredLog("run.execution_done", { runId, passed: run.passed, failed: run.failed, total: run.total, elapsedSec: parseFloat(elapsed) });
  log(run, `📋 Test execution done: ${run.passed} passed, ${run.failed} failed out of ${run.total} in ${elapsed}s${workers > 1 ? ` (${workers}x parallel)` : ""} — starting post-run analysis…`);

  // Broadcast a final snapshot so the frontend sees the complete pass/fail
  // counts before the feedback loop starts its long-running AI calls.
  // (processResult already emits per-result snapshots, but this ensures the
  // frontend has the final state even if the last result's snapshot was lost.)
  if (!isRunAborted(run, signal)) {
    emitRunEvent(run.id, "snapshot", { run: signRunArtifacts(run) });
  }

  // ── Feedback loop: auto-regenerate high-priority failing tests ──────────
  // Delegated to runner/feedbackIntegration.js — no-ops when no failures,
  // aborted, or no AI provider configured.
  await runFeedbackLoop(run, tests, signal);

  // Now that the feedback loop is done, finalize the run status.
  // This is the single place where status transitions to "completed".
  // Guard the log() call inside the callback so it only fires when the run
  // actually transitions to "completed". After an abort, the SSE "done" event
  // has already been emitted and the stream is closed — logging here would
  // append to run.logs but the SSE broadcast would be silently lost.
  finalizeRunIfNotAborted(run, () => {
    run.finishedAt = new Date().toISOString();
    // CAP-002 — `run.shardsCompleted` is incremented per-shard in
    // processResult as each shard's last test reports. A normal completion
    // should already have shardsCompleted === shardCount; this final
    // reconciliation only matters if a shard had zero tests (possible when
    // `shards > tests.length` clamps to per-shard size 0), which would
    // otherwise leave shardsCompleted < shardCount and surface a stuck
    // "Shards N-1/N" badge on the completed run.
    if ((run.shardsCompleted || 0) < shardCount) run.shardsCompleted = shardCount;
    run.duration = Date.now() - runStart;
    logSuccess(run, `Run complete: ${run.passed} passed, ${run.failed} failed out of ${run.total}`);
    structuredLog("run.complete", {
      runId, projectId: project.id,
      passed: run.passed, failed: run.failed, total: run.total,
      durationMs: run.duration,
    });
    // DIF-013: report run outcome (regression / single-test / scheduled run).
    // `run.started` is emitted at the route layer when the run is enqueued —
    // this is the matching `complete` event for funnel analysis. Retry
    // telemetry (AUTO-005) is included so we can measure flake-isolation
    // impact in aggregate.
    trackTelemetry("run.complete", {
      projectId: project.id,
      browser: resolvedBrowser,
      total: run.total,
      passed: run.passed,
      failed: run.failed,
      retryCount: run.retryCount || 0,
      failedAfterRetry: run.failedAfterRetry || 0,
      parallelWorkers: workers,
      durationMs: run.duration,
      url: project.url,
    });
  });

  // Emit "done" only now — after the feedback loop — so the frontend's
  // fetchRun() always sees the final, stable completed state.
  // Skip if already aborted — the abort endpoint already emitted the done event.
  if (!isRunAborted(run, signal)) {
    emitRunEvent(run.id, "done", { status: run.status, passed: run.passed, failed: run.failed, total: run.total });
  }
}