Source: crawler.js

/**
 * @module crawler
 * @description Autonomous QA pipeline — thin orchestration layer for the 8-stage
 * test generation pipeline.
 *
 * ### Pipeline stages
 * | #  | Stage                | Module                                              |
 * |----|----------------------|-----------------------------------------------------|
 * | 1  | Smart crawl / Explore| `pipeline/crawlBrowser.js` or `pipeline/stateExplorer.js` |
 * |    | ↳ HAR capture        | `pipeline/harCapture.js` (attached to BrowserContext)|
 * | 2  | Element filtering    | `pipeline/elementFilter.js`                         |
 * | 3  | Intent classification| `pipeline/intentClassifier.js`                      |
 * | 4  | Journey generation   | `pipeline/journeyGenerator.js`                      |
 * | 4b | API test generation  | `pipeline/journeyGenerator.js` + `prompts/apiTestPrompt.js` |
 * | 5  | Deduplication        | `pipeline/pipelineOrchestrator.js`                  |
 * | 6  | Assertion enhancement| `pipeline/pipelineOrchestrator.js`                  |
 * | 7  | Validate tests       | `pipeline/pipelineOrchestrator.js`                  |
 * | 8  | Feedback loop        | `pipeline/feedbackLoop.js`                          |
 *
 * ### Explorer modes (Test Dials `exploreMode`)
 * - `crawl` (default) — link-only BFS crawl via `crawlBrowser.js`
 * - `state` — state-based exploration via `stateExplorer.js` that executes
 *   real UI actions (click, fill, submit) and tracks state transitions to
 *   discover multi-step user flows
 *
 * ### Exports
 * - {@link generateFromUserDescription} — Generate test(s) from a user description (skips crawl).
 * - {@link crawlAndGenerateTests} — Full 8-stage pipeline from URL crawl or state exploration.
 */

import { getProviderName } from "./aiProvider.js";
import { throwIfAborted, finalizeRunIfNotAborted } from "./utils/abortHelper.js";
import { trackTelemetry } from "./utils/telemetry.js";
import { filterElements, filterStats } from "./pipeline/elementFilter.js";
import { classifyPageWithAI, buildUserJourneys } from "./pipeline/intentClassifier.js";
import { generateAllTests, generateFromDescription, generateApiTests } from "./pipeline/journeyGenerator.js";
import { crawlPages } from "./pipeline/crawlBrowser.js";
import { exploreStates } from "./pipeline/stateExplorer.js";
import { runPostGenerationPipeline, sanitizeRunInputs } from "./pipeline/pipelineOrchestrator.js";
import { persistGeneratedTests, buildPipelineStats } from "./pipeline/testPersistence.js";
import { emitRunEvent, log, logWarn, logSuccess } from "./utils/runLogger.js";
import { setStep } from "./utils/pipelineState.js";
import { classifyError } from "./utils/errorClassifier.js";
import { structuredLog } from "./utils/logFormatter.js";
import * as runRepo from "./database/repositories/runRepo.js";
import * as crawlBaselineRepo from "./database/repositories/crawlBaselineRepo.js";
import { diffCrawlSnapshots } from "./pipeline/crawlDiff.js";

/**
 * setStep is imported from utils/pipelineState.js — shared with pipelineOrchestrator.js.
 */

/**
 * AUTO-002 / AUTO-002b: shared diff-aware baseline runner. Compares the
 * current crawl's snapshots against the persisted baseline, emits the
 * `pages_changed` run event, and merges the new fingerprints into the
 * baseline table.
 *
 * Two callers, two key-derivation strategies:
 *
 * - **Link crawl** (`mode="crawl"`) keys baselines by snapshot URL — one
 *   row per page. The caller filters `snapshots[]` down to changed pages
 *   so generation only runs on what changed.
 *
 * - **State explorer** (`mode="state"`) keys baselines by a composite
 *   `url#fp=<fingerprint>` — distinct states at the same URL (login form
 *   blank vs login form with errors) are tracked as separate baseline
 *   rows. The caller does **not** filter `snapshots[]` post-diff because
 *   journeys reference unchanged states for context; filtering would
 *   break flow generation. The diff is informational + persistent, but
 *   no-change crawls still short-circuit the generation pipeline.
 *
 * Side effects on `run`: sets `run.changedPages` and `run.removedPages`
 * whenever the diff actually runs, and sets `run.noChangesDetected = true`
 * on the no-change path.
 *
 * @param {object} project - project record (must carry id + canonicalUrl/url)
 * @param {object} run - mutable run record
 * @param {object[]} snapshots - normalised snapshots (with synthetic .url for state mode)
 * @param {string} mode - "crawl" | "state" (only affects log wording here)
 * @param {object} [opts] - forwarded untouched to `diffCrawlSnapshots`
 * @param {function(object): string} [opts.fingerprintOf]
 *   State mode supplies a function that returns a pre-computed fingerprint
 *   so the composite `url#fp=<fp>` key doesn't feed back into
 *   `fingerprintState`'s URL-derived computation (which would make every
 *   state-mode re-crawl look "changed" — the bug AUTO-002b's first round
 *   shipped with).
 * @returns {{noChanges: boolean, changedSet: (Set<string>|null), skipped: boolean, hadExistingBaseline: boolean}}
 *   `skipped=true` when the diff was bypassed (preview crawl or zero snapshots).
 *   `noChanges=true` when there's an existing baseline and nothing changed.
 *   `changedSet` is the set of keys (URLs or composite keys) that changed;
 *   the caller decides whether to filter `snapshots[]` against it.
 *   `hadExistingBaseline` is `true` when the project already had at least
 *   one baseline row before this run. It is always `false` when
 *   `skipped=true`, because the baseline table is not read on those paths.
 */
function runDiffAwareBaseline(project, run, snapshots, mode, opts = {}) {
  const isStateMode = mode === "state";

  // Outcome for every path that bypasses the diff. Baselines are never
  // loaded on these paths, so `hadExistingBaseline` is reported as false.
  const skippedOutcome = () => ({
    noChanges: false,
    changedSet: null,
    skipped: true,
    hadExistingBaseline: false,
  });

  // AUTO-002 / AUTO-015: classify "preview crawl" by comparing the URL we
  // *asked Playwright to load* (`project.url`) against the project's
  // CANONICAL production URL. The AUTO-015 trigger routes overwrite
  // `project.url` with the deployment preview URL while preserving
  // `canonicalUrl`, so a mismatch is the unambiguous signal that this is a
  // preview crawl and baselines must be preserved.
  //
  // We deliberately do NOT consult `snapshots[0].url` here. The first
  // snapshot's URL is post-redirect — production sites routinely redirect
  // their entry URL to a different origin (`example.com` → `www.example.com`,
  // `http://` → `https://`, apex → www, etc.) and the previous code that
  // used `snapshots[0]?.url || project.url` would falsely classify those
  // crawls as "preview" and silently skip baseline updates on every
  // subsequent crawl. Redirects are a property of the site, not a signal
  // about *which* deployment we're hitting.
  const canonicalForOriginCheck = project.canonicalUrl || project.url;
  let sameOrigin;
  try {
    sameOrigin = new URL(project.url).origin === new URL(canonicalForOriginCheck).origin;
  } catch {
    // An unparseable URL takes the conservative branch: treat the crawl as
    // a preview so existing baselines are left alone.
    sameOrigin = false;
  }
  if (!sameOrigin) {
    log(run, `↪️  Preview-deployment crawl detected — skipping baseline diff (preserving production baselines).`);
    return skippedOutcome();
  }

  if (snapshots.length === 0) {
    // Defence-in-depth: a crawl that yielded zero snapshots but got past
    // the caller's unreachable-target check (e.g. auth wall, SPA with no
    // crawlable links, Playwright silent failure) must not wipe the
    // project's baselines. Skip the diff entirely.
    log(run, `⚠️  ${isStateMode ? "State exploration" : "Crawl"} returned zero snapshots — skipping baseline diff to preserve existing fingerprints.`);
    return skippedOutcome();
  }

  const existingBaselines = crawlBaselineRepo.getMapByProjectId(project.id);
  // Computed once; used both for the no-change decision and the return value.
  const hadExistingBaseline = Object.keys(existingBaselines).length > 0;

  const diff = diffCrawlSnapshots(existingBaselines, snapshots, opts);
  const { changedPages, removedPages, unchangedPages } = diff;
  run.changedPages = changedPages;
  run.removedPages = removedPages;
  emitRunEvent(run.id, "pages_changed", { changedPages, removedPages, unchangedPages });

  if (hadExistingBaseline && changedPages.length === 0) {
    // No-change crawl: existing baselines still authoritative for observed
    // pages. Signal short-circuit to the caller via run.noChangesDetected.
    //
    // Edge case: a crawl with zero added/changed pages can still report
    // `removedPages` (a page genuinely went away while the rest stayed
    // identical). We must drop those baseline rows here — otherwise they
    // persist forever and every subsequent crawl re-reports them as
    // `removedPages` indefinitely. The merge call with empty fingerprints
    // is upsert-only, so existing observed-page rows are untouched.
    if (removedPages.length > 0) {
      crawlBaselineRepo.mergeProjectBaselines(project.id, {}, removedPages);
    }
    log(run, `🟰 No ${isStateMode ? "state" : "page"} changes detected against the previous crawl baseline.`);
    run.noChangesDetected = true;
    return { noChanges: true, changedSet: null, skipped: false, hadExistingBaseline: true };
  }

  // Changes detected (or first-ever crawl). Merge upserts observed entries
  // and only deletes URLs the diff explicitly classified as removed —
  // partial-crawl-safe (a transient page failure won't wipe the baseline).
  crawlBaselineRepo.mergeProjectBaselines(project.id, diff.fingerprints, removedPages);
  log(run, `🧬 ${isStateMode ? "State" : "Crawl"} diff: ${changedPages.length} changed/new, ${removedPages.length} removed, ${unchangedPages.length} unchanged.`);
  return {
    noChanges: false,
    changedSet: new Set(changedPages),
    skipped: false,
    hadExistingBaseline,
  };
}

/**
 * Shared Steps 2 & 3: Element filtering + intent classification.
 * Extracted to avoid duplication between the "state" and "crawl" branches.
 *
 * Step 2 runs `filterElements` over every snapshot and records each filtered
 * copy in `snapshotsByUrl` under its URL. Step 3 classifies the filtered
 * snapshots one at a time (each `classifyPageWithAI` call is awaited before
 * the next starts) with an abort check before every call.
 *
 * Every log line labels a page by its URL with `project.url` stripped,
 * falling back to "/" when nothing is left, so the root page is never
 * printed with an empty label.
 *
 * @param {object[]} snapshots       — raw page snapshots from crawl or explore
 * @param {Record<string,object>} snapshotsByUrl — URL → snapshot map (mutated in place)
 * @param {object}  project          — project record (url used for log trimming)
 * @param {object}  run              — mutable run record
 * @param {AbortSignal} [signal]
 * @returns {Promise<{ filteredSnapshots: object[], classifiedPages: object[], classifiedPagesByUrl: Record<string,object> }>}
 *   `classifiedPagesByUrl` is keyed by each classified page's `url`; when two
 *   classified pages share a URL the later one wins.
 */
async function filterAndClassify(snapshots, snapshotsByUrl, project, run, signal) {
  // Log-friendly page label shared by all three log sites below.
  const pageLabel = (url) => url.replace(project.url, "") || "/";

  // ── Step 2: Element filtering ───────────────────────────────────────────
  setStep(run, 2);
  structuredLog("pipeline.filter", { runId: run.id, pages: snapshots.length });
  log(run, `🔍 Filtering elements (removing noise)...`);
  const filteredSnapshots = snapshots.map((snap) => {
    const keptElements = filterElements(snap.elements);
    log(run, `   ${pageLabel(snap.url)}: ${filterStats(snap.elements, keptElements)}`);
    return { ...snap, elements: keptElements };
  });
  // Populate the lookup only after every snapshot filtered successfully, so
  // a throw part-way through leaves the caller's map untouched.
  for (const snap of filteredSnapshots) {
    snapshotsByUrl[snap.url] = snap;
  }

  throwIfAborted(signal);

  // ── Step 3: Intent classification ───────────────────────────────────────
  setStep(run, 3);
  structuredLog("pipeline.classify", { runId: run.id, pages: filteredSnapshots.length });
  log(run, `🧠 Classifying page intents...`);
  const classifiedPages = [];
  for (const snap of filteredSnapshots) {
    throwIfAborted(signal);
    const classified = await classifyPageWithAI(snap, snap.elements, { signal });
    if (classified._aiAssisted) {
      log(run, `   🤖 AI classified ${pageLabel(snap.url)} as ${classified.dominantIntent}`);
    }
    classifiedPages.push(classified);
  }

  const classifiedPagesByUrl = {};
  for (const page of classifiedPages) {
    classifiedPagesByUrl[page.url] = page;
    log(run, `   ${page.dominantIntent.padEnd(16)} ${pageLabel(page.url)}`);
  }

  return { filteredSnapshots, classifiedPages, classifiedPagesByUrl };
}

/**
 * generateFromUserDescription — produces test(s) directly from a
 * user-supplied name + description, without crawling the target site.
 *
 * The crawl pipeline discovers pages on its own; here the user states the
 * intent, so Steps 1-3 are only marked as skipped and the run goes straight
 * to AI generation. How many tests come back is steered by the `testCount`
 * dial, which defaults to `"ai_decides"`.
 *
 * Pipeline:
 *   Step 1-3: SKIPPED (Crawl, Filter, Classify — user provides intent directly)
 *   Step 4: Generate     — AI generates test(s) from name + description
 *   Step 5: Deduplicate  — Check against existing project tests
 *   Step 6: Enhance      — Strengthen assertions
 *   Step 7: Validate     — Reject malformed / placeholder tests
 *   Step 8: Done
 *
 * Steps 5-7 are delegated to `runPostGenerationPipeline`. The run is
 * finalised through `finalizeRunIfNotAborted`, which receives the callback
 * that stamps `finishedAt` / `duration` and emits the `done` event.
 *
 * @param {object} project - project record; `id`, `url` and `name` are read here.
 * @param {object} run - mutable run record; `testsGenerated`, `pipelineStats`,
 *   `finishedAt` and `duration` are written here. `run.tests` is read after
 *   persistence (presumably populated by `persistGeneratedTests` — confirm
 *   against testPersistence.js).
 * @param {object} options
 * @param {string} options.name - requirement title.
 * @param {string} [options.description] - requirement text; only the first
 *   100 characters are echoed to the run log.
 * @param {string} [options.dialsPrompt=""] - forwarded to the generator.
 * @param {string} [options.testCount="ai_decides"] - forwarded to the generator.
 * @param {AbortSignal} [options.signal] - abort signal for cancellation.
 * @returns {Promise<*>} whatever `persistGeneratedTests` returned (held in
 *   `createdTestIds`, so presumably the IDs of the stored tests).
 */
export async function generateFromUserDescription(project, run, { name, description, dialsPrompt = "", testCount = "ai_decides", signal }) {
  const startedAtMs = Date.now();

  structuredLog("generate.start", { runId: run.id, projectId: project.id, mode: "description", name });
  // DIF-013: anonymous opt-out telemetry — coarse-grained event with PII
  // stripped (URL → domain via sanitizeProps).
  trackTelemetry("generate.start", {
    projectId: project.id,
    provider: getProviderName(),
    testCount,
    descriptionLength: (description || "").length,
    url: project.url,
  });

  // Run banner: what is about to happen and with which settings.
  log(run, `✦ Starting test generation from requirement for "${name}"`);
  log(run, `🤖 AI provider: ${getProviderName()}`);
  const configLines = [
    `⚙️ Run config:`,
    `Generation mode: 📝 From requirement (no crawl)`,
    `Explorer mode: ⏭️ None (crawl skipped — generating from requirement)`,
    `Test count: ${testCount}`,
    `HAR capture: ❌ disabled (no crawl)`,
    `API tests: ✅ auto-detected from description (mention endpoints, HTTP methods, /api/ paths)`,
    `Target URL: ${project.url}`,
  ];
  for (const line of configLines) {
    log(run, line);
  }

  // Steps 1-3 are not executed — the user already supplied the intent — but
  // each one is still stepped through so progress tracking stays in order.
  const skippedSteps = [
    [1, `⏭️  Step 1 (Crawl) — skipped (user-provided title & description)`],
    [2, `⏭️  Step 2 (Filter) — skipped`],
    [3, `⏭️  Step 3 (Classify) — skipped (user already described the intent)`],
  ];
  for (const [stepNumber, note] of skippedSteps) {
    setStep(run, stepNumber);
    log(run, note);
  }

  // ── Step 4: Generate focused test(s) via AI ─────────────────────────────
  throwIfAborted(signal);
  setStep(run, 4);

  // The keyword scan only decides the wording of the log line below.
  const apiIntentPattern = /\bAPI\b|\bREST\b|\bGraphQL\b|\bendpoint|\b(GET|POST|PUT|PATCH|DELETE)\s+\/|\bstatus\s*code|\brequest\s*body|\bresponse\s*(body|shape|schema)|\bjson\s*(response|payload|body)|\bcontract\s*test|\/api\//i;
  const hasApiIntent = apiIntentPattern.test(`${name} ${description}`);
  let generationSuffix;
  if (hasApiIntent) {
    generationSuffix = "s (🌐 API intent detected → using API test prompt)";
  } else {
    generationSuffix = " from user description";
  }
  log(run, `🤖 Generating test${generationSuffix}...`);
  log(run, `Name: "${name}"`);
  if (description) {
    // Echo at most 100 characters, marking truncation with an ellipsis.
    const preview = description.slice(0, 100);
    const ellipsis = description.length > 100 ? "…" : "";
    log(run, `Description: "${preview}${ellipsis}"`);
  }

  // Stream each LLM token to run subscribers as it arrives.
  const forwardToken = (token) => {
    emitRunEvent(run.id, "llm_token", { token });
  };
  const rawTests = await generateFromDescription(
    name,
    description,
    project.url,
    forwardToken,
    { dialsPrompt, testCount, signal },
  );
  log(run, `📝 Raw tests generated: ${rawTests.length}`);

  // ── Steps 5-7: Dedup → Enhance → Validate (shared pipeline) ────────────
  const postGeneration = await runPostGenerationPipeline(rawTests, project, run, { signal });
  const { validatedTests, enhancedTests, rejected, removed, enhancedCount, dedupStats } = postGeneration;

  // ── Step 8: Store & Done ────────────────────────────────────────────────
  const persistenceMeta = {
    name,
    description,
    sourceUrl: project.url,
    pageTitle: project.name,
  };
  const createdTestIds = persistGeneratedTests(validatedTests, project, run, persistenceMeta);

  run.testsGenerated = run.tests.length;
  run.pipelineStats = buildPipelineStats({ rawTests, removed, enhancedCount, rejected, dedupStats });

  // Handed to finalizeRunIfNotAborted, which decides whether to invoke it.
  const completeRun = () => {
    run.finishedAt = new Date().toISOString();
    run.duration = Date.now() - startedAtMs;
    setStep(run, 8);
    log(run, `\n📊 Pipeline Summary:`);
    log(run, `Raw: ${rawTests.length} | Enhanced: ${enhancedTests.length} | Validated: ${validatedTests.length} | Rejected: ${rejected}`);
    logSuccess(run, `Done! ${run.tests.length} test(s) generated from description for "${name}".`);
    structuredLog("generate.complete", { runId: run.id, projectId: project.id, tests: run.tests.length, durationMs: run.duration });
    // DIF-013: report generation outcome (count + rejection rate proxy).
    trackTelemetry("generate.complete", {
      projectId: project.id,
      provider: getProviderName(),
      testsGenerated: run.tests.length,
      rejected,
      durationMs: run.duration,
      url: project.url,
    });
    emitRunEvent(run.id, "done", { status: "completed", testsGenerated: run.tests.length });
  };
  finalizeRunIfNotAborted(run, completeRun);

  return createdTestIds;
}

/**
 * Full 8-stage pipeline: crawl a project URL, classify pages, generate tests,
 * deduplicate, enhance, validate, and persist.
 *
 * @param {Object} project                - The project `{ id, name, url, credentials? }`.
 * @param {Object} run                    - The run record (mutated in place with results).
 * @param {Object} [options]
 * @param {string} [options.dialsPrompt]   - Pre-built prompt fragment from Test Dials config.
 * @param {string} [options.testCount]     - Test count hint (`"one"` | `"small"` | `"medium"` | `"large"` | `"ai_decides"`).
 * @param {string} [options.explorerMode]   - `"crawl"` (default) or `"state"` — from Test Dials.
 * @param {Object} [options.explorerTuning] - Numeric tuning for state explorer `{ maxStates, maxDepth, maxActions, actionTimeout }`.
 * @param {AbortSignal} [options.signal]   - Abort signal for cancellation.
 * @returns {Promise<void>}
 */
export async function crawlAndGenerateTests(project, run, { dialsPrompt = "", testCount = "ai_decides", explorerMode, explorerTuning, signal } = {}) {
  const runStart = Date.now();
  const mode = (explorerMode || "crawl").toLowerCase();

  // ── Step 1: Smart crawl or state exploration ─────────────────────────────
  structuredLog("crawl.start", { runId: run.id, projectId: project.id, mode, url: project.url });
  // DIF-013: report crawl/state-explore launch. URL is stripped to domain
  // by sanitizeProps before sending — no full URLs leave the host.
  trackTelemetry("crawl.start", {
    projectId: project.id,
    mode,
    provider: getProviderName(),
    testCount,
    url: project.url,
  });
  log(run, `🕷️  Starting ${mode === "state" ? "state exploration" : "smart crawl"} of ${project.url}`);
  log(run, `🤖 AI provider: ${getProviderName()}`);
  log(run, `⚙️ Run config:`);
  log(run, `Explorer mode: ${mode === "state" ? "🔍 State exploration (click/fill/submit)" : "🔗 Link crawl (follow <a> tags)"}`);
  if (mode === "state" && explorerTuning) {
    log(run, `Max states: ${explorerTuning.maxStates ?? 30}`);
    log(run, `Max depth: ${explorerTuning.maxDepth ?? 3}`);
    log(run, `Max actions: ${explorerTuning.maxActions ?? 8}`);
    log(run, `Action timeout: ${explorerTuning.actionTimeout ?? 5000}ms`);
  }
  log(run, `Test count: ${testCount}`);
  log(run, `HAR capture: ✅ enabled (API traffic → API test generation)`);
  log(run, `Target URL: ${project.url}`);
  setStep(run, 1);

  let snapshots, snapshotsByUrl, journeys, classifiedPages, classifiedPagesByUrl, filteredSnapshots;
  let apiEndpoints = [];
  // AUTO-002: track the total pages the crawl actually discovered (before
  // diff-aware filtering reduces `snapshots` to just the changed subset).
  // Reported to the user as "pages found" / telemetry — otherwise a crawl
  // that discovered 10 pages with 3 changed would misleadingly report
  // `pagesFound: 3`, skewing both the UI and the `crawl.complete`
  // telemetry funnel which measures crawl quality.
  let pagesCrawled = 0;

  if (mode === "state") {
    // ── State-based exploration (new engine) ─────────────────────────────
    //
    // AUTO-002b: state-explorer mode is now diff-aware via composite keys.
    // The state explorer produces multiple snapshots per URL (login form
    // blank vs login form with errors), so we key the baseline by the
    // composite `url#fp=<fingerprint>` instead of plain URL — this lets
    // distinct states at the same URL be tracked as separate baseline
    // rows. The caller does NOT filter `snapshots[]` post-diff because
    // journeys reference unchanged states for flow context; we run the
    // full state set through generation but short-circuit when nothing
    // changed against the baseline (no-change crawl → `completed_empty`).
    const exploration = await exploreStates(project, run, { signal, tuning: explorerTuning });
    snapshots = exploration.snapshots;
    snapshotsByUrl = exploration.snapshotsByUrl;
    apiEndpoints = exploration.apiEndpoints || [];
    pagesCrawled = snapshots.length;

    throwIfAborted(signal);

    // AUTO-002b: diff-aware baseline for state mode.
    //
    // We synthesise a composite key per state (`originalUrl#fp=<fp>`) so
    // distinct states at the same URL (login blank vs login with errors)
    // track as separate baseline rows. But we must NOT let the diff
    // helper re-derive fingerprints from the composite-keyed snapshots —
    // `fingerprintState()` includes `snap.url` in its hash, so feeding
    // it a `url#fp=<fp>` URL would produce a different fingerprint than
    // the one originally computed (and stored as the suffix of the
    // composite key). Every re-crawl would then look "changed".
    //
    // Instead, we extract the pre-computed fingerprint directly from the
    // composite-key suffix and pass it through `fingerprintOf`. The
    // baseline stores it; the next run's diff compares apples to apples.
    if (snapshots.length > 0) {
      // Build an O(n) reverse lookup (snapshot → fingerprint) once, rather
      // than scanning `fpMap.entries()` per snapshot (O(n²)) AND — more
      // importantly — avoiding the fragility of relying on object identity
      // via `===`. stateExplorer.js:215-216 currently stores the SAME
      // snapshot reference in `snapshotsByFp` and `ctx.snapshots`, but if
      // any future refactor ever clones snapshots between those two stores,
      // identity comparison silently collapses all states into one baseline
      // row (defeating AUTO-002b's composite-key design). A WeakMap keyed
      // on the snapshot object is the same one-liner but makes the identity
      // dependency explicit; callers that produce a fresh snapshot simply
      // fall through to the `snap.url` fallback on line below.
      const fpMap = exploration.stateGraph?.snapshotsByFp;
      const snapshotToFp = new WeakMap();
      if (fpMap) {
        for (const [fp, s] of fpMap.entries()) snapshotToFp.set(s, fp);
      }
      const stateKeyed = snapshots.map((snap) => {
        const fp = snapshotToFp.get(snap) || null;
        return fp ? { ...snap, url: `${snap.url}#fp=${fp}`, _stateFp: fp } : snap;
      });
      const stateDiff = runDiffAwareBaseline(project, run, stateKeyed, "state", {
        // Pull the pre-computed fingerprint off the snapshot rather than
        // recomputing — the composite-key URL would otherwise feed back
        // into `fingerprintState` and falsely flip every state to changed.
        fingerprintOf: (snap) => snap._stateFp || snap.url,
      });
      if (stateDiff.noChanges) {
        // Short-circuit: nothing changed, skip generation entirely.
        snapshots = [];
        snapshotsByUrl = {};
      }
      // else: keep all snapshots — generation needs the full state set
      // for journey/flow context; the diff has been persisted + emitted.
    }

    // ── No-change short-circuit: skip filter/classify/journey/generation ─
    // When the state-mode diff reported zero changes, `snapshots` is empty
    // above. We must also zero out `journeys` and skip the supplementary
    // link-journey discovery — otherwise `exploration.journeys` (still
    // populated from the in-memory explorer run) would feed `generateAllTests`
    // and produce LLM calls + tests on a "no changes" run, which the run
    // is supposed to short-circuit to `completed_empty`.
    if (run.noChangesDetected) {
      filteredSnapshots = [];
      classifiedPages = [];
      classifiedPagesByUrl = {};
      journeys = [];
    } else {
      // ── Steps 2 & 3: shared filter + classify ─────────────────────────────
      ({ filteredSnapshots, classifiedPages, classifiedPagesByUrl } =
        await filterAndClassify(snapshots, snapshotsByUrl, project, run, signal));

      // Enrich snapshotsByUrl with fingerprint-keyed entries so that downstream
      // code (journeyPrompt.js) can look up per-state snapshots when a journey
      // page carries _stateFingerprint. Without this, multiple states at the
      // same URL (e.g. login form blank vs with errors) would all resolve to
      // the last-captured snapshot for that URL.
      const fpMap = exploration.stateGraph.snapshotsByFp;
      for (const [fp, snap] of fpMap) {
        snapshotsByUrl[fp] = snap;
      }

      // Use observed flows from the state explorer as journeys
      journeys = exploration.journeys;
      if (journeys.length > 0) {
        log(run, `🗺️  Discovered ${journeys.length} observed flow(s):`);
        for (const j of journeys) {
          const via = j._discoveredBy ? ` [${j._discoveredBy}]` : "";
          log(run, `   • ${j.name} (${j.pages.length} pages)${via}`);
        }
      }

      // Also discover link-graph journeys from classified pages as a supplement
      const linkJourneys = buildUserJourneys(classifiedPages, snapshotsByUrl);
      const explorerUrls = new Set(journeys.flatMap(j => j.pages.map(p => p.url)));
      for (const lj of linkJourneys) {
        // Only add link-graph journeys that cover pages not already in observed flows
        if (!lj.pages.some(p => explorerUrls.has(p.url))) {
          journeys.push(lj);
        }
      }
    }
  } else {
    // ── Legacy link-based crawl ──────────────────────────────────────────
    const crawlResult = await crawlPages(project, run, { signal });
    snapshots = crawlResult.snapshots;
    snapshotsByUrl = crawlResult.snapshotsByUrl;
    apiEndpoints = crawlResult.apiEndpoints || [];
    pagesCrawled = snapshots.length;

    // ── Early failure: unreachable target ────────────────────────────────
    // If the crawl produced zero pages AND every navigation attempt failed
    // with a network-class error (DNS, connection refused, TLS, timeout),
    // throw a navigation error so the run is classified `failed` with a
    // clear DNS/network reason — instead of silently completing as
    // "completed_empty" after the Filter/Classify/Generate stages run on
    // an empty snapshot list.
    //
    // NOTE: This check uses the raw crawl result (before diff filtering) and
    // runs BEFORE baselines are replaced — so a transient network failure
    // does not wipe the project's baseline fingerprints.
    const failures = crawlResult.navigationFailures || [];
    if (snapshots.length === 0 && failures.length > 0) {
      const networkFailures = failures.filter(f =>
        f.category === "dns" || f.category === "network" || f.category === "timeout"
      );
      if (networkFailures.length === failures.length) {
        const primary = networkFailures[0];
        const isDns = networkFailures.some(f => f.category === "dns");
        logWarn(run, isDns
          ? `Crawl aborted: DNS resolution failed for ${project.url} (${primary.message})`
          : `Crawl aborted: target URL unreachable — ${primary.message}`);
        structuredLog("crawl.unreachable", {
          runId: run.id, projectId: project.id, url: project.url,
          category: primary.category, message: primary.message,
        });
        // Throw with a message that contains "net::err_" / DNS markers so
        // classifyError() routes it to the NAVIGATION category (and the DNS
        // branch added in this change produces the DNS-specific hint).
        throw new Error(isDns
          ? `Target host could not be resolved (DNS). "${project.url}" is not reachable — ${primary.message}`
          : `Target URL is unreachable — ${primary.message}`
        );
      }
    }

    // ── Diff-aware crawl baseline (AUTO-002) ──────────────────────────────
    // Runs after the unreachable-target check so that transient network
    // failures cannot wipe existing baselines. The shared helper handles
    // canonical-URL origin checking (AUTO-015 preview-crawl preservation),
    // zero-snapshot defence, no-change short-circuit, and partial-crawl-safe
    // baseline merging. See `runDiffAwareBaseline` JSDoc for details.
    const diffOutcome = runDiffAwareBaseline(project, run, snapshots, "crawl");
    if (diffOutcome.noChanges) {
      // No-change crawl → short-circuit generation. The finalize block
      // checks `run.noChangesDetected` to render the correct
      // `completed_empty` message ("no changes" vs "AI returned empty").
      snapshots = [];
      snapshotsByUrl = {};
    } else if (!diffOutcome.skipped && diffOutcome.hadExistingBaseline && diffOutcome.changedSet) {
      // Diff-aware generation scope: filter to changed pages only.
      // First-ever crawl (no existing baseline) skips this filter so every
      // page flows through generation, matching pre-AUTO-002 behaviour.
      const changedSet = diffOutcome.changedSet;
      snapshots = snapshots.filter((snap) => changedSet.has(snap.url));
      snapshotsByUrl = Object.fromEntries(
        Object.entries(snapshotsByUrl).filter(([url]) => changedSet.has(url))
      );
      log(run, `🎯 Diff-aware generation scope: ${snapshots.length} changed page(s).`);
    }

    throwIfAborted(signal);

    // ── No-change short-circuit: skip filter/classify/journey/generation ─
    // Mirrors the state-mode branch above (see `if (run.noChangesDetected)`
    // around line 442). When the diff reported zero changes, `snapshots` is
    // empty — running filter/classify/journey detection on empty inputs is
    // wasted work AND `journey` ends up undefined which `generateAllTests`
    // happily processes (no LLM cost, but `journeys.length` access at the
    // structuredLog call later would crash). Short-circuit to empty arrays
    // and let the finalize block route to `completed_empty` with the
    // "no page changes since baseline" message.
    if (run.noChangesDetected) {
      filteredSnapshots = [];
      classifiedPages = [];
      classifiedPagesByUrl = {};
      journeys = [];
    } else {
      // ── Steps 2 & 3: shared filter + classify ─────────────────────────────
      ({ filteredSnapshots, classifiedPages, classifiedPagesByUrl } =
        await filterAndClassify(snapshots, snapshotsByUrl, project, run, signal));

      // Journey detection — pass snapshotsByUrl so link-graph analysis can discover
      // cross-intent journeys (e.g. pricing → signup → dashboard)
      journeys = buildUserJourneys(classifiedPages, snapshotsByUrl);
      if (journeys.length > 0) {
        log(run, `🗺️  Detected ${journeys.length} user journey(s):`);
        for (const j of journeys) {
          const via = j._discoveredBy ? ` [${j._discoveredBy}]` : "";
          log(run, `   • ${j.name} (${j.pages.length} pages)${via}`);
        }
      }
    }
  }

  throwIfAborted(signal);

  // SEC-006: PII firewall — sanitize snapshots + classified pages before
  // they reach `generateAllTests` (which builds the LLM prompt). Wiring
  // lives in `pipelineOrchestrator.sanitizeRunInputs` so any future caller
  // of the generation pipeline gets the same firewall behaviour from one
  // place. Honours `project.strictPiiFirewall` (default ON) and
  // `project.piiAllowlist`.
  const { snapshotsByUrl: effectiveSnapshotsByUrl, classifiedPages: effectiveClassifiedPages } =
    sanitizeRunInputs(project, run, { snapshotsByUrl, classifiedPages });

  // ── Step 4: AI test generation ──────────────────────────────────────────
  setStep(run, 4);
  structuredLog("pipeline.generate", { runId: run.id, pages: effectiveClassifiedPages.length, journeys: journeys.length });
  log(run, `🤖 Generating intent-driven tests...`);
  const genResult = await generateAllTests(effectiveClassifiedPages, journeys, effectiveSnapshotsByUrl, (msg) => log(run, msg), { dialsPrompt, testCount, signal });
  const rawTests = genResult.tests;
  log(run, `📝 Raw UI tests: ${rawTests.length}`);

  // Surface rate limit errors so the frontend shows a clear warning
  if (genResult.rateLimitHit) {
    const errMsg = genResult.rateLimitError || "AI provider rate limit exceeded";
    logWarn(run, `AI RATE LIMIT: ${errMsg}`);
    logWarn(run, `Tests generated before limit: ${rawTests.length}. Switch to a different AI provider in Settings, or wait and retry.`);
    run.rateLimitError = errMsg;
  }

  // ── Step 4b: API test generation from captured HAR traffic ──────────────
  if (apiEndpoints.length === 0) {
    log(run, `🌐 No API endpoints captured — site made no fetch/XHR calls during ${mode === "state" ? "exploration" : "crawl"}. API test generation skipped.`);
    log(run, `💡 Tip: Use "State exploration" mode to trigger API calls via button clicks and form submissions.`);
  }
  // Skip API test generation for trivial traffic — sites like google.com
  // emit a few GET requests for assets/telemetry that don't produce useful
  // API contract tests. Only invest an LLM call when there are enough
  // endpoints (≥4) or at least one non-GET request. Any method other than
  // GET is treated as a mutation here (typically POST/PUT/PATCH/DELETE).
  const hasMutationEndpoints = apiEndpoints.some(ep => ep.method !== "GET");
  const skipApiTests = apiEndpoints.length > 0 && apiEndpoints.length < 4 && !hasMutationEndpoints;
  if (skipApiTests) {
    log(run, `🌐 Only ${apiEndpoints.length} trivial GET endpoint(s) captured — skipping API test generation (need ≥4 endpoints or a mutation)`);
  }
  if (apiEndpoints.length > 0 && !skipApiTests && !genResult.rateLimitHit) {
    throwIfAborted(signal);
    log(run, `🌐 Generating API tests from ${apiEndpoints.length} discovered endpoints...`);
    try {
      const apiTests = await generateApiTests(apiEndpoints, project.url, { dialsPrompt, testCount: "small", signal });
      if (apiTests.length > 0) {
        for (const t of apiTests) rawTests.push(t);
        log(run, `📝 API tests generated: ${apiTests.length} (total raw: ${rawTests.length})`);
      } else {
        log(run, `No API tests generated (AI returned empty)`);
      }
    } catch (err) {
      if (err.name === "AbortError" || signal?.aborted) throw err;
      const classified = classifyError(err, "crawl");
      logWarn(run, `API test generation failed: ${classified.message}`);
    }
  }

  throwIfAborted(signal);

  // ── Steps 5-7: Dedup → Enhance → Validate (shared pipeline) ────────────
  const { validatedTests, enhancedTests, rejected, removed, enhancedCount, dedupStats } =
    await runPostGenerationPipeline(rawTests, project, run, { snapshotsByUrl: effectiveSnapshotsByUrl, classifiedPagesByUrl, signal });

  // ── Step 8: Store & Done ────────────────────────────────────────────────
  persistGeneratedTests(validatedTests, project, run);

  run.snapshots = filteredSnapshots;
  run.pages = filteredSnapshots.map(s => ({ url: s.url, title: s.title || s.url, status: "crawled" }));
  run.testsGenerated = run.tests.length;
  run.pipelineStats = buildPipelineStats({
    // pagesCrawled = total pages the crawl discovered, before AUTO-002 diff
    // filtering. `snapshots.length` at this point has been narrowed to just
    // the changed pages; reporting that as "pagesFound" would understate
    // crawl breadth and break the telemetry funnel that distinguishes
    // "small site" from "big site with few changes".
    pagesFound: pagesCrawled, rawTests, removed, enhancedCount, rejected, journeys, dedupStats,
    apiEndpointsDiscovered: apiEndpoints.length,
  });

  finalizeRunIfNotAborted(run, () => {
    run.finishedAt = new Date().toISOString();
    run.duration = Date.now() - runStart;
    setStep(run, 8);
    log(run, `\n📊 Pipeline Summary:`);
    // Show both the crawl breadth AND the diff-aware generation scope when
    // they differ, so reviewers can distinguish "big site with few changes"
    // from "small site, everything generated". `pagesCrawled` is the full
    // count, `snapshots.length` is the filtered subset that drove generation.
    const scopeSuffix = pagesCrawled !== snapshots.length ? ` (${snapshots.length} changed → generated)` : "";
    log(run, `Pages: ${pagesCrawled}${scopeSuffix} | Raw tests: ${rawTests.length} | Enhanced: ${enhancedTests.length} | Validated: ${validatedTests.length}`);
    log(run, `Journey tests: ${validatedTests.filter(t => t.isJourneyTest).length} | API tests: ${validatedTests.filter(t => t._generatedFrom === "api_har_capture" || t._generatedFrom === "api_user_described").length} | Rejected: ${rejected} | Avg quality: ${dedupStats.averageQuality}/100`);
    if (apiEndpoints.length > 0) {
      log(run, `API endpoints discovered: ${apiEndpoints.length}`);
    }

    // ── ENH-034: Distinguish empty crawl results ──────────────────────────
    // When a crawl completes but generates zero tests (site behind auth,
    // SPA with no crawlable links, AI returned empty), mark as
    // "completed_empty" so the UI can show a warning instead of green success.
    if (run.tests.length === 0) {
      run.status = "completed_empty";
      if (run.noChangesDetected) {
        log(run, `✅ Crawl completed — no page changes since the last baseline; generation skipped.`);
      } else {
      logWarn(run, `Crawl completed but no tests were generated.`);
      logWarn(run, `Possible causes:`);
      logWarn(run, `  1. AI provider is temporarily overloaded (503) — wait 5-10 min and Re-run, or configure multi-provider fallback in Settings`);
      logWarn(run, `  2. Site requires authentication — add credentials in Project Settings`);
      logWarn(run, `  3. Pages have no interactive elements — try a different start URL`);
      logWarn(run, `  4. AI provider returned empty — check your API key in Settings`);
      logWarn(run, `  5. Try "State exploration" mode to discover dynamic content`);
      }
    } else if (run.rateLimitError) {
      logWarn(run, `Completed with rate limit — only ${run.tests.length} test(s) generated. Switch AI provider or retry later.`);
    } else {
      logSuccess(run, `Done! ${run.tests.length} high-quality tests generated.`);
    }
    structuredLog("crawl.complete", {
      runId: run.id, projectId: project.id, mode,
      // `pages` = full crawl breadth; `pagesGenerated` = diff-filtered
      // subset that actually drove generation. Splitting these lets
      // telemetry funnels measure both crawl cost and generation scope.
      pages: pagesCrawled, pagesGenerated: snapshots.length,
      tests: run.tests.length, durationMs: run.duration,
      apiEndpoints: apiEndpoints.length,
    });
    // DIF-013: report crawl outcome with the same shape as crawl.start so
    // PostHog funnels (start → complete) line up. `status` distinguishes
    // success from `completed_empty` so we can measure crawl quality.
    trackTelemetry("crawl.complete", {
      projectId: project.id,
      mode,
      status: run.status,
      // pages = full crawl breadth (matches crawl.start funnel shape);
      // pagesGenerated = diff-filtered subset that drove generation.
      pages: pagesCrawled,
      pagesGenerated: snapshots.length,
      testsGenerated: run.tests.length,
      apiEndpoints: apiEndpoints.length,
      rateLimitHit: !!run.rateLimitError,
      durationMs: run.duration,
      url: project.url,
    });
    emitRunEvent(run.id, "done", { status: run.status, testsGenerated: run.tests.length });
  });
}