Source: pipeline/crawlBrowser.js

/**
 * @module pipeline/crawlBrowser
 * @description Playwright browser crawl loop. Launches Chromium, optionally
 * logs in, crawls same-origin pages via SmartCrawlQueue, and captures DOM snapshots.
 *
 * ### Exports
 * - {@link crawlPages} — `(project, run, { signal }) → { snapshots, snapshotsByUrl }`
 */

import { throwIfAborted } from "../utils/abortHelper.js";
import { SmartCrawlQueue, fingerprintStructure, extractPathPattern, stripNoiseParams } from "./smartCrawl.js";
import { takeSnapshot } from "./pageSnapshot.js";
import { log, logWarn, logSuccess, emitRunEvent } from "../utils/runLogger.js";
import * as runRepo from "../database/repositories/runRepo.js";
import { signRunArtifacts } from "../middleware/appSetup.js";
import { decryptCredentials } from "../utils/credentialEncryption.js";
import { createHarCapture, summariseApiEndpoints } from "./harCapture.js";
import { launchBrowser } from "../runner/config.js";
import { loadRobotsRules, isAllowed, loadSitemapUrls } from "../utils/robotsSitemap.js";

// Crawl budget caps, overridable via environment variables. A missing or
// non-numeric value parses to NaN, so `|| default` supplies the fallback.
const MAX_PAGES = Number.parseInt(process.env.CRAWL_MAX_PAGES, 10) || 30;
const MAX_DEPTH = Number.parseInt(process.env.CRAWL_MAX_DEPTH, 10) || 3;

/**
 * Check whether two URLs share the same effective origin, i.e. identical
 * protocol, port, and hostname after stripping a leading "www." prefix
 * (so www.example.com and example.com compare equal — matches stateExplorer.js).
 * @param {string} urlA
 * @param {string} urlB
 * @returns {boolean} true when the effective origins match; false on any parse error
 */
function isSameEffectiveOrigin(urlA, urlB) {
  let left;
  let right;
  try {
    left = new URL(urlA);
    right = new URL(urlB);
  } catch {
    // Unparseable input can never match anything.
    return false;
  }
  if (left.protocol !== right.protocol) return false;
  if (left.port !== right.port) return false;
  const bareHost = (host) => host.toLowerCase().replace(/^www\./, "");
  return bareHost(left.hostname) === bareHost(right.hostname);
}

/**
 * Crawl same-origin pages starting from project.url.
 *
 * Flow: launch browser → optional credential login → probe navigation to
 * resolve the post-redirect origin → attach HAR capture → load robots.txt /
 * sitemap.xml → BFS crawl bounded by MAX_PAGES / MAX_DEPTH, de-duplicating
 * pages by path pattern and by DOM-structure fingerprint.
 *
 * @param {object} project        — project record (url, credentials)
 * @param {object} run            — mutable run record (logs, pagesFound, pages)
 * @param {object} opts
 * @param {AbortSignal} [opts.signal] — checked between pages; aborts the crawl loop
 * @returns {Promise<{ snapshots: object[], snapshotsByUrl: Record<string, object>, apiEndpoints: object[], navigationFailures: Array<{url:string, message:string, category:string}> }>}
 */
export async function crawlPages(project, run, { signal } = {}) {
  const browser = await launchBrowser();

  const snapshots = [];
  const snapshotsByUrl = {};
  /** @type {Array<{url:string, message:string, category:string}>} */
  const navigationFailures = [];
  let harCapture = null;

  try {
    const context = await browser.newContext({ userAgent: "Mozilla/5.0 (compatible; Sentri/1.0)" });

    const crawlQueue = new SmartCrawlQueue(project.url);
    crawlQueue.enqueue(project.url, 0);

    const pathPatternsSeen = new Set();

    // ── Optional login ──────────────────────────────────────────────────────
    const creds = decryptCredentials(project.credentials);
    if (creds?.usernameSelector) {
      const loginPage = await context.newPage();
      try {
        await loginPage.goto(project.url, { timeout: 15000 });
        await loginPage.fill(creds.usernameSelector, creds.username);
        await loginPage.fill(creds.passwordSelector, creds.password);
        await loginPage.click(creds.submitSelector);
        await loginPage.waitForLoadState("networkidle", { timeout: 10000 }).catch(() => {});
        log(run, `🔑 Logged in as ${creds.username}`);
      } catch (e) {
        // Login is best-effort: an unauthenticated crawl is still useful.
        logWarn(run, `Login failed: ${e.message}`);
      } finally {
        await loginPage.close().catch(() => {});
      }
    }

    // ── Resolve actual origin after redirects ────────────────────────────────
    // Navigate once to discover the real origin (e.g. http → https, www →
    // non-www) BEFORE attaching HAR capture. Without this, createHarCapture
    // filters by the user-entered origin which may differ from the resolved
    // one, causing all API traffic to be silently dropped.
    const probePage = await context.newPage();
    let resolvedOrigin = project.url;
    try {
      await probePage.goto(project.url, { waitUntil: "domcontentloaded", timeout: 15000 });
      resolvedOrigin = probePage.url();
      if (resolvedOrigin !== project.url) {
        log(run, `🔀 Redirected: ${project.url} → ${resolvedOrigin}`);
      }
    } catch (err) {
      // Surface probe-navigation failures so the caller can classify fully
      // unreachable targets (e.g. DNS failures) as failed runs instead of
      // "completed empty". The existing BFS loop still records per-URL
      // failures, but the probe failure matters when the target's root page
      // is the first thing that breaks.
      const failMsg = err?.message || String(err);
      navigationFailures.push({
        url: project.url,
        message: failMsg,
        category: categoriseNavigationError(failMsg),
      });
    }
    finally { await probePage.close().catch(() => {}); }

    // ── HAR capture: attach AFTER redirect so it uses the resolved origin ──
    harCapture = createHarCapture(context, resolvedOrigin);

    // ── robots.txt + sitemap.xml (#53) ──────────────────────────────────────
    const robotsRules = await loadRobotsRules(resolvedOrigin);
    if (robotsRules.rules.length > 0) {
      log(run, `🤖 robots.txt: ${robotsRules.rules.length} rule(s) loaded — restricted paths will be skipped`);
    }
    const sitemapUrls = await loadSitemapUrls(resolvedOrigin, robotsRules.sitemaps);
    if (sitemapUrls.length > 0) {
      log(run, `🗺️  sitemap.xml: ${sitemapUrls.length} URL(s) discovered — seeding crawl queue`);
      for (const sitemapUrl of sitemapUrls) {
        if (isSameEffectiveOrigin(sitemapUrl, resolvedOrigin) && isAllowed(sitemapUrl, robotsRules)) {
          crawlQueue.enqueue(sitemapUrl, 1);
        }
      }
    }

    // ── Crawl loop ──────────────────────────────────────────────────────────
    while (crawlQueue.hasMore() && crawlQueue.visitedCount < MAX_PAGES) {
      if (signal?.aborted) { throwIfAborted(signal); }
      const item = crawlQueue.dequeue();
      if (!item) break;
      const { url, depth } = item;

      // robots.txt compliance (#53) — skip disallowed paths
      // Check BEFORE markVisited so disallowed URLs don't consume crawl budget.
      if (!isAllowed(url, robotsRules)) {
        log(run, `🚫 Skipping (robots.txt): ${url}`);
        continue;
      }

      crawlQueue.markVisited(url);

      // Path-pattern dedupe (e.g. /product/1 vs /product/2) — the seed page
      // (depth 0) is always snapshotted even if its pattern repeats.
      const pathPattern = extractPathPattern(url);
      if (pathPatternsSeen.has(pathPattern) && depth > 0) {
        log(run, `⏭️  Skipping duplicate structure: ${url}`);
        continue;
      }
      pathPatternsSeen.add(pathPattern);

      const page = await context.newPage();
      try {
        log(run, `📄 Visiting (depth ${depth}): ${url}`);
        await page.goto(url, { waitUntil: "domcontentloaded", timeout: 15000 });
        // takeSnapshot() now calls waitForLoadState('networkidle') internally,
        // so we no longer need the arbitrary 800ms static wait here.

        // ── Shadow DOM: inject queryShadowAll helper and collect elements ──
        // Modern enterprise apps (Angular, Lit, Stencil, Salesforce LWC) encapsulate
        // UI inside shadow roots that are invisible to standard page.$$() queries.
        // We inject a recursive helper once per page, call it with common interactive
        // selectors, and attach any found elements to the snapshot so elementFilter.js
        // can score and surface them alongside regular DOM elements.
        let shadowElements = [];
        try {
          shadowElements = await page.evaluate(() => {
            // Recursively traverse all shadow roots in the document.
            // Returns a flat array of plain objects (must be serialisable across
            // the evaluate boundary — no DOM node references).
            function queryShadowAll(selector, root = document, insideShadow = false) {
              const results = [];
              // Walk every element in this root. TreeWalker does not descend
              // into shadow roots on its own, hence the manual recursion below.
              const walker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT);
              let node = walker.nextNode();
              while (node) {
                // Only collect matching elements when we are inside a shadow root —
                // light DOM elements are already captured by takeSnapshot().
                if (insideShadow && node.matches && node.matches(selector)) {
                  const rect = node.getBoundingClientRect();
                  results.push({
                    tag: node.tagName.toLowerCase(),
                    type: node.getAttribute("type") || "",
                    text: (node.textContent || node.getAttribute("aria-label") || node.getAttribute("title") || "").trim().slice(0, 200),
                    href: node.getAttribute("href") || "",
                    role: node.getAttribute("role") || "",
                    ariaLabel: node.getAttribute("aria-label") || "",
                    placeholder: node.getAttribute("placeholder") || "",
                    visible: rect.width > 0 && rect.height > 0,
                    _fromShadow: true,
                  });
                }
                // Recurse into this node's shadow root if it has one
                if (node.shadowRoot) {
                  const inner = queryShadowAll(selector, node.shadowRoot, true);
                  results.push(...inner);
                }
                node = walker.nextNode();
              }
              return results;
            }

            // Selectors covering the interactive elements most likely to be
            // test-worthy inside shadow DOM components
            const SHADOW_INTERACTIVE_SELECTORS = [
              "button",
              "a[href]",
              "input",
              "textarea",
              "select",
              "[role='button']",
              "[role='link']",
              "[role='menuitem']",
              "[role='tab']",
              "[role='checkbox']",
              "[role='radio']",
              "[role='switch']",
              "[role='textbox']",
              "[role='searchbox']",
              "[role='combobox']",
            ].join(", ");

            return queryShadowAll(SHADOW_INTERACTIVE_SELECTORS);
          });
        } catch (shadowErr) {
          // Shadow DOM traversal is best-effort — never break the crawl
          shadowElements = [];
        }

        const snapshot = await takeSnapshot(page);

        // Merge shadow elements into the snapshot's element list so they flow
        // through elementFilter.js scoring alongside regular DOM elements.
        if (shadowElements.length > 0) {
          snapshot.elements = [...(snapshot.elements || []), ...shadowElements];
          log(run, `🕸️  Shadow DOM: ${shadowElements.length} element(s) found inside shadow roots on ${url}`);
        }

        const structureFP = fingerprintStructure(snapshot);
        if (crawlQueue.isStructureDuplicate(structureFP) && depth > 1) {
          log(run, `⏭️  Skipping duplicate layout: ${url}`);
          // No explicit close here: `continue` routes through the finally
          // block below, which closes the page exactly once. (Previously the
          // page was closed here AND again in finally — a redundant double
          // close.)
          continue;
        }
        crawlQueue.markStructureSeen(structureFP);

        snapshots.push(snapshot);
        snapshotsByUrl[url] = snapshot;
        run.pagesFound = snapshots.length;
        // Keep run.pages in sync so the frontend site graph updates live
        run.pages = snapshots.map(s => ({ url: s.url, title: s.title || s.url, status: "crawled" }));
        // Persist to DB so the site map renders after page reload
        runRepo.update(run.id, { pages: run.pages, pagesFound: run.pagesFound });
        // Sign artifact URLs before emitting SSE snapshot (matches testRunner.js pattern)
        emitRunEvent(run.id, "snapshot", { run: signRunArtifacts(run) });

        if (depth < MAX_DEPTH) {
          const links = await page.$$eval("a[href]", els => els.map(e => e.href));
          for (const href of links) {
            try {
              const u = new URL(href, url);
              u.hash = "";
              // Strip only noise query params; preserve significant ones (#52)
              stripNoiseParams(u);
              const normalized = u.toString();
              if (!isSameEffectiveOrigin(normalized, resolvedOrigin)) continue;
              // robots.txt compliance (#53) — skip disallowed before enqueuing
              if (!isAllowed(normalized, robotsRules)) continue;
              crawlQueue.enqueue(normalized, depth + 1);
            } catch {}
          }
        }
      } catch (err) {
        const failMsg = err?.message || String(err);
        logWarn(run, `Failed: ${url} — ${failMsg}`);
        navigationFailures.push({
          url,
          message: failMsg,
          category: categoriseNavigationError(failMsg),
        });
      } finally {
        // Guard the close like every other close in this function — a page
        // whose renderer crashed during navigation can reject close(), and
        // that rejection must not escape the loop and abort the whole crawl.
        await page.close().catch(() => {});
      }
    }

    // ── Summarise captured API traffic (before browser.close) ──────────────
    if (harCapture) {
      harCapture.detach();
    }
  } finally {
    await browser.close().catch(() => {});
  }

  let apiEndpoints = [];
  if (harCapture) {
    apiEndpoints = summariseApiEndpoints(harCapture.getEntries());
    if (apiEndpoints.length > 0) {
      log(run, `🌐 Captured ${harCapture.getEntries().length} API calls → ${apiEndpoints.length} unique endpoint patterns`);
    }
  }

  logSuccess(run, `Smart crawl done. ${snapshots.length} unique pages found.`);

  return { snapshots, snapshotsByUrl, apiEndpoints, navigationFailures };
}

/**
 * Classify a navigation failure message into a coarse category so the caller
 * can decide whether a totally-unreachable target warrants a `failed` run.
 *
 * Matching is case-insensitive substring search; category checks run in
 * priority order (dns → network → timeout), so e.g. ERR_CONNECTION_TIMED_OUT
 * classifies as "network", not "timeout".
 *
 * Exported for regression tests — see `tests/dns-classification.test.js`.
 *
 * @param {string} message - Error message from `page.goto` (Playwright).
 * @returns {string} One of `"dns"`, `"network"`, `"timeout"`, or `"other"`.
 */
export function categoriseNavigationError(message) {
  const text = (message || "").toLowerCase();

  // Chromium net error codes plus Node-level DNS failures.
  const DNS_MARKERS = ["err_name_not_resolved", "enotfound", "dns"];
  // Connection / TLS / certificate level failures (Chromium + Node errnos).
  const NETWORK_MARKERS = [
    "err_connection_refused",
    "err_connection_reset",
    "err_connection_closed",
    "err_connection_timed_out",
    "err_network",
    "err_address_unreachable",
    "err_internet_disconnected",
    "err_ssl_",
    "err_cert_",
    "econnrefused",
    "econnreset",
    "enetunreach",
  ];

  if (DNS_MARKERS.some((marker) => text.includes(marker))) return "dns";
  if (NETWORK_MARKERS.some((marker) => text.includes(marker))) return "network";
  if (text.includes("timeout") || text.includes("timed out")) return "timeout";
  return "other";
}