Source: pipeline/crawlDiff.js

/**
 * @module pipeline/crawlDiff
 * @description AUTO-002 diff-aware crawling primitive. Compares the current
 * crawl's page snapshots against the persisted baseline map and classifies
 * each URL into added / changed / unchanged / removed buckets.
 *
 * Fingerprinting reuses `stateFingerprint.js` (no new hashing scheme) so
 * a page's fingerprint is stable across the state-explorer and link-crawl
 * discovery paths.
 */

import { fingerprintState } from "./stateFingerprint.js";

/**
 * @param {object} snapshot - page snapshot `{ url, title, elements[], ... }`.
 * @returns {string} content-addressed fingerprint for the page.
 */
export function buildPageFingerprint(snapshot) {
  return fingerprintState(snapshot);
}

/**
 * Classify each URL in the current crawl against the previous baseline.
 *
 * @param {Record<string, {fingerprint: string}>|null|undefined} previousByUrl
 *   URL → baseline row (`{ fingerprint, capturedAt, ... }`) from
 *   `crawlBaselineRepo.getMapByProjectId()`. `null` / `undefined` / `{}`
 *   are all treated equivalently as "no previous baseline" — every
 *   current URL is classified as added.
 * @param {Array<{url: string}>|null|undefined} currentSnapshots
 *   Raw snapshots from the crawl. `null` / `undefined` → no URLs.
 * @param {object} [opts]
 * @param {function(object): string} [opts.fingerprintOf]
 *   AUTO-002b: optional override for fingerprint computation. State-mode
 *   callers pass a function that returns a pre-computed fingerprint
 *   keyed off the original snapshot identity, because the default
 *   `buildPageFingerprint` recomputes from `snap.url` — which would
 *   embed the composite `url#fp=<fp>` key in the new fingerprint and
 *   make every state-mode re-crawl look "changed". Link-crawl callers
 *   omit this and get the default URL-derived fingerprint.
 * @returns {{changedPages: string[], addedPages: string[], changedOnlyPages: string[], removedPages: string[], unchangedPages: string[], fingerprints: Record<string,string>}}
 */
export function diffCrawlSnapshots(previousByUrl, currentSnapshots, opts = {}) {
  // Accept null/undefined for both inputs. `previousByUrl` is read with
  // bracket-access below, which throws on null; the `|| {}` normalisation
  // keeps the function contract documented (and matches the existing
  // `Object.keys(previousByUrl || {})` on the removed-pages branch).
  const prev = previousByUrl || {};
  const fpOf = typeof opts.fingerprintOf === "function" ? opts.fingerprintOf : buildPageFingerprint;

  const currentByUrl = new Map();
  for (const snapshot of currentSnapshots || []) {
    currentByUrl.set(snapshot.url, fpOf(snapshot));
  }

  const added = [];
  const changed = [];
  const removed = [];
  const unchanged = [];

  for (const [url, fingerprint] of currentByUrl.entries()) {
    const prevRow = prev[url];
    if (!prevRow) {
      added.push(url);
      continue;
    }
    if (prevRow.fingerprint === fingerprint) unchanged.push(url);
    else changed.push(url);
  }

  for (const url of Object.keys(prev)) {
    if (!currentByUrl.has(url)) removed.push(url);
  }

  return {
    changedPages: [...added, ...changed],
    addedPages: added,
    changedOnlyPages: changed,
    removedPages: removed,
    unchangedPages: unchanged,
    fingerprints: Object.fromEntries(currentByUrl),
  };
}