Source: database/repositories/crawlBaselineRepo.js

/**
 * @module database/repositories/crawlBaselineRepo
 * @description AUTO-002 persistence layer for per-project page fingerprints.
 * Two write strategies are intentionally exposed:
 *
 * - {@link replaceProjectBaselines} — full DELETE + re-INSERT. Use only when
 *   the caller is certain the new fingerprint set is *complete* (e.g. after
 *   a fresh first-ever crawl), because any URL absent from `fingerprints` is
 *   treated as removed from the site.
 * - {@link mergeProjectBaselines} — upsert + targeted-delete. Preferred for
 *   every diff-aware crawl: a partial crawl (page N fails with a transient
 *   503) won't silently drop page N's baseline and force an unnecessary
 *   regen on the next run.
 */

import { getDatabase } from "../sqlite.js";

/**
 * Fetch every stored baseline row belonging to one project.
 *
 * @param {string} projectId - Project whose baseline rows are requested.
 * @returns {Array<object>} Whatever the prepared statement's `.all()` yields:
 *   one entry per matching row carrying the `projectId`, `pageUrl`,
 *   `fingerprint` and `capturedAt` columns.
 */
export function getByProjectId(projectId) {
  const selectSql = "SELECT projectId, pageUrl, fingerprint, capturedAt FROM crawl_baselines WHERE projectId = ?";
  const selectStatement = getDatabase().prepare(selectSql);
  return selectStatement.all(projectId);
}

/**
 * Load a project's baseline rows and index them by page URL.
 *
 * The map is built with `Object.fromEntries` rather than bracket assignment
 * onto a `{}` literal so that every URL becomes an own property: assigning
 * `map["__proto__"] = row` would replace the map's prototype instead of
 * storing the row, hiding the key from enumeration and leaking the row's
 * fields through inheritance. When the same `pageUrl` appears more than
 * once, the last row wins.
 *
 * @param {string} projectId - Project whose baseline rows are requested.
 * @returns {Record<string, object>} Plain object mapping `pageUrl` to the
 *   full row returned by `getByProjectId`.
 */
export function getMapByProjectId(projectId) {
  const rows = getByProjectId(projectId);
  // Array.from accepts any iterable, matching what a for...of loop would take.
  return Object.fromEntries(Array.from(rows, (row) => [row.pageUrl, row]));
}

/**
 * Replace a project's entire baseline set in one transaction: delete every
 * existing row for the project, then insert one row per fingerprint entry.
 * All inserted rows share a single `capturedAt` timestamp taken before the
 * transaction starts.
 *
 * @param {string} projectId - Project whose baselines are being replaced.
 * @param {Record<string,string>} fingerprints - URL → fingerprint. A falsy
 *   value is treated as an empty set, which leaves the project with no rows.
 * @returns {void}
 */
export function replaceProjectBaselines(projectId, fingerprints) {
  const database = getDatabase();
  const capturedAt = new Date().toISOString();

  const rewriteAll = database.transaction(() => {
    // Clear first so URLs absent from `fingerprints` disappear.
    database.prepare("DELETE FROM crawl_baselines WHERE projectId = ?").run(projectId);

    const insertRow = database.prepare("INSERT INTO crawl_baselines (projectId, pageUrl, fingerprint, capturedAt) VALUES (?, ?, ?, ?)");
    Object.entries(fingerprints || {}).forEach(([pageUrl, fingerprint]) => {
      insertRow.run(projectId, pageUrl, fingerprint, capturedAt);
    });
  });

  rewriteAll();
}

/**
 * Fold the current crawl's fingerprints into the baseline table, leaving
 * rows for pages that were not observed this time untouched.
 *
 * Observed pages are upserted: a new `(projectId, pageUrl)` pair is inserted,
 * an existing one has its `fingerprint` and `capturedAt` overwritten. Rows
 * are only ever dropped for URLs listed in `removedPageUrls` (the URLs the
 * diff reported as `removedPages`). That is the sole deletion path, and it
 * relies on the caller having established that the URL is genuinely gone —
 * absent from the current crawl AND present in the previous baseline. A
 * subset crawl caused by transient failures never reaches this path because
 * its missing URLs are not placed on the `removedPages` list.
 *
 * Upserts and deletes run inside one transaction, upserts first, and every
 * upserted row receives the same `capturedAt` timestamp.
 *
 * @param {string} projectId
 * @param {Record<string,string>} fingerprints - URL → new fingerprint for
 *   pages observed in the current crawl. A falsy value means no upserts.
 * @param {string[]} [removedPageUrls] - URLs classified as `removedPages`
 *   by `diffCrawlSnapshots`. Optional; defaults to none. Non-array values
 *   are ignored.
 * @returns {void}
 */
export function mergeProjectBaselines(projectId, fingerprints, removedPageUrls = []) {
  const database = getDatabase();
  const capturedAt = new Date().toISOString();

  const applyMerge = database.transaction(() => {
    const upsertRow = database.prepare(`
      INSERT INTO crawl_baselines (projectId, pageUrl, fingerprint, capturedAt)
      VALUES (?, ?, ?, ?)
      ON CONFLICT(projectId, pageUrl) DO UPDATE SET
        fingerprint = excluded.fingerprint,
        capturedAt = excluded.capturedAt
    `);
    Object.entries(fingerprints || {}).forEach(([pageUrl, fingerprint]) => {
      upsertRow.run(projectId, pageUrl, fingerprint, capturedAt);
    });

    // Nothing to delete: skip preparing the DELETE statement altogether.
    const hasRemovals = Array.isArray(removedPageUrls) && removedPageUrls.length > 0;
    if (!hasRemovals) {
      return;
    }

    const deleteRow = database.prepare("DELETE FROM crawl_baselines WHERE projectId = ? AND pageUrl = ?");
    for (const removedUrl of removedPageUrls) {
      deleteRow.run(projectId, removedUrl);
    }
  });

  applyMerge();
}