Source: pipeline/domSanitizer.js

import { structuredLog } from "../utils/logFormatter.js";

/**
 * Luhn checksum test for a candidate card number.
 *
 * Every non-digit character is stripped first, so separators such as
 * spaces or hyphens are tolerated. Candidates with fewer than 13 or more
 * than 19 digits are rejected before the checksum is computed.
 *
 * @param {string} raw - candidate text, possibly containing separators
 * @returns {boolean} true when the digit count is 13–19 and the Luhn sum
 *   is a multiple of 10
 */
function luhnValid(raw) {
  const onlyDigits = raw.replace(/\D/g, "");
  const lengthOk = onlyDigits.length >= 13 && onlyDigits.length <= 19;
  if (!lengthOk) {
    return false;
  }

  // Walk right-to-left: positions 1, 3, 5, … (counted from the rightmost
  // digit at position 0) are doubled, with 9 subtracted when the doubled
  // value exceeds 9.
  const checksum = Array.from(onlyDigits)
    .reverse()
    .reduce((acc, ch, pos) => {
      const value = Number(ch);
      if (pos % 2 === 0) {
        return acc + value;
      }
      const doubled = value * 2;
      return acc + (doubled > 9 ? doubled - 9 : doubled);
    }, 0);

  return checksum % 10 === 0;
}

/**
 * Create a sanitizer context. Pass the same context into multiple
 * {@link sanitizeDomSnapshot} calls to guarantee that identical input
 * values resolve to identical placeholder IDs across all calls (so the AI
 * can correlate references across snapshots + classified pages).
 *
 * The returned object carries:
 *  - `allowlist`: the non-empty string entries of `opts.allowlist`
 *    (anything else is dropped; a non-array allowlist becomes `[]`),
 *  - `placeholders`: a Map from internal match key to placeholder text,
 *  - `seq`: the per-label sequence numbers used to mint placeholders,
 *  - `counters`: the per-category redaction counts,
 *  - `runId`: passed through unchanged.
 *
 * @param {object|null} [opts] - options; `null`, `undefined`, or any other
 *   falsy value is treated as "no options" instead of throwing
 * @param {string[]} [opts.allowlist]
 * @param {string} [opts.runId]
 * @returns {{ allowlist: string[], placeholders: Map, seq: object, counters: object, runId: (string|undefined) }}
 */
export function createPiiContext(opts = {}) {
  // A destructuring default in the parameter list only covers `undefined`,
  // so an explicit `null` would throw; normalise it here instead.
  const { allowlist = [], runId } = opts || {};
  const cleanAllowlist = Array.isArray(allowlist)
    ? allowlist.filter((v) => typeof v === "string" && v.length > 0)
    : [];
  return {
    allowlist: cleanAllowlist,
    placeholders: new Map(),
    seq: { EMAIL: 0, PHONE: 0, SSN: 0, CARD: 0, TOKEN: 0 },
    counters: { email: 0, phone: 0, ssn: 0, card: 0, token: 0, jwt: 0, bearer: 0, queryAuth: 0 },
    runId,
  };
}

/**
 * Write the per-run `pipeline.pii_redacted` audit entry for a context.
 *
 * The logged `total` adds up only the five non-overlapping categories
 * (email, phone, ssn, card, token). `jwt`, `bearer`, and `queryAuth` are
 * subdivisions of `token`; including them in the aggregate would count
 * the same redaction twice. The full counters object is still logged
 * under `counts`.
 *
 * @param {object} ctx - context whose `counters` and `runId` are logged
 * @returns {void}
 */
export function finalizePiiContext(ctx) {
  const { email, phone, ssn, card, token } = ctx.counters;
  const total = email + phone + ssn + card + token;
  const payload = { runId: ctx.runId, counts: ctx.counters, total };
  structuredLog("pipeline.pii_redacted", payload);
}

/**
 * Sanitize an input value (string, array, or plain object — walked
 * recursively) by replacing PII matches with deterministic placeholders.
 * Only string values are rewritten; object keys and non-string leaves are
 * passed through unchanged.
 *
 * Two call forms:
 *  - `sanitizeDomSnapshot(input, ctx)` — caller-owned context (created via
 *    {@link createPiiContext}); no audit log is emitted here. The caller
 *    invokes {@link finalizePiiContext} once after the final call to log
 *    aggregate counts for the run. Use this form when multiple artifacts
 *    must share placeholder IDs.
 *  - `sanitizeDomSnapshot(input, { allowlist, runId })` — convenience form
 *    that creates a fresh context internally and emits its own audit log.
 *    Counters and placeholders are NOT shared with later calls. Omitting
 *    the argument, or passing `null`, behaves like passing `{}`.
 *
 * @param {*} input
 * @param {object|null} [ctxOrOpts] — either a context object from
 *   {@link createPiiContext} (detected via internal `placeholders` Map) or
 *   `{ allowlist, runId }` to create a one-shot context.
 * @returns {{ output: *, counts: object, ctx: object }} `counts` is the
 *   context's live `counters` object (not a copy); `ctx` is the context
 *   that was used, whether caller-owned or freshly created.
 */
export function sanitizeDomSnapshot(input, ctxOrOpts = {}) {
  // Detect a caller-owned context via the presence of the internal
  // `placeholders` Map — opts objects only ever carry `allowlist` / `runId`.
  const ownsContext = !ctxOrOpts || !(ctxOrOpts.placeholders instanceof Map);
  // `|| {}` keeps an explicit `null` (or other falsy value) from reaching
  // the option destructuring inside createPiiContext.
  const ctx = ownsContext ? createPiiContext(ctxOrOpts || {}) : ctxOrOpts;

  // Exact-value match (case-insensitive). Substring matching was tempting
  // for "partial allowlist fragments" but creates a silent footgun: a short
  // entry like "5551" would exempt every phone, card, and SSN that happens
  // to contain those four digits. Allowlist entries must be complete values
  // (full email, full token, full query value). Whitespace is trimmed at
  // both sides to forgive trailing newlines from the textarea input.
  const allowSet = new Set(ctx.allowlist.map((r) => r.trim().toLowerCase()));
  const allowMatch = (v) => allowSet.has(String(v).trim().toLowerCase());

  // Return the placeholder for key `k`, minting `<LABEL_n>` on first sight.
  const idFor = (k, label) => {
    if (!ctx.placeholders.has(k)) {
      ctx.seq[label] += 1;
      ctx.placeholders.set(k, `<${label}_${ctx.seq[label]}>`);
    }
    return ctx.placeholders.get(k);
  };

  // Increment each named redaction counter by one.
  const bump = (...names) => {
    for (const name of names) {
      ctx.counters[name] += 1;
    }
  };

  // True when `v` is exactly a placeholder this context has already
  // issued. Placeholders are minted sequentially per label starting at 1,
  // so "issued" means the number is within 1..seq[label].
  const placeholderShape = /^<(EMAIL|PHONE|SSN|CARD|TOKEN)_([1-9]\d*)>$/;
  const isIssuedPlaceholder = (v) => {
    const hit = placeholderShape.exec(v);
    return hit !== null && Number(hit[2]) <= ctx.seq[hit[1]];
  };

  const redact = (text) => {
    if (!text || typeof text !== "string") return text;
    let out = text;

    // Email keys are lower-cased so case variants share one placeholder.
    out = out.replace(/\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/gi, (m) => {
      if (allowMatch(m)) return m;
      bump("email");
      return idFor(`email:${m.toLowerCase()}`, "EMAIL");
    });

    // ORDERING: card + ssn MUST run before phone. The phone regex below
    // accepts 10 digits with `[\s.-]` separators, and a canonical hyphenated
    // PAN like "4111-1111-1111-1111" contains a phone-shaped 14-char prefix
    // ("4111-1111-1111"). If phone ran first it would consume that prefix as
    // <PHONE_n>, leaving "-1111" orphaned and the card regex unable to see a
    // contiguous 13–19 digit run — Luhn never fires, the card counter stays
    // zero, and the threat-model promise to redact cards silently fails.
    // Card requires ≥13 digits so it cannot match phone-shaped input; SSN
    // is `\d{3}-\d{2}-\d{4}` (2-digit middle) which cannot collide with
    // phone's 3-digit middle group. Email is safe under any order (`@`).
    out = out.replace(/\b(?:\d[ -]*?){13,19}\b/g, (m) => {
      if (allowMatch(m) || !luhnValid(m)) return m;
      bump("card");
      return idFor(`card:${m}`, "CARD");
    });

    out = out.replace(/\b\d{3}-\d{2}-\d{4}\b/g, (m) => {
      if (allowMatch(m)) return m;
      bump("ssn");
      return idFor(`ssn:${m}`, "SSN");
    });

    out = out.replace(/\b(?:\+?\d{1,3}[\s.-]?)?(?:\(?\d{3}\)?[\s.-]?)\d{3}[\s.-]?\d{4}\b/g, (m) => {
      if (allowMatch(m)) return m;
      bump("phone");
      return idFor(`phone:${m}`, "PHONE");
    });

    // jwt / bearer / queryAuth are subdivisions of `token`, so each of the
    // three token passes bumps its own counter and `token` together.
    out = out.replace(/\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b/g, (m) => {
      if (allowMatch(m)) return m;
      bump("jwt", "token");
      return idFor(`jwt:${m}`, "TOKEN");
    });

    out = out.replace(/\b(?:Bearer|Basic)\s+[A-Za-z0-9._~+\/-]+=*\b/gi, (m) => {
      if (allowMatch(m)) return m;
      bump("bearer", "token");
      return idFor(`auth:${m}`, "TOKEN");
    });

    out = out.replace(/([?&](?:token|code|access_token)=)([^&#\s]+)/gi, (_, p1, p2) => {
      if (allowMatch(p2)) return `${p1}${p2}`;
      // An earlier pass may already have replaced this value (e.g. a JWT in
      // `?token=`). Redacting the placeholder again would count one secret
      // twice and give it a second ID, breaking cross-reference with the
      // same secret seen elsewhere — leave it as issued.
      if (isIssuedPlaceholder(p2)) return `${p1}${p2}`;
      bump("queryAuth", "token");
      return `${p1}${idFor(`query:${p2}`, "TOKEN")}`;
    });

    return out;
  };

  // Recursively rebuild arrays and objects, redacting string leaves.
  const walk = (v) => {
    if (typeof v === "string") return redact(v);
    if (Array.isArray(v)) return v.map(walk);
    if (v && typeof v === "object") {
      return Object.fromEntries(Object.entries(v).map(([k, val]) => [k, walk(val)]));
    }
    return v;
  };

  const output = walk(input);
  // One-shot contexts log immediately; caller-owned contexts are finalized
  // by the caller after the last call.
  if (ownsContext) {
    finalizePiiContext(ctx);
  }
  return { output, counts: ctx.counters, ctx };
}