Source: pipeline/secretScanner.js

/**
 * @module pipeline/secretScanner
 * @description
 * CAP-003 — `gitleaks`-style secret scanner used by the post-generation
 * validation stage to reject AI-generated Playwright tests that embed
 * credentials harvested during crawl (Authorization headers, API keys,
 * JWTs, AWS access keys). Findings are always redacted before surfacing —
 * the raw match value is never echoed back into the issues list or
 * persisted on the test record.
 *
 * Built-in detectors cover AWS access key IDs, JWTs, and `Bearer` tokens.
 * Additional rules are loaded best-effort from the repo's existing
 * `.github/.gitleaks.toml` so the CI ruleset is reused.
 */

import fs from "node:fs";
import path from "node:path";
import { fileURLToPath } from "node:url";

/**
 * Built-in detectors that are always part of the active ruleset.
 * Every pattern carries the `g` flag so one scan can walk all occurrences.
 */
const DEFAULT_RULES = [
  {
    // Literal "AKIA" prefix followed by exactly 16 uppercase alphanumerics.
    id: "aws-access-key-id",
    description: "AWS access key id",
    regex: /\bAKIA[0-9A-Z]{16}\b/g,
  },
  {
    // Three dot-separated segments of 10+ characters, the first starting with "eyJ".
    id: "jwt-token",
    description: "JWT token",
    regex: /\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9._-]{10,}\.[A-Za-z0-9._-]{10,}\b/g,
  },
  {
    // Case-insensitive "Bearer" keyword, whitespace, then a token of 16+ characters.
    id: "bearer-token",
    description: "Bearer token",
    regex: /\bBearer\s+[A-Za-z0-9._~+\/-]{16,}/gi,
  },
];

// Memoised ruleset; stays `null` until `loadSecretRules()` fills it on first use.
let cachedRules = null;

/**
 * Resolve the repository root relative to this module's location on disk.
 * Used to locate the shared `.github/.gitleaks.toml` config without relying
 * on `process.cwd()` (which would break when the server is launched from a
 * non-repo-root cwd).
 *
 * The module URL is converted with `fileURLToPath` instead of reading
 * `URL#pathname`: `pathname` stays percent-encoded (a space in a directory
 * name becomes `%20`) and on Windows carries a leading slash before the
 * drive letter, both of which produce a path that does not exist on disk.
 *
 * @returns {string} Absolute path three levels above this module's directory.
 * @private
 */
function repoRoot() {
  const moduleDir = path.dirname(fileURLToPath(import.meta.url));
  return path.resolve(moduleDir, "../../..");
}

/**
 * Naive `[[rules]]` chunker for `.github/.gitleaks.toml`. Not a full TOML
 * parser — extracts only `id`, `description`, and `regex` fields per rule
 * via simple line-anchored regexes (best-effort reuse of the CI ruleset).
 *
 * Supported `regex` value forms, tried in this order:
 *   1. `'''...'''` — single-line TOML multi-line literal string, the form
 *      gitleaks configs conventionally use;
 *   2. `'...'`     — TOML literal string;
 *   3. `"..."`     — TOML basic string (taken verbatim; escapes are NOT decoded).
 *
 * gitleaks patterns are written in Go regex syntax. The only translation done
 * here is a leading `(?i)`, which JavaScript rejects as a syntax error; it is
 * stripped and mapped to the `i` flag. Any other construct JavaScript cannot
 * compile causes that rule to be skipped, so a malformed or unsupported custom
 * rule cannot break the entire scanner.
 *
 * Defaults: a missing `id` becomes `"custom-rule"`; a missing `description`
 * falls back to the rule's id. Chunks without a `regex` are ignored.
 *
 * @param {string} tomlText Raw `.gitleaks.toml` file contents.
 * @returns {Array<{id: string, description: string, regex: RegExp}>}
 * @private
 */
function parseCustomRules(tomlText) {
  const GO_CASE_INSENSITIVE_PREFIX = "(?i)";
  const parsed = [];
  // Everything before the first `[[rules]]` header is global config, not a rule.
  const chunks = tomlText.split("[[rules]]").slice(1);
  for (const chunk of chunks) {
    const id = chunk.match(/^\s*id\s*=\s*"([^"]+)"/m)?.[1] || "custom-rule";
    const description = chunk.match(/^\s*description\s*=\s*"([^"]+)"/m)?.[1] || id;
    // The triple-quoted form must be tried first: the single-quote extractor
    // cannot match it (its capture requires a non-quote right after the first `'`).
    const rawPattern =
      chunk.match(/^\s*regex\s*=\s*'''(.+?)'''/m)?.[1] ||
      chunk.match(/^\s*regex\s*=\s*'([^']+)'/m)?.[1] ||
      chunk.match(/^\s*regex\s*=\s*"([^"]+)"/m)?.[1];
    if (!rawPattern) continue;

    const caseInsensitive = rawPattern.startsWith(GO_CASE_INSENSITIVE_PREFIX);
    const pattern = caseInsensitive
      ? rawPattern.slice(GO_CASE_INSENSITIVE_PREFIX.length)
      : rawPattern;
    // A bare `(?i)` would otherwise compile to an empty regex that matches everywhere.
    if (!pattern) continue;

    try {
      parsed.push({ id, description, regex: new RegExp(pattern, caseInsensitive ? "gi" : "g") });
    } catch {
      // Deliberate best-effort: skip patterns JavaScript cannot compile.
    }
  }
  return parsed;
}

/**
 * Return the active secret-detection ruleset, building it on first use.
 *
 * The ruleset is `DEFAULT_RULES` followed by whatever custom rules can be
 * parsed from `<repo root>/.github/.gitleaks.toml`. Reading or parsing that
 * file is best-effort: any failure there (for example the file being absent)
 * leaves only the built-in rules. The result is stored in the module-level
 * `cachedRules` and handed back unchanged on every later call, so edits to
 * the config are only picked up after a process restart.
 *
 * @returns {Array<{id: string, description: string, regex: RegExp}>}
 */
export function loadSecretRules() {
  if (!cachedRules) {
    const configPath = path.join(repoRoot(), ".github", ".gitleaks.toml");
    let extraRules;
    try {
      extraRules = parseCustomRules(fs.readFileSync(configPath, "utf8"));
    } catch {
      // Config is optional — fall back to the built-in detectors only.
      extraRules = [];
    }
    cachedRules = DEFAULT_RULES.concat(extraRules);
  }
  return cachedRules;
}

/**
 * Build a display-safe stand-in for a matched secret, for use in issue
 * strings and finding records in place of the raw value.
 *
 * - Falsy input (empty string, `null`, `undefined`) yields `""`.
 * - Values of 8 characters or fewer are replaced wholesale by `[REDACTED]`.
 * - Longer values keep only their first and last 4 characters, joined by `…`.
 *
 * NOTE(review): for values only slightly longer than 8 characters most of
 * the value remains visible (a 9-character value shows 8) — confirm this is
 * acceptable for the rules in use.
 *
 * @param {string} value Raw matched secret.
 * @returns {string} Redacted, display-safe representation.
 */
function redact(value) {
  if (!value) {
    return "";
  }
  const isShort = value.length <= 8;
  if (isShort) {
    return "[REDACTED]";
  }
  const head = value.slice(0, 4);
  const tail = value.slice(-4);
  return `${head}…${tail}`;
}

/**
 * Scan a string of generated Playwright code for credential-like tokens.
 *
 * Runs every rule returned by `loadSecretRules()` against the input and
 * returns one finding per non-empty match, in rule order and then match
 * order. All occurrences are reported, not just the first per rule. The
 * matched text is passed through `redact()` before it is placed in the
 * finding, so neither `match` nor `message` contains the raw value.
 *
 * Zero-width matches (possible with a custom rule whose pattern can match the
 * empty string, e.g. `x*`) are skipped: they carry no token and would
 * otherwise add one empty finding per character position of the input.
 *
 * Empty / non-string input returns an empty array.
 *
 * @param {string} code Generated Playwright source to scan.
 * @returns {Array<{ruleId: string, description: string, match: string, message: string}>}
 */
export function scanForSecrets(code) {
  if (!code || typeof code !== "string") return [];
  const findings = [];
  for (const rule of loadSecretRules()) {
    // Rule regexes are shared, stateful `g` instances and `matchAll` seeds its
    // internal copy from the current `lastIndex`, so always start from 0.
    rule.regex.lastIndex = 0;
    // `matchAll` advances past zero-width matches on its own, so no manual
    // `lastIndex` bump is needed to avoid an infinite loop.
    for (const [token] of code.matchAll(rule.regex)) {
      if (token === "") continue;
      const redacted = redact(token);
      findings.push({
        ruleId: rule.id,
        description: rule.description,
        match: redacted,
        message: `secret-like token detected (${rule.id}): ${redacted}`,
      });
    }
  }
  return findings;
}