/**
* @module pipeline/stateFingerprint
* @description State fingerprinting for the state-based exploration engine.
*
* Produces a deterministic fingerprint of the current browser state that goes
* beyond the existing {@link module:pipeline/smartCrawl.fingerprintStructure}
* (which only hashes element tags). A state fingerprint captures:
* - URL route (pathname + hash, with significant query params)
* - Route param pattern (numeric segments normalised to `:id`)
* - DOM structural shape (reuses smartCrawl.fingerprintStructure)
* - Visible text content hash (with dynamic value normalisation)
* - UI component inventory (modals, sidebars, dropdowns, toasts, etc.)
* - SPA framework markers and loading/error states
* - Form field states (empty vs filled vs error)
*
* Two states are considered identical when their fingerprints match, preventing
* infinite exploration loops and detecting meaningful transitions.
*
* ### Exports
* - {@link fingerprintState} — `(snapshot) → string`
* - {@link statesEqual} — `(fp1, fp2) → boolean`
*/
import { fingerprintStructure, SIGNIFICANT_PARAMS, NOISE_PARAMS } from "./smartCrawl.js";
/**
* Simple deterministic hash — reuses the same algorithm as
* {@link module:pipeline/smartCrawl.fingerprintStructure} and
* {@link module:pipeline/deduplicator.simpleHash}.
*
* @param {string} str
* @returns {string} base-36 hash
*/
function simpleHash(str) {
let hash = 0;
for (let i = 0; i < str.length; i++) {
hash = ((hash << 5) - hash) + str.charCodeAt(i);
hash = hash & hash;
}
return Math.abs(hash).toString(36);
}
/**
* Extract the route portion of a URL with significant query params.
*
* Normalises trailing slashes so `/about/` and `/about` fingerprint the same.
* Numeric path segments are normalised to `:id` so `/users/123` and
* `/users/456` produce the same route pattern (#52 defect #2).
* Significant query params (category, sort, view, etc.) are included in
* sorted order; noise params are stripped (#52 defect #1).
*
* @param {string} url
* @returns {string}
*/
function extractRoute(url) {
try {
const u = new URL(url);
// Normalise numeric path segments to `:id` (#52 defect #2)
const path = u.pathname
.replace(/\/+$/, "")
.split("/")
.map(seg => /^\d+$/.test(seg) ? ":id" : seg)
.join("/") || "/";
// Include significant query params in sorted order (#52 defect #1)
const sigParams = [];
for (const [key, value] of u.searchParams) {
if (NOISE_PARAMS.some(re => re.test(key))) continue;
if (SIGNIFICANT_PARAMS.has(key.toLowerCase())) {
sigParams.push(`${key.toLowerCase()}=${value}`);
}
}
sigParams.sort();
const qStr = sigParams.length > 0 ? `?${sigParams.join("&")}` : "";
return `${u.hostname}${path}${u.hash}${qStr}`;
} catch { return url; }
}
// ── Dynamic text normalisation (#52 defect #5) ──────────────────────────────
/**
* Normalise dynamic text fragments in a string.
*
* Strips order/ticket numbers, counts with units, currency amounts,
* timestamps, and other dynamic values that would cause trivially different
* fingerprints for the same logical state (#52 defect #5).
*
* @param {string} text
* @returns {string}
*/
function normaliseDynamicText(text) {
return text
.replace(/#\d+/g, "#_") // "Order #12345" → "Order #_"
.replace(/\b\d+\s*items?\b/gi, "_ items") // "2 items" → "_ items"
.replace(/\$[\d,.]+/g, "$_") // "$19.99" → "$_"
.replace(/€[\d,.]+/g, "€_") // "€9.99" → "€_"
.replace(/£[\d,.]+/g, "£_") // "£9.99" → "£_"
.replace(/\b\d{1,2}:\d{2}(:\d{2})?\s*(am|pm)?\b/gi, "_time_") // "2:30 PM"
.replace(/\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b/g, "_date_") // "04/19/2026"
.replace(/\(\d+\)/g, "(_)") // "(3)" notification counts
.replace(/\b\d+\s*new\b/gi, "_ new") // "3 new"
.replace(/\b\d{4,}\b/g, "_num_") // long numbers (IDs, phone)
.replace(/\s+/g, " ")
.trim();
}
/**
* Hash the visible text content from a snapshot's elements.
*
* Only uses STRUCTURAL text signals (headings, button labels, link text) — not
* dynamic content like timestamps, counters, or personalised greetings. This
* prevents trivially different snapshots of the same page (e.g. google.com
* with different doodle text) from being treated as distinct states.
*
* Dynamic values (order numbers, counts, prices) are normalised before hashing
* so "Order #12345" and "Order #12346" produce the same hash (#52 defect #5).
*
* @param {Array} elements
* @returns {string}
*/
function hashVisibleContent(elements) {
const text = (elements || [])
.filter(el => {
if (el.visible === false) return false;
// Only include structural text: headings, buttons, links, labels
const tag = (el.tag || "").toLowerCase();
const role = (el.role || "").toLowerCase();
return ["button", "a", "h1", "h2", "h3", "label"].includes(tag)
|| ["button", "link", "tab", "menuitem"].includes(role);
})
.map(el => normaliseDynamicText((el.text || "").slice(0, 30).toLowerCase()))
.filter(t => t.length > 2) // skip tiny fragments like "×" or "OK"
.join("|");
return simpleHash(text);
}
/**
* Compute a form-state signature from the snapshot's formStructures.
* Captures which fields are filled vs empty and whether required fields
* have values — this distinguishes "clean form" from "form with errors".
*
* @param {Array} formStructures — from pageSnapshot.js
* @returns {string}
*/
function formStateSignature(formStructures) {
if (!formStructures || formStructures.length === 0) return "no_forms";
return formStructures.map(form => {
const fields = (form.fields || []).map(f => {
const state = f.required ? "req" : "opt";
return `${f.tag}:${f.type || "text"}:${state}`;
}).join(",");
return `${form.id}[${fields}]`;
}).join("|");
}
// ── UI component inventory (#52 defect #3) ──────────────────────────────────
/**
* Build a sorted, deterministic inventory of visible UI component types.
*
* Goes beyond the original `hasModals` / `hasTabs` boolean flags to enumerate
* the full set of component types present on the page. This ensures that
* two pages with the same headings but different component layouts (e.g.
* sidebar visible vs collapsed) produce different fingerprints.
*
* @param {object} snapshot — page snapshot from takeSnapshot
* @returns {string} sorted component inventory string
*/
function componentInventory(snapshot) {
const components = [];
if (snapshot.hasModals) components.push("modal");
if (snapshot.hasTabs) components.push("tabs");
if (snapshot.hasTable) components.push("table");
if (snapshot.hasSidebar) components.push("sidebar");
if (snapshot.hasDropdown) components.push("dropdown");
if (snapshot.hasToast) components.push("toast");
if (snapshot.hasAccordion) components.push("accordion");
if (snapshot.hasLoginForm) components.push("login");
// Loading / error / empty states (#52 defect #4)
if (snapshot.hasSpinner) components.push("loading");
if (snapshot.hasErrorState) components.push("error");
if (snapshot.hasEmptyState) components.push("empty");
// SPA framework markers (#52 defect #4)
if (snapshot.spaFramework) components.push(`spa:${snapshot.spaFramework}`);
components.sort();
return components.length > 0 ? components.join(",") : "none";
}
/**
* Produce a deterministic fingerprint of the current application state.
*
* Combines route (with significant query params and normalised path params),
* DOM structure, visible content (with dynamic value normalisation), UI
* component inventory, SPA markers, and form state into a single hash string.
* Used by the state explorer to detect whether an action caused a meaningful
* state transition.
*
* @param {object} snapshot — page snapshot from {@link module:pipeline/pageSnapshot.takeSnapshot}
* @returns {string} deterministic fingerprint string
*/
export function fingerprintState(snapshot) {
const route = extractRoute(snapshot.url);
const structure = fingerprintStructure(snapshot);
const content = hashVisibleContent(snapshot.elements);
const components = componentInventory(snapshot);
const forms = formStateSignature(snapshot.formStructures);
// Include the page title to distinguish SPA route changes where the URL
// and DOM structure are identical but the title differs (e.g. tabbed
// dashboards, wizard steps). The title is normalised using the same
// dynamic text normaliser as visible content (#52 defect #5).
const title = normaliseDynamicText(
(snapshot.title || "").toLowerCase()
).slice(0, 60);
const composite = `${route}|${structure}|${content}|${components}|${forms}|${title}`;
return simpleHash(composite);
}
/**
* Check if two state fingerprints represent the same application state.
*
* @param {string} fp1
* @param {string} fp2
* @returns {boolean}
*/
export function statesEqual(fp1, fp2) {
return fp1 === fp2;
}