/**
* intentClassifier.js — Layer 2: Classify page elements into user intent categories
*
* Categories: AUTH | NAVIGATION | FORM_SUBMISSION | SEARCH | CRUD | CHECKOUT | CONTENT
*
* Priority tiers:
* HIGH — AUTH, CHECKOUT, SEARCH, FORM_SUBMISSION, CRUD (interactive, high test value)
* MEDIUM — NAVIGATION (homepages, dashboards — structural tests only)
* LOW — CONTENT (static pages — minimal test coverage)
*
* Classification modes:
* 1. Heuristic (default) — fast, keyword/pattern-based scoring
* 2. AI-assisted — when confidence is below AI_THRESHOLD (default 40), asks the AI to classify (currently disabled in classifyPageWithAI to conserve API quota)
*/
import { generateText, parseJSON, hasProvider } from "../aiProvider.js";
// ── Intent patterns ───────────────────────────────────────────────────────────
// Intents whose pages are interactive enough to belong to the HIGH priority tier.
const HIGH_PRIORITY_INTENTS = new Set([
  "AUTH",
  "CHECKOUT",
  "SEARCH",
  "FORM_SUBMISSION",
  "CRUD",
]);

// Scoring table used by the heuristic classifier. For every intent:
//   weight         — base score added per matching signal
//   keywords       — substrings looked up in element text / name / id
//   inputTypes     — input `type` values that are a strong signal
//   weakInputTypes — input `type` values that are only a supporting hint
// Intent order matters: on a score tie the intent listed first wins.
const INTENT_PATTERNS = {
  AUTH: {
    weight: 100,
    keywords: [
      "login", "log in", "sign in", "signin",
      "register", "sign up", "signup", "create account",
      "forgot password", "reset password",
      "logout", "log out", "sign out",
      "password", "username", "authenticate",
    ],
    // "email" is deliberately NOT a keyword: it produced false positives on
    // contact/content pages. An input[type=email] is treated as a weak hint
    // instead — it reinforces AUTH next to a password field or login keywords,
    // but cannot by itself outrank FORM_SUBMISSION on a contact page.
    inputTypes: ["password"],
    weakInputTypes: ["email"],
  },
  CHECKOUT: {
    weight: 95,
    keywords: [
      "checkout", "buy", "purchase", "add to cart", "place order", "pay",
      "payment", "billing", "shipping", "credit card", "cart", "order",
    ],
  },
  SEARCH: {
    weight: 85,
    // "browse" was dropped from this list — too generic.
    keywords: ["search", "find", "filter", "query", "look up"],
    inputTypes: ["search"],
  },
  FORM_SUBMISSION: {
    weight: 80,
    keywords: [
      "submit", "send", "contact", "subscribe", "newsletter", "feedback",
      "apply", "request", "book", "reserve", "schedule", "upload",
    ],
  },
  CRUD: {
    weight: 75,
    keywords: [
      "create", "new", "add", "edit", "update", "save", "delete", "remove",
      "publish", "draft", "archive", "manage",
    ],
  },
  NAVIGATION: {
    weight: 50,
    // "contact" was dropped from this list — it conflicts with FORM_SUBMISSION.
    keywords: [
      "home", "about", "docs", "documentation", "blog", "pricing", "features",
      "faq", "help", "support", "dashboard", "profile", "settings",
      "account", "back", "next", "previous", "menu",
    ],
  },
  CONTENT: {
    weight: 30,
    keywords: ["read more", "learn more", "view", "see all", "show", "expand", "details"],
  },
};
/**
 * classifyElement(element) → { element, intent, confidence }
 *
 * Scores a single element against every entry of INTENT_PATTERNS and returns
 * the best-scoring intent. Element TYPE outweighs text content: a password
 * input is a strong AUTH signal, a link that merely mentions "password" is not.
 *
 * @param {object} element - element descriptor; reads `text`, `type`, `name`,
 *   `id` and `tag` (each optional, compared case-insensitively)
 * @returns {{element: object, intent: string, confidence: number}} the same
 *   element reference, the winning intent ("NAVIGATION" when nothing matched)
 *   and its score capped at 100
 */
export function classifyElement(element) {
  const lowered = (value) => (value || "").toLowerCase();
  const label = lowered(element.text);
  const inputType = lowered(element.type);
  const nameAttr = lowered(element.name);
  const idAttr = lowered(element.id);
  const tagName = lowered(element.tag);

  // Keyword hits in text count more on buttons/inputs and less on plain links.
  let textFactor = 1.0;
  if (tagName === "button" || tagName === "input") {
    textFactor = 1.2;
  } else if (tagName === "a") {
    textFactor = 0.6;
  }

  // Total score of this element for one intent pattern.
  const scoreFor = (pattern) => {
    const { weight } = pattern;
    let tally = 0;
    for (const keyword of pattern.keywords || []) {
      if (label.includes(keyword)) tally += weight * textFactor;
      // A keyword in the name or id attribute is a half-strength signal.
      if (nameAttr.includes(keyword) || idAttr.includes(keyword)) tally += weight * 0.5;
    }
    // Strongest signal: the input type itself (e.g. input[type=password] → AUTH).
    for (const strongType of pattern.inputTypes || []) {
      if (inputType === strongType) tally += weight * 2.0;
    }
    // Moderate signal: a supporting input type (e.g. input[type=email] → AUTH hint).
    for (const hintType of pattern.weakInputTypes || []) {
      if (inputType === hintType) tally += weight * 0.8;
    }
    return tally;
  };

  let winner = "NAVIGATION";
  let topScore = 0;
  for (const intent of Object.keys(INTENT_PATTERNS)) {
    const candidate = scoreFor(INTENT_PATTERNS[intent]);
    // Strict comparison: on a tie the intent declared first keeps the win.
    if (candidate > topScore) {
      topScore = candidate;
      winner = intent;
    }
  }

  return { element, intent: winner, confidence: Math.min(100, topScore) };
}
// ── AI-assisted classification ────────────────────────────────────────────────
// When the heuristic confidence is below AI_THRESHOLD, we ask the LLM to
// classify the page. This handles non-English UIs, custom components, and
// pages where keyword matching is ambiguous.

/**
 * Parse an integer threshold from an environment-style string.
 *
 * Only an unparseable value falls back. An explicit "0" is honoured (it means
 * "never ask the AI"), which a plain `parseInt(raw, 10) || fallback` would
 * silently replace with the fallback.
 *
 * @param {string|undefined} raw - raw value, e.g. process.env.AI_CLASSIFY_THRESHOLD
 * @param {number} fallback - value used when `raw` is missing or not an integer
 * @returns {number} the parsed threshold or `fallback`
 */
function parseThreshold(raw, fallback) {
  const parsed = Number.parseInt(raw, 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

const AI_THRESHOLD = parseThreshold(process.env.AI_CLASSIFY_THRESHOLD, 40);
// Intent labels the model is allowed to answer with.
const AI_VALID_INTENTS = new Set([
  "AUTH", "CHECKOUT", "SEARCH", "FORM_SUBMISSION", "CRUD", "NAVIGATION", "CONTENT",
]);

/**
 * Ask the configured AI provider for the page's dominant intent.
 *
 * Sends the page metadata plus a sample of at most 15 elements (text truncated
 * to 40 characters) and validates the reply defensively, because model output
 * is untrusted: a reply that is not an object, or whose intent is not one of
 * the known categories, yields `null`.
 *
 * @param {object} snapshot - page snapshot; reads `url`, `title`, `h1`,
 *   `forms`, `hasLoginForm` and `elements`
 * @param {AbortSignal} [signal] - forwarded to generateText
 * @returns {Promise<{intent: string, confidence: number}|null>} the validated
 *   classification, or `null` when the reply is unusable. `confidence` is
 *   clamped to 0–100 and defaults to 70 when the model gave no usable number.
 */
async function aiClassifyPage(snapshot, signal) {
  const elements = (snapshot.elements || []).slice(0, 15).map((e) => ({
    tag: e.tag, text: (e.text || "").slice(0, 40), role: e.role, type: e.type,
  }));
  const prompt = `You are a QA page classifier. Given a web page's metadata and interactive elements, classify the page's dominant user intent.
PAGE:
URL: ${snapshot.url}
Title: ${snapshot.title}
H1: ${snapshot.h1 || "none"}
Forms: ${snapshot.forms}
Has login form: ${snapshot.hasLoginForm}
ELEMENTS (sample):
${JSON.stringify(elements, null, 2)}
Classify into EXACTLY ONE of these categories:
AUTH — login, registration, password reset
CHECKOUT — cart, payment, purchase flow
SEARCH — search bar, filters, results listing
FORM_SUBMISSION — contact forms, subscribe, apply
CRUD — create/edit/delete data
NAVIGATION — homepage, dashboard, navigation hub
CONTENT — articles, documentation, static content
Return ONLY valid JSON (no markdown):
{
"intent": "AUTH",
"confidence": 85,
"reason": "one-sentence explanation"
}`;
  const text = await generateText(prompt, { maxTokens: 256, signal });
  const result = parseJSON(text);

  // The reply may parse to null, a string, a number… — only objects are usable.
  if (result === null || typeof result !== "object") return null;

  const intent = typeof result.intent === "string" ? result.intent.trim().toUpperCase() : "";
  if (!AI_VALID_INTENTS.has(intent)) return null;

  // Accept numbers and numeric strings ("85"); keep a genuine 0 instead of
  // replacing it with the default, and clamp anything outside 0–100.
  const numeric = typeof result.confidence === "string"
    ? Number.parseFloat(result.confidence)
    : result.confidence;
  const confidence = typeof numeric === "number" && Number.isFinite(numeric)
    ? Math.min(100, Math.max(0, numeric))
    : 70;

  return { intent, confidence };
}
/**
 * classifyPage(snapshot, filteredElements) → page intent summary
 *
 * Classifies every element, adds page-level signals (login form, generic
 * forms, title/H1 keywords) and reports the intent with the highest total.
 *
 * @param {object} snapshot - page snapshot; reads `url`, `title`, `h1`,
 *   `forms` and `hasLoginForm`
 * @param {object[]} filteredElements - elements to classify; an optional
 *   `_score` on an element is added to its intent's total
 * @returns {object} `{ url, title, dominantIntent, intentBreakdown,
 *   classifiedElements, isHighPriority, intentConfidence }`
 */
export function classifyPage(snapshot, filteredElements) {
  const classifiedElements = [];
  const tally = {};

  // Add `amount` to an intent's running total, creating the entry on first use.
  const bump = (intent, amount) => {
    tally[intent] = (tally[intent] || 0) + amount;
  };

  // Element-level signals: each element votes with its confidence plus its score.
  for (const candidate of filteredElements) {
    const verdict = classifyElement(candidate);
    classifiedElements.push(verdict);
    tally[verdict.intent] =
      (tally[verdict.intent] || 0) + verdict.confidence + (verdict.element._score || 0);
  }

  // Page-level signals: a detected login form dominates; otherwise any form
  // nudges the page towards FORM_SUBMISSION.
  if (snapshot.hasLoginForm) {
    bump("AUTH", 300);
  } else if (snapshot.forms > 0) {
    bump("FORM_SUBMISSION", 50);
  }

  // Title / H1 keywords, applied in this fixed order.
  const heading = (snapshot.title + " " + (snapshot.h1 || "")).toLowerCase();
  const headingBoosts = [
    { needles: ["login", "sign in"], intent: "AUTH", amount: 200 },
    { needles: ["checkout", "cart"], intent: "CHECKOUT", amount: 200 },
    { needles: ["search"], intent: "SEARCH", amount: 100 },
  ];
  for (const { needles, intent, amount } of headingBoosts) {
    if (needles.some((needle) => heading.includes(needle))) bump(intent, amount);
  }

  // Highest total wins; the sort is stable, so ties keep first-seen order.
  const [leader] = Object.entries(tally).sort(([, left], [, right]) => right - left);
  const dominantIntent = leader?.[0] || "NAVIGATION";

  return {
    url: snapshot.url,
    title: snapshot.title,
    dominantIntent,
    intentBreakdown: tally,
    classifiedElements,
    // Only interactive intents are high priority; NAVIGATION and CONTENT pages
    // get lighter coverage (2-3 structural tests).
    isHighPriority: HIGH_PRIORITY_INTENTS.has(dominantIntent),
    // How strongly the page matches its dominant intent, capped at 100.
    // Low confidence → the AI should generate fewer, more conservative tests.
    intentConfidence: Math.min(100, tally[dominantIntent] || 0),
  };
}
/**
 * classifyPageWithAI(snapshot, filteredElements, { signal }) → page intent summary
 *
 * Async entry point intended for the crawler pipeline. It is designed to
 * consult the AI when the heuristic confidence is below AI_THRESHOLD, but that
 * fallback is currently switched off, so the result is exactly what
 * classifyPage returns (wrapped in a promise).
 *
 * @param {object} snapshot - page snapshot, passed through to classifyPage
 * @param {object[]} filteredElements - elements, passed through to classifyPage
 * @param {object} [options]
 * @param {AbortSignal} [options.signal] - meant to be forwarded to AI calls so
 *   an abort stops classification; unused while the AI fallback is disabled
 * @returns {Promise<object>} the page intent summary
 */
export async function classifyPageWithAI(snapshot, filteredElements, { signal } = {}) {
  // The AI fallback is disabled to conserve LLM API quota (Gemini free tier:
  // 20 calls/day). The heuristic classifier now uses better keyword scoring and
  // element-type weighting, so AI assistance is not needed for typical pages.
  // To re-enable: delete the two statements below and uncomment the AI block.
  const heuristicOnly = classifyPage(snapshot, filteredElements);
  return heuristicOnly;
  /*
  const heuristic = classifyPage(snapshot, filteredElements);
  if (heuristic.intentConfidence >= AI_THRESHOLD) return heuristic;
  try {
    if (!hasProvider()) return heuristic;
    if (signal?.aborted) return heuristic;
    const aiResult = await aiClassifyPage(snapshot, signal);
    if (!aiResult) return heuristic;
    const isHighPriority = HIGH_PRIORITY_INTENTS.has(aiResult.intent);
    return {
      ...heuristic,
      dominantIntent: aiResult.intent,
      intentConfidence: aiResult.confidence,
      isHighPriority,
      _aiAssisted: true,
    };
  } catch (err) {
    if (err.name === "AbortError") throw err;
    return heuristic;
  }
  */
}
// Same-intent flows: two or more pages sharing an intent form one journey.
// `maxPages` (optional) caps how many of those pages the journey keeps.
const SAME_INTENT_FLOWS = [
  { intent: "CHECKOUT", name: "Checkout Flow", description: "Add to cart and purchase flow" },
  { intent: "SEARCH", name: "Search Flow", description: "Search and filter functionality" },
  { intent: "CRUD", name: "CRUD Flow", description: "Create, read, update, delete workflow", maxPages: 4 },
];

// Intents a page may have to count as the "confirmation" step after a form.
const CONFIRMATION_INTENTS = new Set(["CONTENT", "NAVIGATION"]);

/**
 * Remove the query string and fragment from a URL so it can be compared with
 * param-stripped outbound links. Input that is not a parseable absolute URL is
 * returned unchanged.
 */
function stripQueryAndHash(url) {
  try {
    const parsed = new URL(url);
    parsed.search = "";
    parsed.hash = "";
    return parsed.toString();
  } catch {
    return url;
  }
}

/**
 * buildUserJourneys(classifiedPages, snapshotsByUrl?) → Array of journey objects
 *
 * Chains related pages into GENUINE multi-page user journeys.
 * Single-page intents are NOT wrapped as journeys — they are handled
 * separately by generateIntentTests in journeyGenerator.js.
 *
 * Detection strategies (applied in order):
 * 1. Intent-based patterns — AUTH→dashboard, multi-CHECKOUT, multi-SEARCH, multi-CRUD
 * 2. Link-graph analysis — when snapshots are provided, discover cross-intent
 *    journeys by following outbound links between classified pages
 * 3. Form→confirmation — FORM_SUBMISSION page linking to a CONTENT/NAVIGATION page
 *
 * @param {object[]} classifiedPages - page summaries; reads `url`, `title`,
 *   `dominantIntent` and `isHighPriority`
 * @param {object} [snapshotsByUrl] - map of page URL → snapshot; only
 *   `outboundLinks` is read. `null`/`undefined` means "no snapshots".
 * @returns {object[]} journeys of shape `{ name, type, pages, description }`,
 *   plus `_discoveredBy` for journeys found by strategies 2 and 3
 */
export function buildUserJourneys(classifiedPages, snapshotsByUrl = {}) {
  const snapshots = snapshotsByUrl ?? {};
  const journeys = [];
  const usedUrls = new Set(); // URLs already placed in a journey
  const markUsed = (pages) => pages.forEach((p) => usedUrls.add(p.url));

  // ── 1. Intent-based pattern matching ───────────────────────────────────────
  // Auth flow — login page(s) → post-login destination.
  const authPages = classifiedPages.filter((p) => p.dominantIntent === "AUTH");
  const landingPages = classifiedPages.filter((p) => {
    // An AUTH page is never its own destination. Without this check a login
    // page whose URL merely contains "home" (e.g. the host name) was added
    // twice and could form a one-page "journey".
    if (p.dominantIntent === "AUTH") return false;
    const url = String(p.url ?? "");
    const title = String(p.title ?? "").toLowerCase();
    return url.includes("dashboard") || url.includes("home") || title.includes("dashboard");
  });
  if (authPages.length > 0 && landingPages.length > 0) {
    // At most 3 pages, and never more than 2 auth pages, so at least one
    // destination always makes it into the journey.
    const pages = [...authPages.slice(0, 2), ...landingPages].slice(0, 3);
    journeys.push({
      name: "Authentication Flow",
      type: "AUTH",
      pages,
      description: "User login and post-login navigation",
    });
    markUsed(pages);
  }

  // Checkout / Search / CRUD flows — only with at least two pages of that intent.
  for (const { intent, name, description, maxPages } of SAME_INTENT_FLOWS) {
    const matches = classifiedPages.filter((p) => p.dominantIntent === intent);
    if (matches.length < 2) continue;
    const pages = maxPages ? matches.slice(0, maxPages) : matches;
    journeys.push({ name, type: intent, pages, description });
    markUsed(pages);
  }

  // Strategies 2 and 3 both need outbound links from the snapshots.
  if (Object.keys(snapshots).length === 0) return journeys;

  // outboundLinks in pageSnapshot.js are normalised with ALL query params
  // stripped, while classified page URLs may keep significant query params
  // (e.g. /products?category=electronics). Index the classified pages by their
  // stripped URL once, keeping every variant in original order, so both
  // strategies resolve links the same way (#52 consistency fix).
  const pageByUrl = new Map();
  const pagesByStrippedUrl = new Map();
  for (const page of classifiedPages) {
    pageByUrl.set(page.url, page);
    const key = stripQueryAndHash(page.url);
    const variants = pagesByStrippedUrl.get(key);
    if (variants) variants.push(page);
    else pagesByStrippedUrl.set(key, [page]);
  }

  // ── 2. Link-graph journey discovery ────────────────────────────────────────
  // Discovers cross-intent journeys the pattern matcher misses
  // (e.g. pricing → signup → dashboard).
  // Adjacency: page URL → set of classified page URLs it links to.
  const adjacency = new Map();
  for (const page of classifiedPages) {
    const links = snapshots[page.url]?.outboundLinks;
    if (!links) continue;
    const targets = new Set();
    for (const link of links) {
      // First variant wins — the adjacency just needs any match.
      const target = pagesByStrippedUrl.get(link)?.[0];
      if (target && target.url !== page.url) targets.add(target.url);
    }
    adjacency.set(page.url, targets);
  }

  // Chains of 2-4 linked pages that are not part of a journey yet, started
  // only from high-priority pages.
  for (const startPage of classifiedPages) {
    if (usedUrls.has(startPage.url) || !startPage.isHighPriority) continue;
    if (!adjacency.get(startPage.url)?.size) continue;

    const chain = [startPage];
    const chainUrls = new Set([startPage.url]);
    let current = startPage;
    // Greedy walk: follow the first link to a page that is still available.
    for (let step = 0; step < 3; step++) {
      const neighbors = adjacency.get(current.url);
      if (!neighbors) break;
      let next = null;
      for (const neighborUrl of neighbors) {
        if (!chainUrls.has(neighborUrl) && !usedUrls.has(neighborUrl)) {
          next = pageByUrl.get(neighborUrl);
          break;
        }
      }
      if (!next) break;
      chain.push(next);
      chainUrls.add(next.url);
      current = next;
    }

    if (chain.length >= 2) {
      const intents = chain.map((p) => p.dominantIntent).join(" → ");
      journeys.push({
        name: `${chain[0].dominantIntent} → ${chain[chain.length - 1].dominantIntent} Flow`,
        type: chain[0].dominantIntent,
        pages: chain,
        description: `Cross-page flow: ${intents}`,
        _discoveredBy: "link_graph",
      });
      markUsed(chain);
    }
  }

  // ── 3. Form → confirmation journey ─────────────────────────────────────────
  // A FORM_SUBMISSION page that links to a CONTENT or NAVIGATION page is likely
  // a "submit form → see confirmation" flow.
  const formPages = classifiedPages.filter(
    (p) => p.dominantIntent === "FORM_SUBMISSION" && !usedUrls.has(p.url)
  );
  for (const formPage of formPages) {
    if (usedUrls.has(formPage.url)) continue; // duplicate entry already handled
    const links = snapshots[formPage.url]?.outboundLinks;
    if (!links) continue;
    for (const link of links) {
      const target = pagesByStrippedUrl.get(link)?.find(
        (p) => !usedUrls.has(p.url) && CONFIRMATION_INTENTS.has(p.dominantIntent)
      );
      if (!target) continue;
      journeys.push({
        name: "Form Submission Flow",
        type: "FORM_SUBMISSION",
        pages: [formPage, target],
        description: `Submit form on ${formPage.title} → confirmation on ${target.title}`,
        _discoveredBy: "form_confirmation",
      });
      usedUrls.add(formPage.url);
      usedUrls.add(target.url);
      break; // one confirmation page per form is enough
    }
  }

  // DO NOT create single-page "journeys" — those are handled by generateIntentTests.
  return journeys;
}