/**
* smartCrawl.js — Layer 6: Intelligent crawling with structure deduplication
*
* Detects duplicate page layouts, avoids revisiting same structures,
* prioritizes unique routes, stops exploring low-value paths.
*/
// ── URL value scoring ─────────────────────────────────────────────────────────
// Fragments looked up with `includes` in the lower-cased pathname by scoreUrl;
// a hit scores the URL 100.
const HIGH_VALUE_PATHS = [
  // sign-in / sign-up
  "/login",
  "/signin",
  "/register",
  "/signup",
  // purchase flow
  "/checkout",
  "/cart",
  "/payment",
  // per-user areas
  "/dashboard",
  "/account",
  "/profile",
  "/settings",
  // browsing
  "/search",
  "/products",
  "/shop",
  // back office
  "/admin",
  "/manage",
  // help
  "/contact",
  "/support",
];

// Fragments looked up with `includes` in the lower-cased pathname by scoreUrl;
// a hit scores the URL 0 (it is never queued).
const LOW_VALUE_PATHS = [
  "/cdn-",
  "/static/",
  "/assets/",
  "/images/",
  "/fonts/",
  "/favicon",
  "/robots",
  "/sitemap",
  "/.well-known",
];

// Pathname suffixes (checked with `endsWith` by scoreUrl) that score 0.
const LOW_VALUE_EXTENSIONS = [
  // images
  ".jpg",
  ".jpeg",
  ".png",
  ".gif",
  ".webp",
  ".svg",
  // styles, scripts, fonts
  ".css",
  ".js",
  ".woff",
  ".woff2",
  ".ttf",
  // documents, archives, data
  ".pdf",
  ".zip",
  ".xml",
  ".json",
];
/**
 * scoreUrl(url, baseUrl) → number
 *
 * Rates how worthwhile a URL is to crawl. 0 means "skip"; larger is better.
 *
 * @param {string} url - Absolute URL to rate.
 * @param {string} [baseUrl] - Kept for call-site compatibility; the scoring
 *   does not read it.
 * @returns {number}
 *   0   – unparsable, not http(s), static-file extension, or noise path;
 *   100 – pathname contains a high-value fragment;
 *   20  – more than four path segments;
 *   15  – more than two query parameters;
 *   25  – pathname ends in a numeric segment;
 *   50  – anything else.
 */
export function scoreUrl(url, baseUrl) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    // Not a parsable absolute URL: nothing to crawl.
    return 0;
  }

  // Decide on the *parsed* scheme. The parser lower-cases it and trims
  // surrounding whitespace, so "HTTP://…" is accepted, while look-alike
  // schemes such as "httpx:" are rejected.
  if (parsed.protocol !== "http:" && parsed.protocol !== "https:") return 0;

  const path = parsed.pathname.toLowerCase();

  // Skip binary/static files
  if (LOW_VALUE_EXTENSIONS.some((ext) => path.endsWith(ext))) return 0;

  // Skip known noise paths
  if (LOW_VALUE_PATHS.some((fragment) => path.includes(fragment))) return 0;

  // High value paths
  if (HIGH_VALUE_PATHS.some((fragment) => path.includes(fragment))) return 100;

  // Penalize very deep paths (likely pagination / details)
  const depth = path.split("/").filter(Boolean).length;
  if (depth > 4) return 20;

  // Penalize query-string heavy URLs (likely filters, not new pages).
  // Count the keys instead of reading `searchParams.size`: that property is
  // missing on older runtimes, where `undefined > 2` would silently disable
  // this rule.
  const paramCount = [...parsed.searchParams.keys()].length;
  if (paramCount > 2) return 15;

  // Penalize numbered segments (pagination: /page/2, /products/123)
  if (/\/\d+\/?$/.test(path)) return 25;

  // Default
  return 50;
}
// ── Structure fingerprinting ──────────────────────────────────────────────────
/**
 * fingerprintStructure(snapshot) → string
 *
 * Condenses a page snapshot into a short base-36 token that depends only on
 * the page's shape: the multiset of `tag:type` pairs in `snapshot.elements`,
 * the value of `snapshot.forms` (0 when falsy) and whether `snapshot.h1` is
 * truthy. Element order and text content do not influence the result, so two
 * pages rendered from the same template (e.g. blog post A vs B) collide.
 *
 * @param {{elements?: Array<{tag: string, type?: string}>, forms?: number, h1?: *}} snapshot
 * @returns {string} Base-36 rendering of the absolute 32-bit hash.
 */
export function fingerprintStructure(snapshot) {
  const describeElement = (el) => `${el.tag}:${el.type || ""}`;

  // Sorting makes the descriptor independent of DOM order.
  const pairs = (snapshot.elements || []).map(describeElement);
  pairs.sort();

  const descriptor = [
    `forms:${snapshot.forms || 0}`,
    `h1:${snapshot.h1 ? "1" : "0"}`,
    pairs.join(","),
  ].join("|");

  // 31-multiplier rolling hash over UTF-16 code units, truncated to a signed
  // 32-bit integer after every step.
  let acc = 0;
  for (let pos = 0; pos < descriptor.length; pos += 1) {
    acc = ((acc << 5) - acc + descriptor.charCodeAt(pos)) | 0;
  }
  return Math.abs(acc).toString(36);
}
// ── Smart queue management ────────────────────────────────────────────────────
/**
 * Score-ordered queue of URLs still to crawl, together with the sets of
 * visited URLs and page-structure fingerprints already seen.
 */
export class SmartCrawlQueue {
  /**
   * @param {string} baseUrl - Passed as the second argument to scoreUrl for
   *   every candidate.
   */
  constructor(baseUrl) {
    this.baseUrl = baseUrl;
    this.visited = new Set();
    this.structuresSeen = new Set();
    this.queue = []; // { url, depth, score }, highest score first
  }

  /**
   * Offer a URL to the queue. It is ignored when it was already visited,
   * does not start with "http", or scores 0.
   *
   * A URL that is already waiting in the queue is not added a second time
   * (pages commonly link to the same target before it has been crawled);
   * the pending entry just keeps the shallowest depth it was reached at.
   *
   * @param {string} url
   * @param {number} depth - Link distance recorded on the queue entry.
   */
  enqueue(url, depth) {
    if (this.visited.has(url)) return;
    if (!url.startsWith("http")) return;

    const pending = this.queue.find((entry) => entry.url === url);
    if (pending) {
      // Comparison is false for undefined/NaN, so a missing depth never
      // overwrites a recorded one.
      if (depth < pending.depth) pending.depth = depth;
      return;
    }

    const score = scoreUrl(url, this.baseUrl);
    if (score === 0) return; // Skip worthless URLs

    this.queue.push({ url, depth, score });
    // Keep queue sorted by score (high value first). Array.prototype.sort is
    // stable, so equal scores stay in insertion order.
    this.queue.sort((a, b) => b.score - a.score);
  }

  /**
   * Remove and return the highest-scoring entry.
   * @returns {{url: string, depth: number, score: number} | null} null when empty.
   */
  dequeue() {
    return this.queue.shift() || null;
  }

  /** Record a URL so later enqueue calls for it are ignored. */
  markVisited(url) {
    this.visited.add(url);
  }

  /** Record a structure fingerprint as seen. */
  markStructureSeen(fingerprint) {
    this.structuresSeen.add(fingerprint);
  }

  /** @returns {boolean} true when the fingerprint was recorded before. */
  isStructureDuplicate(fingerprint) {
    return this.structuresSeen.has(fingerprint);
  }

  /** Number of entries waiting in the queue. */
  get size() {
    return this.queue.length;
  }

  /** Number of URLs marked visited. */
  get visitedCount() {
    return this.visited.size;
  }

  /** @returns {boolean} true while at least one entry is waiting. */
  hasMore() {
    return this.queue.length > 0;
  }
}
// ── Path deduplication ────────────────────────────────────────────────────────
/**
 * extractPathPattern(url) → string
 *
 * Builds a dedup key of the form `hostname + path` in which every path
 * segment made up solely of digits is replaced by ":id". /products/123 and
 * /products/456 therefore both map to /products/:id, so only one of them
 * needs crawling. Scheme, port, query string and fragment are not part of
 * the key.
 *
 * @param {string} url
 * @returns {string} The pattern, or `url` unchanged when it cannot be parsed.
 */
export function extractPathPattern(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return url;
  }

  const segments = parsed.pathname.split("/");
  for (let i = 0; i < segments.length; i += 1) {
    if (/^\d+$/.test(segments[i])) {
      segments[i] = ":id";
    }
  }
  return parsed.hostname + segments.join("/");
}
// ── Query-param-aware path deduplication (#52 defect #1) ─────────────────────
/**
 * Query parameter names that carry state-significant meaning.
 *
 * Entries are lower-case; extractPathPatternWithParams lower-cases a key
 * before looking it up here.
 * Exported so stateFingerprint.js can reuse the same set (DRY).
 */
export const SIGNIFICANT_PARAMS = new Set([
  "category",
  "sort",
  "order",
  "view",
  "tab",
  "page",
  "filter",
  "type",
  "status",
  "q",
  "query",
  "search",
  "mode",
  "step",
  "section",
  "panel",
  "lang",
  "locale",
]);
/**
 * Query parameter patterns that are always noise.
 *
 * Each pattern is tested against the parameter name; all are
 * case-insensitive. Anchored patterns match a whole name or a prefix,
 * unanchored ones match anywhere in the name.
 * Exported so stateFingerprint.js can reuse the same list (DRY).
 */
export const NOISE_PARAMS = [
  /^utm_/i,
  /^fbclid$/i,
  /^gclid$/i,
  /^_ga$/i,
  /^mc_/i,
  /^ref$/i,
  /^source$/i,
  // substring matches
  /token/i,
  /session/i,
  /nonce/i,
  /timestamp/i,
  // very short exact names
  /^_$/i,
  /^cb$/i,
  /^t$/i,
];
/**
 * extractPathPatternWithParams(url) → string
 *
 * Variant of {@link extractPathPattern} whose result also carries the
 * significant query parameters, so `/products?category=electronics` and
 * `/products?category=books` yield different patterns.
 *
 * A parameter is kept when its name matches no entry of NOISE_PARAMS and its
 * lower-cased name is in SIGNIFICANT_PARAMS. Kept parameters are emitted as
 * `name=value` (name lower-cased, value as decoded by URLSearchParams),
 * sorted, and joined with "&".
 *
 * Used by the state explorer where query params are preserved (#52 defect #1).
 * The original {@link extractPathPattern} (without params) is still used by
 * crawlBrowser.js where query params are stripped before pattern extraction.
 *
 * @param {string} url
 * @returns {string} `hostname + path pattern [+ "?" + params]`, or `url`
 *   unchanged when anything in the computation throws (e.g. unparsable URL).
 */
export function extractPathPatternWithParams(url) {
  try {
    const parsed = new URL(url);

    // Path part: purely numeric segments collapse to ":id".
    const pathPattern = parsed.pathname
      .split("/")
      .map((part) => (/^\d+$/.test(part) ? ":id" : part))
      .join("/");

    // Query part: allow-listed names only, in a canonical (sorted) order.
    const isNoise = (name) => NOISE_PARAMS.some((re) => re.test(name));
    const kept = Array.from(parsed.searchParams)
      .filter(([name]) => !isNoise(name) && SIGNIFICANT_PARAMS.has(name.toLowerCase()))
      .map(([name, value]) => `${name.toLowerCase()}=${value}`)
      .sort();
    const query = kept.length === 0 ? "" : `?${kept.join("&")}`;

    return `${parsed.hostname}${pathPattern}${query}`;
  } catch {
    return url;
  }
}
/**
 * Remove every query parameter whose name matches a NOISE_PARAMS pattern;
 * all other parameters are left as they are.
 *
 * Shared utility for both crawlBrowser.js and stateExplorer.js so link
 * normalisation is consistent across crawl modes (#52 defect #1).
 *
 * @param {URL} u — mutable URL object (modified in place)
 * @returns {void}
 */
export function stripNoiseParams(u) {
  // Snapshot the distinct names first: deleting while walking the live
  // searchParams iterator would skip entries.
  const names = new Set(u.searchParams.keys());
  names.forEach((name) => {
    const isNoise = NOISE_PARAMS.some((re) => re.test(name));
    if (isNoise) {
      u.searchParams.delete(name);
    }
  });
}