/**
* @module utils/robotsSitemap
* @description Fetches and parses robots.txt rules and sitemap.xml URLs for
* crawl compliance. Zero external dependencies — uses the global `fetch()` API
* available in Node 18+.
*
* ### Design decisions
* - Only the `Sentri` and `*` user-agent groups are evaluated.
 * - The longest matching prefix wins; when an `Allow` and a `Disallow` pattern
 *   tie on length, `Allow` takes precedence, matching Google's interpretation.
* - Sitemap parsing handles both `<sitemapindex>` (recursive) and `<urlset>`
* formats. Gzip sitemaps are NOT supported (would require `zlib`); they are
* silently skipped.
 * - All network errors are swallowed. A missing robots.txt (4xx) means "allow
 *   everything", per the standard; unreachable hosts and 5xx responses are
 *   treated the same way for simplicity, which is more permissive than the
 *   "assume complete disallow" guidance in RFC 9309.
*
* ### Exports
* - {@link loadRobotsRules} — fetch + parse robots.txt → rules object
* - {@link isAllowed} — check a URL against parsed rules
* - {@link loadSitemapUrls} — fetch + parse sitemap.xml → URL list
*
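 * @example
 * // End-to-end sketch: discover crawlable URLs for one site ("example.com" is illustrative).
 * const robots = await loadRobotsRules("https://example.com");
 * const urls = await loadSitemapUrls("https://example.com", robots.sitemaps);
 * const crawlable = urls.filter((u) => isAllowed(u, robots));
 *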
* @see https://datatracker.ietf.org/doc/html/rfc9309
*/
// ── robots.txt parsing ───────────────────────────────────────────────────────
/**
* @typedef {Object} RobotsRules
* @property {Array<{pattern: string, allow: boolean}>} rules — sorted longest-first
* @property {string[]} sitemaps — Sitemap URLs declared in robots.txt
*/
/**
* Parse raw robots.txt content into structured rules.
*
* Only rules for `User-agent: Sentri` or `User-agent: *` are kept.
* The Sentri-specific group takes priority if present.
*
* @param {string} text — raw robots.txt content
* @returns {RobotsRules}
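 * @example
 * // Small illustrative group (not a real site):
 * const robots = parseRobotsTxt(
 *   "User-agent: *\nDisallow: /admin\nAllow: /admin/public\nSitemap: https://example.com/sitemap.xml"
 * );
 * // robots.rules    → [{ pattern: "/admin/public", allow: true }, { pattern: "/admin", allow: false }]
 * // robots.sitemaps → ["https://example.com/sitemap.xml"]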
*/
export function parseRobotsTxt(text) {
const lines = text.split(/\r?\n/);
const sitemaps = [];
// Collect rules per user-agent group
/** @type {Map<string, Array<{pattern: string, allow: boolean}>>} */
const groups = new Map();
  let currentAgents = [];
  let inAgentList = false; // true while consecutive User-agent lines are being collected
for (const raw of lines) {
const line = raw.trim();
// Skip comments and empty lines
if (!line || line.startsWith("#")) continue;
const colonIdx = line.indexOf(":");
if (colonIdx === -1) continue;
const directive = line.slice(0, colonIdx).trim().toLowerCase();
const value = line.slice(colonIdx + 1).split("#")[0].trim();
if (directive === "sitemap") {
if (value) sitemaps.push(value);
continue;
}
if (directive === "user-agent") {
const agent = value.toLowerCase();
// If the previous line was also a user-agent, accumulate (multi-agent group)
if (currentAgents.length > 0 && !groups.has(currentAgents[0])) {
// Still building agent list for this group
currentAgents.push(agent);
} else {
currentAgents = [agent];
}
continue;
}
if (directive === "allow" || directive === "disallow") {
if (!value && directive === "disallow") continue; // empty Disallow = allow all
for (const agent of currentAgents) {
if (!groups.has(agent)) groups.set(agent, []);
groups.get(agent).push({ pattern: value || "/", allow: directive === "allow" });
}
}
}
// Prefer Sentri-specific rules, fall back to wildcard
const sentri = groups.get("sentri") || groups.get("sentri/1.0");
const wildcard = groups.get("*");
const rules = sentri || wildcard || [];
// Sort by pattern length descending — longest match wins.
// At equal length, Allow takes precedence over Disallow per RFC 9309.
rules.sort((a, b) => b.pattern.length - a.pattern.length || (b.allow ? 1 : 0) - (a.allow ? 1 : 0));
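  // e.g. "Allow: /p" and "Disallow: /p" tie on length and sort as [allow, disallow],
  // so isAllowed() hits the Allow rule first and "/p" is permitted.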
return { rules, sitemaps };
}
/**
* Fetch and parse robots.txt from a base URL.
*
* @param {string} baseUrl — site origin (e.g. "https://example.com")
* @param {object} [opts]
* @param {number} [opts.timeoutMs=5000]
* @returns {Promise<RobotsRules>}
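 * @example
 * // Usage sketch ("example.com" is illustrative):
 * const robots = await loadRobotsRules("https://example.com", { timeoutMs: 3000 });
 * // robots.rules is [] when robots.txt is missing or unreachable (allow everything).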
*/
export async function loadRobotsRules(baseUrl, { timeoutMs = 5000 } = {}) {
try {
const origin = new URL(baseUrl).origin;
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), timeoutMs);
const res = await fetch(`${origin}/robots.txt`, {
signal: controller.signal,
headers: { "User-Agent": "Mozilla/5.0 (compatible; Sentri/1.0)" },
});
clearTimeout(timer);
if (!res.ok) return { rules: [], sitemaps: [] };
const text = await res.text();
return parseRobotsTxt(text);
} catch {
// Network error, timeout, or invalid URL — allow everything
return { rules: [], sitemaps: [] };
}
}
/**
* Check whether a URL is allowed by the parsed robots.txt rules.
*
* Uses longest-prefix matching: the rule whose pattern is the longest prefix
* of the URL path wins. If no rule matches, the URL is allowed (default).
*
* @param {string} url — full URL to check
* @param {RobotsRules} robotsRules — from {@link loadRobotsRules}
* @returns {boolean}
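 * @example
 * // Rules from an illustrative robots.txt:
 * const robots = parseRobotsTxt("User-agent: *\nDisallow: /admin\nAllow: /admin/public");
 * isAllowed("https://example.com/admin/login", robots);        // → false
 * isAllowed("https://example.com/admin/public/docs", robots);  // → true
 * isAllowed("https://example.com/blog", robots);               // → true (no matching rule)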
*/
export function isAllowed(url, robotsRules) {
if (!robotsRules || !robotsRules.rules || robotsRules.rules.length === 0) return true;
try {
const path = new URL(url).pathname;
// Find the first (longest) matching rule
for (const rule of robotsRules.rules) {
// Simple prefix match — handles most real-world robots.txt patterns.
// Wildcard (*) and end-of-string ($) patterns from the spec are rare
// and not worth the complexity for a QA crawler.
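      // e.g. a "/private" rule also matches "/private/page" and "/private-notes".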
if (path.startsWith(rule.pattern)) {
return rule.allow;
}
}
return true; // no matching rule = allowed
} catch { return true; }
}
// ── sitemap.xml parsing ──────────────────────────────────────────────────────
/**
* Extract URLs from a sitemap XML string.
*
* Handles both `<urlset>` (leaf sitemap) and `<sitemapindex>` (index pointing
* to child sitemaps). Uses regex extraction instead of a full XML parser to
* avoid adding a dependency.
*
* @param {string} xml — raw sitemap XML content
* @returns {{ urls: string[], childSitemaps: string[] }}
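 * @example
 * // Leaf sitemap (illustrative XML):
 * parseSitemapXml("<urlset><url><loc>https://example.com/a</loc></url></urlset>");
 * // → { urls: ["https://example.com/a"], childSitemaps: [] }
 * @example
 * // Sitemap index:
 * parseSitemapXml("<sitemapindex><sitemap><loc>https://example.com/child.xml</loc></sitemap></sitemapindex>");
 * // → { urls: [], childSitemaps: ["https://example.com/child.xml"] }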
*/
export function parseSitemapXml(xml) {
  const urls = [];
  const childSitemaps = [];
  // <loc> values are XML-escaped per the sitemap protocol; decode the predefined
  // entities so query-string URLs ("?a=1&amp;b=2") come back intact. "&amp;" is
  // decoded last to avoid double-unescaping.
  const decodeEntities = (s) => s
    .replace(/&lt;/g, "<").replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"').replace(/&apos;/g, "'")
    .replace(/&amp;/g, "&");
// Extract <loc> values from <url> entries (leaf sitemap)
// Use [\s\S]*? to allow intervening child elements (e.g. <lastmod>)
// between <url> and <loc>, since the XML sitemap spec does not mandate
// element order.
const urlLocRe = /<url>[\s\S]*?<loc>\s*([^<]+?)\s*<\/loc>/gi;
let match;
while ((match = urlLocRe.exec(xml)) !== null) {
    const loc = decodeEntities(match[1].trim());
if (loc.startsWith("http")) urls.push(loc);
}
// Extract <loc> values from <sitemap> entries (sitemap index)
const sitemapLocRe = /<sitemap>[\s\S]*?<loc>\s*([^<]+?)\s*<\/loc>/gi;
while ((match = sitemapLocRe.exec(xml)) !== null) {
    const loc = decodeEntities(match[1].trim());
if (loc.startsWith("http")) childSitemaps.push(loc);
}
return { urls, childSitemaps };
}
/**
* Fetch and parse sitemap URLs from a base URL.
*
 * Tries the URLs declared in robots.txt `Sitemap:` directives; when none are
 * declared, falls back to the conventional `/sitemap.xml` location. Follows
 * one level of sitemap index indirection.
*
* @param {string} baseUrl — site origin
* @param {string[]} [declaredSitemaps] — Sitemap URLs from robots.txt
* @param {object} [opts]
* @param {number} [opts.timeoutMs=5000]
* @param {number} [opts.maxUrls=200] — cap to avoid memory issues on huge sitemaps
* @returns {Promise<string[]>} — deduplicated list of page URLs
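 * @example
 * // Typical flow: reuse the sitemaps declared in robots.txt (sketch, illustrative origin):
 * const robots = await loadRobotsRules("https://example.com");
 * const pages = await loadSitemapUrls("https://example.com", robots.sitemaps, { maxUrls: 100 });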
*/
export async function loadSitemapUrls(baseUrl, declaredSitemaps = [], { timeoutMs = 5000, maxUrls = 200 } = {}) {
let origin;
try { origin = new URL(baseUrl).origin; } catch { return []; }
const sitemapUrls = declaredSitemaps.length > 0
? [...declaredSitemaps]
: [`${origin}/sitemap.xml`];
const allUrls = new Set();
for (const sitemapUrl of sitemapUrls) {
if (allUrls.size >= maxUrls) break;
// Skip gzipped sitemaps — would require zlib
if (sitemapUrl.endsWith(".gz")) continue;
try {
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), timeoutMs);
const res = await fetch(sitemapUrl, {
signal: controller.signal,
headers: { "User-Agent": "Mozilla/5.0 (compatible; Sentri/1.0)" },
});
clearTimeout(timer);
if (!res.ok) continue;
const xml = await res.text();
const { urls, childSitemaps } = parseSitemapXml(xml);
for (const u of urls) {
if (allUrls.size >= maxUrls) break;
allUrls.add(u);
}
// Follow one level of sitemap index
for (const childUrl of childSitemaps) {
if (allUrls.size >= maxUrls) break;
if (childUrl.endsWith(".gz")) continue;
try {
const c = new AbortController();
const t = setTimeout(() => c.abort(), timeoutMs);
const childRes = await fetch(childUrl, {
signal: c.signal,
headers: { "User-Agent": "Mozilla/5.0 (compatible; Sentri/1.0)" },
});
clearTimeout(t);
if (!childRes.ok) continue;
const childXml = await childRes.text();
const { urls: childPageUrls } = parseSitemapXml(childXml);
for (const u of childPageUrls) {
if (allUrls.size >= maxUrls) break;
allUrls.add(u);
}
} catch { /* skip unreachable child sitemap */ }
}
} catch { /* skip unreachable sitemap */ }
}
return [...allUrls];
}