/**
* feedbackLoop.js — Layer 5: Analyze run results, track failure patterns, improve tests
*
* Pipeline: generate → run → analyze → improve → rerun
*
* Failure categories (in classification priority order):
* SELECTOR_ISSUE — element not found, locator broke
* URL_MISMATCH — toHaveURL assertion failed, URL redirect, or page.url() mismatch
* NAVIGATION_FAIL — page didn't load or wrong URL
* NETWORK_MOCK_FAIL — page.route()/route.fulfill() interception broke
* FRAME_FAIL — iframe/frameLocator() interaction failed
* API_ASSERTION_FAIL — API request/response validation failed
* ASSERTION_FAIL — assertion value mismatch
* TIMEOUT — element wait exceeded timeout
* UNKNOWN — unclassified failure (fallback)
*
* Quality analytics (P3):
* - Failure breakdown by category, test type, prompt version, assertion pattern
* - Flaky test detection across run history
* - Actionable insights for prompt improvement
*/
import { generateText, parseJSON } from "../aiProvider.js";
import { throwIfAborted } from "../utils/abortHelper.js";
import * as testRepo from "../database/repositories/testRepo.js";
import * as runRepo from "../database/repositories/runRepo.js";
import { getPromptRules } from "../selfHealing.js";
import { getTier, TIER_CONFIG } from "./prompts/promptTiers.js";
import { buildCapabilityCoverageBlock } from "./prompts/playwrightCapabilityGuide.js";
// ── Failure classification ────────────────────────────────────────────────────
//
// Priority-ordered array of [category, patterns] tuples.
//
// Order matters: the first matching category wins. An ordered array (rather
// than a plain object) makes the priority explicit at a glance and immune to
// object key-ordering subtleties (e.g. integer-like keys iterating first).
//
// Priority rationale:
// 1. SELECTOR_ISSUE — checked first because "waiting for locator … Timeout
// 30000ms exceeded" matches both SELECTOR_ISSUE and TIMEOUT. A locator
// failure is the root cause; the timeout is the symptom. Reporting the
// root cause produces more actionable self-healing hints.
// 2. URL_MISMATCH — specific navigation-result error; distinct from a
// general navigation failure.
// 3. NAVIGATION_FAIL — network / goto errors.
// 4. NETWORK_MOCK_FAIL, FRAME_FAIL, API_ASSERTION_FAIL — specialized failure
// modes (route mocking, iframe access, API contract checks) that would
// otherwise fall through to the generic assertion bucket.
// 5. ASSERTION_FAIL — generic expect() mismatch (lower specificity than the
// above, so checked after them).
// 6. TIMEOUT — catch-all for any remaining timeout messages that were
// not already classified as a selector or navigation issue.
const FAILURE_PATTERNS = [
["SELECTOR_ISSUE", [
/locator.*not found/i,
/element not visible/i,
/no elements found/i,
/waiting for locator/i,
/element handle is not attached/i,
/strict mode violation/i,
]],
["URL_MISMATCH", [
/url mismatch/i,
/redirected to unexpected url/i,
/page\.url\(\).*not.*match/i,
/expect\(received\)\.toHaveURL\(expected\)/i,
/toHaveURL.*received/i,
]],
["NAVIGATION_FAIL", [
/net::ERR/i,
/page.goto/i,
/navigation failed/i,
/timeout.*navigation/i,
/ERR_NAME_NOT_RESOLVED/i,
]],
["NETWORK_MOCK_FAIL", [
/page\.route/i,
/route\.fulfill/i,
/route handler/i,
/mock(ed)? response/i,
]],
["FRAME_FAIL", [
/frameLocator/i,
/frame .* not found/i,
/iframe.*not found/i,
/cannot access iframe/i,
]],
["API_ASSERTION_FAIL", [
/request\.newContext.*(?:status|schema|contract|body)/i,
/api\.(?:get|post|put|patch|delete|fetch).*(?:status|schema|contract|body)/i,
/api response (?:status|schema|contract)/i,
/\bres\.status\(\)/i,
]],
["ASSERTION_FAIL", [
/expect.*received/i,
/toHave.*expected/i,
/toBeVisible.*expected/i,
/matcher error/i,
]],
["TIMEOUT", [
/timeout \d+ms exceeded/i,
/waiting for.*timeout/i,
/Test timeout/i,
]],
];
export function classifyFailure(errorMessage) {
if (!errorMessage) return "UNKNOWN";
for (const [category, patterns] of FAILURE_PATTERNS) {
if (patterns.some(p => p.test(errorMessage))) return category;
}
return "UNKNOWN";
}
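// A quick illustration of the priority ordering (the error string below is a
// hypothetical Playwright-style message, trimmed for brevity):
//
//   classifyFailure("waiting for locator('button') Timeout 30000ms exceeded");
//   // → "SELECTOR_ISSUE": /waiting for locator/i matches before any TIMEOUT
//   //   pattern is tried, so the root cause wins over the timeout symptom.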
// ── Assertion pattern extraction ──────────────────────────────────────────────
// Extracts which Playwright assertion method caused the failure so we can
// track which assertion types are most fragile across runs.
const ASSERTION_METHOD_RE = /\.(toHaveURL|toHaveTitle|toBeVisible|toContainText|toHaveText|toHaveValue|toBeEnabled|toBeDisabled|toHaveCount|toBeChecked)\b/i;
function extractFailedAssertionMethod(errorMessage) {
const match = (errorMessage || "").match(ASSERTION_METHOD_RE);
return match ? match[1] : null;
}
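// Example (hypothetical error text): a message containing
// "expect(page).toHaveURL(expected)" yields "toHaveURL", which feeds the
// failedAssertionMethods tally in buildQualityAnalytics() below; messages
// with no recognized matcher return null and are not tallied.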
// ── Flakiness detection ───────────────────────────────────────────────────────
export function detectFlakiness(testHistory) {
// testHistory = array of "passed"|"failed"|"warning" strings
if (testHistory.length < 2) return false;
const statuses = new Set(testHistory);
return statuses.has("passed") && statuses.has("failed");
}
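// Example: detectFlakiness(["passed", "failed", "passed"]) → true (both
// outcomes observed). detectFlakiness(["failed", "failed"]) → false: a test
// that fails every time is broken, not flaky.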
/**
* detectFlakyTests(projectId) → Map<testId, flakyInfo>
*
* Scans all run results for a project and identifies tests that have both
* passed and failed across different runs.
*/
export function detectFlakyTests(projectId) {
const testResults = new Map(); // testId → { passes, fails }
const allRuns = runRepo.getByProjectId(projectId);
for (const run of allRuns) {
if (!run.results) continue;
for (const result of run.results) {
if (!testResults.has(result.testId)) {
testResults.set(result.testId, { passes: 0, fails: 0 });
}
const entry = testResults.get(result.testId);
if (result.status === "passed") entry.passes++;
if (result.status === "failed") entry.fails++;
}
}
const flakyTests = new Map();
for (const [testId, { passes, fails }] of testResults) {
if (passes > 0 && fails > 0) {
const test = testRepo.getById(testId);
const total = passes + fails;
flakyTests.set(testId, {
testId,
name: test?.name || "Unknown",
passCount: passes,
failCount: fails,
flakyRate: Math.round((Math.min(passes, fails) / total) * 100),
});
}
}
return flakyTests;
}
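// Worked example of the flakyRate formula: 7 passes and 3 fails give
// round(min(7, 3) / 10 * 100) = 30. Using min() keeps the rate symmetric
// (3 passes / 7 fails also scores 30); an even split scores 50, the maximum.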
// ── Quality analytics ────────────────────────────────────────────────────────
// Correlates failures with test metadata (type, promptVersion, modelUsed,
// assertion patterns) to produce actionable insights for prompt improvement.
/**
* buildQualityAnalytics(improvements, testMap) → analytics object
*
* Produces a structured breakdown of failures for the run record.
* Note: testMap is currently unused; each improvement already carries its
* test object (imp.test), so all metadata is read from there.
*/
export function buildQualityAnalytics(improvements, testMap) {
const byCategory = {};
const byType = {};
const byPromptVersion = {};
const byModel = {};
const failedAssertionMethods = {};
for (const imp of improvements) {
const t = imp.test;
// By failure category
byCategory[imp.failureCategory] = (byCategory[imp.failureCategory] || 0) + 1;
// By test type
const type = t.type || "unknown";
byType[type] = (byType[type] || 0) + 1;
// By prompt version
const pv = t.promptVersion || "unknown";
byPromptVersion[pv] = (byPromptVersion[pv] || 0) + 1;
// By AI model
const model = t.modelUsed || "unknown";
byModel[model] = (byModel[model] || 0) + 1;
// By assertion method that failed
const method = extractFailedAssertionMethod(imp.errorMessage);
if (method) {
failedAssertionMethods[method] = (failedAssertionMethods[method] || 0) + 1;
}
}
// Generate actionable insights
const insights = [];
if (byCategory.URL_MISMATCH > 0) {
insights.push(`${byCategory.URL_MISMATCH} test(s) failed on URL assertions — consider switching to content-based assertions (toBeVisible, toContainText) instead of toHaveURL.`);
}
if (byCategory.SELECTOR_ISSUE > 0) {
insights.push(`${byCategory.SELECTOR_ISSUE} test(s) failed on selectors — the AI may be generating CSS selectors instead of using self-healing helpers (safeClick, safeFill, safeExpect).`);
}
if (byCategory.TIMEOUT > 0) {
insights.push(`${byCategory.TIMEOUT} test(s) timed out — likely using waitForLoadState('networkidle') or insufficient timeouts. Check for SPA-heavy pages.`);
}
if (failedAssertionMethods.toHaveURL > 0) {
const maxMethod = Object.entries(failedAssertionMethods).sort((a, b) => b[1] - a[1])[0];
const qualifier = maxMethod && maxMethod[0] === "toHaveURL" ? "the most fragile" : "a fragile";
insights.push(`toHaveURL is ${qualifier} assertion (${failedAssertionMethods.toHaveURL} failure${failedAssertionMethods.toHaveURL !== 1 ? "s" : ""}). Prefer asserting visible page content over URL patterns.`);
}
return {
byCategory,
byType,
byPromptVersion,
byModel,
failedAssertionMethods,
insights,
totalFailures: improvements.length,
};
}
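// Sketch of the returned shape (all values hypothetical):
//
//   {
//     byCategory: { URL_MISMATCH: 2, SELECTOR_ISSUE: 1 },
//     byType: { functional: 3 },
//     byPromptVersion: { "v4": 3 },
//     byModel: { "gpt-4o": 3 },
//     failedAssertionMethods: { toHaveURL: 2 },
//     insights: ["2 test(s) failed on URL assertions — consider ..."],
//     totalFailures: 3,
//   }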
// ── Improvement prompt builder ────────────────────────────────────────────────
function buildImprovementPrompt(test, failureCategory, errorMessage, snapshot, tier) {
const categoryInstructions = {
NETWORK_MOCK_FAIL: `The test failed around network interception/mocking.
Fix by:
- Preserving page.route()/route.fulfill() flow — do not remove mock setup
- Ensuring mocked response shape matches app expectations (keys/types)
- Keeping assertions aligned to the mocked payload and rendered UI`,
FRAME_FAIL: `The test failed inside an iframe/frame context.
Fix by:
- Using frameLocator() targeting the correct iframe selector/title/name
- Performing interactions/assertions on frame-scoped locators
- Avoiding page-level selectors for frame-contained elements`,
API_ASSERTION_FAIL: `The test failed in API request/response validation.
Fix by:
- Keeping request.newContext() calls and endpoint method usage intact
- Asserting status/body against actual API contract (types + required keys)
- Avoiding UI-only page assertions for API-only tests`,
SELECTOR_ISSUE: `The test failed because a selector couldn't find an element.
Rewrite using more resilient selectors:
- Use getByRole(), getByLabel(), getByText() instead of CSS selectors
- Use .filter({ hasText: /.../ }) for specificity
- Add .first() to avoid strict mode violations
- Avoid nth-child, position-based selectors`,
URL_MISMATCH: `The test failed because a toHaveURL() assertion didn't match the actual URL.
Real-world sites redirect unpredictably (CAPTCHAs, consent pages, geo-redirects, login walls).
Fix by:
- REMOVE the toHaveURL() assertion entirely
- Replace it with a CONTENT assertion: await expect(page.getByText('expected heading')).toBeVisible()
- If you must check the URL, use the LOOSEST hostname-only regex: await expect(page).toHaveURL(/example\\.com/i)
- NEVER match on path segments or query params`,
NAVIGATION_FAIL: `The test failed due to navigation issues.
Fix by:
- Using { waitUntil: 'domcontentloaded' } instead of 'networkidle'
- Adding a retry mechanism for page.goto()
- Checking the URL is correct and accessible`,
TIMEOUT: `The test timed out waiting for elements.
Fix by:
- Increasing timeout: { timeout: 30000 }
- Using await page.waitForSelector('selector', { timeout: 15000 }) before assertions
- Using { waitUntil: 'domcontentloaded' } after navigation — NEVER use 'networkidle'
- Adding await page.waitForLoadState('domcontentloaded') after page.goto()`,
ASSERTION_FAIL: `The assertion failed: the actual value didn't match the expected one.
This often happens because the test hard-coded a crawl-time value that changed at runtime.
Fix by:
- Using softer matchers: toContainText instead of toHaveText for any text that may vary
- Using regex patterns for dynamic content: dates (/\\d{4}-\\d{2}-\\d{2}/), IDs (/Order #\\d+/), prices (/\\$[\\d,.]+/), UUIDs (/[a-f0-9-]{36}/)
- For personalized text (e.g. "Welcome John"), assert only the static label: toContainText('Welcome')
- For counts that change, use not.toHaveCount(0) instead of toHaveCount(N)
- For toasts/notifications, use toContainText(/success|saved|created|updated|deleted/i)
- Adding proper wait before assertion: await expect(locator).toContainText('expected', { timeout: 10000 })
- Asserting on what's actually present on the page — check the error message for the "received" value`,
UNKNOWN: `The test failed for an unknown reason.
Rewrite more defensively:
- Wrap risky operations in try/catch
- Use .catch(() => {}) for optional assertions
- Add explicit waits before interactions`,
};
return `You are a senior QA engineer fixing a broken Playwright test.
FAILED TEST:
Name: ${test.name}
URL: ${test.sourceUrl}
Error: ${errorMessage}
Failure Category: ${failureCategory}
ORIGINAL CODE:
${test.playwrightCode}
PAGE CONTEXT:
- Title: ${snapshot?.title || "unknown"}
- Forms: ${snapshot?.forms || 0}
- Elements: ${JSON.stringify((snapshot?.elements || []).slice(0, TIER_CONFIG[tier || "cloud"].maxElements), null, 2)}
INSTRUCTIONS:
${categoryInstructions[failureCategory] || categoryInstructions.UNKNOWN}
SELF-HEALING RULES:
${getPromptRules(tier || "cloud")}
${buildCapabilityCoverageBlock({ mode: "debug", tier: tier || "cloud" })}
Return ONLY valid JSON (no markdown):
{
"name": "improved test name",
"description": "what was fixed and why",
"priority": "${test.priority || "medium"}",
"type": "${test.type || "functional"}",
"steps": ["step 1", "step 2"],
"playwrightCode": "full improved playwright test code"
}`;
}
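// The snapshot shape assumed by the PAGE CONTEXT block above (title, a forms
// count, and an elements array) comes from the crawl step; a minimal
// hypothetical example:
//
//   { title: "Checkout", forms: 1, elements: [{ tag: "button", text: "Pay" }] }
//
// Missing fields degrade gracefully through the ?. fallbacks in the template.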
// ── Main feedback loop ────────────────────────────────────────────────────────
/**
* analyzeRunResults(runResults, testMap, snapshotsByUrl) → improvement plan
*
* Returns a list of tests that need regeneration with failure context.
*/
export function analyzeRunResults(runResults, testMap, snapshotsByUrl) {
const improvements = [];
const stats = { total: 0, passed: 0, failed: 0, flaky: 0, needsRegeneration: 0 };
// High-priority categories that should be auto-fixed — these are almost always
// prompt-quality issues rather than real application bugs.
// ASSERTION_FAIL is included because hard-coded crawl-time values (dates, IDs,
// counts) are a prompt-quality issue, not a real application regression.
const HIGH_PRIORITY_CATEGORIES = new Set([
"SELECTOR_ISSUE",
"URL_MISMATCH",
"TIMEOUT",
"ASSERTION_FAIL",
"NETWORK_MOCK_FAIL",
"FRAME_FAIL",
"API_ASSERTION_FAIL",
]);
for (const result of runResults) {
stats.total++;
if (result.status === "passed") {
stats.passed++;
continue;
}
if (result.status === "failed") {
stats.failed++;
const test = testMap[result.testId];
if (!test) continue;
const failureCategory = classifyFailure(result.error);
const snapshot = snapshotsByUrl[test.sourceUrl];
improvements.push({
testId: result.testId,
test,
failureCategory,
errorMessage: result.error,
snapshot,
assertionMethod: extractFailedAssertionMethod(result.error),
priority: HIGH_PRIORITY_CATEGORIES.has(failureCategory) ? "high" : "medium",
});
stats.needsRegeneration++;
}
}
return { improvements, stats };
}
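// Example return value (hypothetical run with one URL failure out of five):
//
//   {
//     improvements: [{
//       testId: "t1", failureCategory: "URL_MISMATCH",
//       assertionMethod: "toHaveURL", priority: "high", /* test, snapshot, errorMessage */
//     }],
//     stats: { total: 5, passed: 4, failed: 1, flaky: 0, needsRegeneration: 1 },
//   }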
/**
* regenerateFailingTest(improvement, signal) → improved test or null
*
* Calls the AI to produce a fixed version of a failing test.
* Accepts an optional AbortSignal so the operation can be cancelled.
*/
export async function regenerateFailingTest(improvement, signal) {
const { test, failureCategory, errorMessage, snapshot } = improvement;
try {
throwIfAborted(signal);
const tier = getTier();
const prompt = buildImprovementPrompt(test, failureCategory, errorMessage, snapshot, tier);
const text = await generateText(prompt, { signal });
const improved = parseJSON(text);
// Only pick safe fields from the AI response — never let the LLM
// override critical DB fields like id, projectId, or reviewStatus.
return {
...test,
name: improved.name || test.name,
description: improved.description || test.description,
priority: improved.priority || test.priority,
type: improved.type || test.type,
steps: Array.isArray(improved.steps) ? improved.steps : test.steps,
playwrightCode: improved.playwrightCode || test.playwrightCode,
_regenerated: true,
_regenerationReason: failureCategory,
_originalCode: test.playwrightCode,
};
} catch (err) {
if (err.name === "AbortError") throw err; // propagate abort
return null; // Regeneration failed — keep original
}
}
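// Usage sketch (the caller and its 60-second budget are hypothetical; only
// regenerateFailingTest is defined here):
//
//   const controller = new AbortController();
//   const timer = setTimeout(() => controller.abort(), 60_000);
//   try {
//     const fixed = await regenerateFailingTest(improvement, controller.signal);
//     if (fixed) console.log(`Fixed "${fixed.name}" (${fixed._regenerationReason})`);
//   } finally {
//     clearTimeout(timer);
//   }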
/**
* applyFeedbackLoop(run, { signal } = {}) → summary
*
* Full feedback loop: analyzes results, regenerates failing tests.
* Called after a test run completes.
* Accepts an optional AbortSignal so long-running AI calls can be cancelled.
*/
export async function applyFeedbackLoop(run, { signal } = {}) {
if (!run.results?.length) return { improved: 0, skipped: 0, stats: null, analytics: null };
// Build lookup maps
const testMap = {};
for (const testId of (run.tests || [])) {
const t = testRepo.getById(testId);
if (t) testMap[testId] = t;
}
const snapshotsByUrl = {};
// Snapshots are stored on the run during crawl
for (const snap of (run.snapshots || [])) {
snapshotsByUrl[snap.url] = snap;
}
const { improvements, stats } = analyzeRunResults(run.results, testMap, snapshotsByUrl);
// Build quality analytics — correlate failures with prompt version, model, type
const analytics = buildQualityAnalytics(improvements, testMap);
// Detect flaky tests across all runs for this project
const projectId = run.projectId;
if (projectId) {
const flakyTests = detectFlakyTests(projectId);
analytics.flakyTests = Array.from(flakyTests.values());
stats.flaky = flakyTests.size;
}
// Store analytics on the run record so the frontend can display them
run.qualityAnalytics = analytics;
let improved = 0;
for (const improvement of improvements) {
if (improvement.priority !== "high") continue; // Only auto-fix high priority failures
if (signal?.aborted) break; // Respect abort signal between AI calls
const regenerated = await regenerateFailingTest(improvement, signal);
if (regenerated) {
// Route regenerated tests back through human review instead of
// auto-approving. This preserves the "nothing executes until a
// human approves" principle and prevents silently introducing
// flawed tests into the approved pool.
// Strip non-column properties before persisting. regenerateFailingTest()
// adds underscore-prefixed metadata (_regenerated, _regenerationReason,
// _originalCode) and the original test may carry _quality, _assertionEnhanced,
// _generatedFrom — none of which are columns in the tests table.
const { id: _id, _regenerated, _regenerationReason, _originalCode, _quality, _assertionEnhanced, _generatedFrom, ...fields } = regenerated;
testRepo.update(improvement.testId, { ...fields, reviewStatus: "draft" });
improved++;
}
}
return { improved, skipped: improvements.length - improved, stats, analytics };
}
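// Usage sketch (hypothetical caller; assumes `run` already has .results,
// .tests, and .snapshots populated by the runner):
//
//   const { improved, skipped, analytics } = await applyFeedbackLoop(run, { signal });
//   console.log(`Feedback loop: ${improved} regenerated, ${skipped} left as-is`);
//   if (analytics?.insights?.length) console.log(analytics.insights.join("\n"));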