Source: aiProvider.js

/**
 * @module aiProvider
 * @description Multi-AI provider abstraction layer.
 *
 * ### Supported providers
 * | Provider         | Key Env Variable    | Model Env Variable   | Default Model              |
 * |------------------|---------------------|----------------------|----------------------------|
 * | Anthropic Claude | `ANTHROPIC_API_KEY` | `ANTHROPIC_MODEL`    | claude-sonnet-4-20250514   |
 * | OpenAI GPT       | `OPENAI_API_KEY`    | `OPENAI_MODEL`       | gpt-4o-mini                |
 * | Google Gemini    | `GOOGLE_API_KEY`    | `GOOGLE_MODEL`       | gemini-2.5-flash           |
 * | Ollama (local)   | `AI_PROVIDER=local` | `OLLAMA_MODEL`       | mistral:7b                 |
 *
 * **Detection order:** Runtime override (header dropdown) → `AI_PROVIDER` env var → auto-detect (Anthropic → OpenAI → Google → Ollama).
 *
 * ### Exports
 * - {@link generateText} — Single-shot text generation.
 * - {@link streamText} — Token-streaming text generation (Anthropic/OpenAI; fallback for others).
 * - {@link parseJSON} — Parse AI response text as JSON (strips markdown fences).
 * - {@link getProvider}, {@link hasProvider}, {@link isLocalProvider}, {@link isProviderDegraded}, {@link getProviderName}, {@link getProviderMeta} — Provider detection.
 * - {@link isRateLimitError}, {@link isTransientServerError} — Error classification for retry and provider-fallback decisions.
 * - {@link setRuntimeKey}, {@link setRuntimeOllama}, {@link setActiveProvider} — Runtime configuration (Settings page).
 * - {@link getConfiguredKeys} — Masked key status for the Settings UI.
 * - {@link getSupportedProviders} — All provider names/models for the UI (derived from runtime config).
 * - {@link checkOllamaConnection} — Ollama connectivity check.
 * - {@link loadKeysFromDatabase} — Restore all persisted keys from DB into the runtime cache (called at startup).
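 *
 * ### Example (illustrative)
 * A minimal usage sketch, assuming at least one provider key is configured and an import path of `./aiProvider.js`:
 * ```js
 * import { generateText, parseJSON } from "./aiProvider.js";
 *
 * const raw = await generateText({ system: "Respond with JSON only.", user: "List three risks." });
 * const data = parseJSON(raw);
 * ```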
 */

import Anthropic from "@anthropic-ai/sdk";
import OpenAI from "openai";
import { GoogleGenerativeAI } from "@google/generative-ai";
import { throwIfAborted } from "./utils/abortHelper.js";
import { formatLogLine } from "./utils/logFormatter.js";
import * as apiKeyRepo from "./database/repositories/apiKeyRepo.js";

// ── Runtime key store ────────────────────────────────────────────────────────
// In-memory cache populated at startup from the DB (via loadKeysFromDatabase)
// and updated whenever /api/settings writes a new key. Keys are also persisted
// to the `api_keys` DB table, so they survive server restarts.
const runtimeKeys = {};

// Ollama runtime config (settable via /api/settings for the local provider)
let runtimeOllamaBaseUrl = "";
let runtimeOllamaModel   = "";
// Explicit deactivation flag — when true, Ollama is disabled even if env vars are set.
// Set to true by DELETE /api/settings/local; cleared by POST /api/settings with local provider.
let runtimeOllamaDisabled = false;

// ── Active provider override ──────────────────────────────────────────────────
// When set, this provider is used instead of auto-detection order.
// Allows the header dropdown to switch between already-configured providers
// without re-entering keys. Cleared when the selected provider loses its key.
let runtimeActiveProvider = null;

// ── Sticky fallback override ──────────────────────────────────────────────────
// When a rate-limit fallback succeeds, this is set to the fallback provider so
// all subsequent generateText() calls in the same pipeline skip the rate-limited
// primary and go directly to the working fallback.  Auto-expires after
// STICKY_FALLBACK_TTL_MS so normal provider selection resumes once the rate
// limit window resets.  Cleared by setActiveProvider() (user explicitly picks
// a provider via the dropdown) and by setRuntimeKey() (user enters a new key).
let _stickyFallbackProvider = null;
let _stickyFallbackExpiry   = 0;

const STICKY_FALLBACK_TTL_MS = 10 * 60 * 1000; // 10 minutes

/**
 * Override the active provider selection (used by the quick-switch dropdown).
 * The provider must already have a valid key/config — this does not set any key.
 * @param {string|null} provider - Provider ID to pin, or null to resume auto-detect.
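 *
 * @example
 * // Illustrative: pin OpenAI for subsequent calls, then resume auto-detection.
 * setActiveProvider("openai");
 * setActiveProvider(null);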
 */
export function setActiveProvider(provider) {
  runtimeActiveProvider = provider || null;
  // User explicitly chose a provider — clear any sticky fallback
  _stickyFallbackProvider = null;
  _stickyFallbackExpiry   = 0;
}

// Maps cloud provider IDs to their env-var names (single source of truth)
const CLOUD_KEY_MAP = { anthropic: "ANTHROPIC_API_KEY", openai: "OPENAI_API_KEY", google: "GOOGLE_API_KEY" };

// Default models per cloud provider — overridable via env vars
const CLOUD_DEFAULT_MODELS = {
  anthropic: { envVar: "ANTHROPIC_MODEL", fallback: "claude-sonnet-4-20250514", name: "Claude Sonnet" },
  openai:    { envVar: "OPENAI_MODEL",    fallback: "gpt-4o-mini",              name: "GPT-4o-mini" },
  google:    { envVar: "GOOGLE_MODEL",    fallback: "gemini-2.5-flash",         name: "Gemini 2.5 Flash" },
};

function getCloudModel(provider) {
  const cfg = CLOUD_DEFAULT_MODELS[provider];
  if (!cfg) return "";
  return process.env[cfg.envVar] || cfg.fallback;
}

function getCloudName(provider) {
  const cfg = CLOUD_DEFAULT_MODELS[provider];
  if (!cfg) return provider;
  // If user overrode the model, show the model id as the name
  const model = getCloudModel(provider);
  return model !== cfg.fallback ? model : cfg.name;
}

// Auto-detect order for cloud providers
const CLOUD_DETECT_ORDER = ["anthropic", "openai", "google"];

/**
 * Set an AI provider API key at runtime (via Settings page).
 * Persists the key to the database so it survives server restarts.
 * Pass an empty string to clear the key both in-memory and in the DB.
 *
 * @param {string} provider - `"anthropic"` | `"openai"` | `"google"`.
 * @param {string} key      - The API key string, or `""` to deactivate.
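 *
 * @example
 * // Illustrative (placeholder key, not a real credential):
 * setRuntimeKey("anthropic", "sk-ant-xxxxxxxx"); // save in memory + persist to DB
 * setRuntimeKey("anthropic", "");                // deactivate + remove from DB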
 */
export function setRuntimeKey(provider, key) {
  const envName = CLOUD_KEY_MAP[provider];
  if (!envName) return;
  runtimeKeys[envName] = key;
  // FEA-003: Reset circuit breaker when the key changes so the provider is
  // immediately retried with the new credentials instead of waiting out the
  // cooldown from the old key's rate-limit failures.
  if (circuitBreakers[provider]) {
    circuitBreakers[provider].failures = 0;
    circuitBreakers[provider].disabledUntil = 0;
  }
  // Clear sticky fallback — user is configuring a provider, let detection re-evaluate
  _stickyFallbackProvider = null;
  _stickyFallbackExpiry   = 0;
  try {
    if (key) {
      apiKeyRepo.set(provider, key);
    } else {
      apiKeyRepo.remove(provider);
    }
  } catch (err) {
    // DB unavailable during tests or before init — safe to ignore, in-memory cache still works.
    console.error(formatLogLine("error", null, `[aiProvider] Failed to persist key for ${provider}: ${err.message}`));
  }
}

/**
 * Configure Ollama runtime settings (via Settings page).
 * Persists the config to the database so it survives server restarts.
 *
 * @param {Object}  [opts]
 * @param {string}  [opts.baseUrl]  - Ollama server URL.
 * @param {string}  [opts.model]    - Model name (e.g. `"mistral:7b"`).
 * @param {boolean} [opts.disabled] - Set `true` to deactivate Ollama.
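 *
 * @example
 * // Illustrative: point the local provider at a custom model, then disable it.
 * setRuntimeOllama({ baseUrl: "http://localhost:11434", model: "mistral:7b" });
 * setRuntimeOllama({ disabled: true });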
 */
export function setRuntimeOllama({ baseUrl, model, disabled } = {}) {
  if (baseUrl  !== undefined) runtimeOllamaBaseUrl  = baseUrl;
  if (model    !== undefined) runtimeOllamaModel    = model;
  if (disabled !== undefined) runtimeOllamaDisabled = disabled;
  try {
    if (disabled) {
      apiKeyRepo.remove("local");
    } else if (runtimeOllamaBaseUrl || runtimeOllamaModel) {
      apiKeyRepo.set("local", { baseUrl: runtimeOllamaBaseUrl, model: runtimeOllamaModel });
    }
  } catch (err) {
    console.error(formatLogLine("error", null, `[aiProvider] Failed to persist Ollama config: ${err.message}`));
  }
}

function getKey(envName) {
  // Use `in` + explicit check so that setting a runtime key to "" (deactivation)
  // takes precedence over the env var. Previously `||` made "" falsy, falling
  // through to process.env and making runtime deactivation impossible.
  if (envName in runtimeKeys) return runtimeKeys[envName];
  const envVal = process.env[envName] || "";
  if (envVal) return envVal;
  // DEMO-MODE: Fall back to the platform-owned demo key for Google when no
  // user key is configured. This lets users try Sentri without bringing their
  // own API key. The demo key is rate-limited per-user by demoQuota middleware.
  if (envName === "GOOGLE_API_KEY" && process.env.DEMO_GOOGLE_API_KEY) {
    return process.env.DEMO_GOOGLE_API_KEY;
  }
  return "";
}

function getOllamaBaseUrl() {
  return runtimeOllamaBaseUrl
    || process.env.OLLAMA_BASE_URL
    || "http://localhost:11434";
}

function getOllamaModel() {
  return runtimeOllamaModel
    || process.env.OLLAMA_MODEL
    || "mistral:7b";
}

// ── Provider metadata ─────────────────────────────────────────────────────────

function buildProviderMeta() {
  return {
    anthropic: { name: getCloudName("anthropic"), model: getCloudModel("anthropic"), color: "#cd7f32" },
    openai:    { name: getCloudName("openai"),    model: getCloudModel("openai"),    color: "#10a37f" },
    google:    { name: getCloudName("google"),    model: getCloudModel("google"),    color: "#4285f4" },
    local:     { name: `Ollama (${getOllamaModel()})`, model: getOllamaModel(), color: "#7c3aed" },
  };
}

const PROVIDER_DOCS = {
  anthropic: "https://console.anthropic.com/settings/keys",
  openai:    "https://platform.openai.com/api-keys",
  google:    "https://aistudio.google.com/apikey",
  local:     "https://ollama.ai",
};

/**
 * Returns the list of all supported providers with current names/models.
 * Derives from buildProviderMeta() so model names stay in sync with what's
 * actually used in API calls. Consumed by GET /api/config.
 * @returns {Array<{id: string, name: string, model: string, docsUrl: string}>}
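 *
 * @example
 * // Illustrative entry shape (actual names/models depend on env/runtime config):
 * // [{ id: "openai", name: "GPT-4o-mini", model: "gpt-4o-mini", docsUrl: "https://platform.openai.com/api-keys" }, ...]
 * getSupportedProviders();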
 */
export function getSupportedProviders() {
  const meta = buildProviderMeta();
  return Object.entries(meta).map(([id, m]) => ({
    id,
    name: m.name,
    model: m.model,
    docsUrl: PROVIDER_DOCS[id] || "",
  }));
}

// ── Provider detection ────────────────────────────────────────────────────────

/**
 * Check whether a provider is usable right now (has a key or, for Ollama, is not disabled).
 * Single source of truth — used by detectProvider, the quick-switch override, and the forced-env path.
 * @param {string} provider
 * @returns {boolean}
 */
function isProviderUsable(provider) {
  if (provider === "local") {
    return !runtimeOllamaDisabled;
  }
  const envName = CLOUD_KEY_MAP[provider];
  if (!envName) return false;
  // Runtime key of "" means explicitly cleared — respect that
  if (envName in runtimeKeys) return runtimeKeys[envName].length > 0;
  if (process.env[envName]) return true;
  // DEMO-MODE: Google is usable when the demo key is set
  if (envName === "GOOGLE_API_KEY" && process.env.DEMO_GOOGLE_API_KEY) return true;
  return false;
}

/** True if Ollama has any config (runtime or env) hinting it should be auto-detected. */
function hasOllamaConfig() {
  return !!(runtimeOllamaBaseUrl || runtimeOllamaModel || process.env.OLLAMA_BASE_URL || process.env.OLLAMA_MODEL);
}

function detectProvider() {
  // ── Sticky fallback from a previous rate-limit event ─────────────────────
  // Checked FIRST — when a rate-limit fallback succeeded, all subsequent
  // calls must use the fallback provider, even if the user explicitly
  // selected the (now rate-limited) primary via the dropdown.  Without this
  // priority, every call would re-try the broken provider for ~3 min before
  // falling back again.  Auto-expires after STICKY_FALLBACK_TTL_MS so normal
  // provider selection resumes once the rate limit window resets.
  if (_stickyFallbackProvider && Date.now() < _stickyFallbackExpiry) {
    if (isProviderUsable(_stickyFallbackProvider)) return _stickyFallbackProvider;
    // Fallback no longer usable — clear and fall through
    _stickyFallbackProvider = null;
    _stickyFallbackExpiry   = 0;
  } else if (_stickyFallbackProvider) {
    // Expired — clear
    _stickyFallbackProvider = null;
    _stickyFallbackExpiry   = 0;
  }

  // ── Quick-switch override from the header dropdown ────────────────────────
  // Checked AFTER the sticky fallback so a rate-limited provider is not
  // retried just because the user had it selected in the dropdown.
  if (runtimeActiveProvider) {
    if (isProviderUsable(runtimeActiveProvider)) return runtimeActiveProvider;
    // Key gone — clear the override and fall through
    runtimeActiveProvider = null;
  }

  // ── AI_PROVIDER env var (explicit static config) ─────────────────────────
  const forced = process.env.AI_PROVIDER?.toLowerCase();
  if (forced) {
    if (forced === "local") return "local";
    if (!CLOUD_KEY_MAP[forced]) throw new Error(`Unknown AI_PROVIDER="${forced}". Valid: anthropic, openai, google, local`);
    if (!getKey(CLOUD_KEY_MAP[forced])) throw new Error(`AI_PROVIDER="${forced}" but ${CLOUD_KEY_MAP[forced]} is not set`);
    return forced;
  }

  // ── Auto-detect: first cloud provider with a key, then Ollama as fallback ─
  const detected = CLOUD_DETECT_ORDER.find(id => isProviderUsable(id));
  if (detected) return detected;

  if (isProviderUsable("local") && hasOllamaConfig()) return "local";

  return null;
}

/** @returns {string|null} Current provider ID (`"anthropic"`, `"openai"`, `"google"`, `"local"`), or `null`. */
export function getProvider()     { try { return detectProvider(); } catch { return null; } }
/** @returns {boolean} `true` if any AI provider is configured. */
export function hasProvider()     { return getProvider() !== null; }
/** @returns {boolean} `true` if the active provider is Ollama (local). */
export function isLocalProvider() { return getProvider() === "local"; }
/**
 * `true` when the AI provider is operating in a degraded state — either a
 * sticky fallback is active (primary was rate-limited) or the primary
 * provider's circuit breaker is open.  Used by the feedback loop to skip
 * expensive AI calls that would block run completion.
 * @returns {boolean}
 */
export function isProviderDegraded() {
  if (_stickyFallbackProvider && Date.now() < _stickyFallbackExpiry) return true;
  const primary = getProvider();
  return primary ? isCircuitBreakerOpen(primary) : false;
}
/** @returns {string} Human-readable provider name (e.g. `"Claude Sonnet"`), or `"No provider configured"`. */
export function getProviderName() {
  const p = getProvider();
  return p ? buildProviderMeta()[p].name : "No provider configured";
}
/** @returns {{provider: string, name: string, model: string, color: string}|null} Full provider metadata, or `null`. */
export function getProviderMeta() {
  const p = getProvider();
  return p ? { provider: p, ...buildProviderMeta()[p] } : null;
}

/**
 * Returns masked API keys and Ollama config for the Settings UI.
 * Never returns full keys — only masked versions for display.
 *
 * @returns {{anthropic: string, openai: string, google: string, activeProvider: string|null, ollamaBaseUrl: string, ollamaModel: string, ollamaConfigured: boolean}}
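 *
 * @example
 * // Illustrative result (values depend on configuration; keys are always masked):
 * // { activeProvider: "anthropic", anthropic: "sk-ant••••••••wxyz", openai: "", google: "",
 * //   ollamaBaseUrl: "http://localhost:11434", ollamaModel: "mistral:7b", ollamaConfigured: false }
 * getConfiguredKeys();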
 */
export function getConfiguredKeys() {
  const result = { activeProvider: getProvider() };
  // Cloud providers — masked keys via the shared map.
  // Exclude the demo key fallback so the Settings UI (and demoQuota BYOK
  // detection) only reflects keys the user explicitly configured.
  for (const [id, envName] of Object.entries(CLOUD_KEY_MAP)) {
    const userKey = getUserConfiguredKey(envName);
    result[id] = maskKey(userKey);
  }
  // Ollama-specific fields (never sensitive, no masking needed)
  result.ollamaBaseUrl = getOllamaBaseUrl();
  result.ollamaModel   = getOllamaModel();
  // True only when Ollama has explicit config AND is not disabled — prevents
  // the dropdown from showing Ollama as "saved" when it's just the default URL.
  result.ollamaConfigured = !runtimeOllamaDisabled && hasOllamaConfig();
  return result;
}

/**
 * Get a user-configured key WITHOUT the demo fallback.
 * Used by getConfiguredKeys() so BYOK detection is accurate.
 * @param {string} envName
 * @returns {string}
 */
function getUserConfiguredKey(envName) {
  if (envName in runtimeKeys) return runtimeKeys[envName];
  return process.env[envName] || "";
}

function maskKey(key) {
  if (!key) return "";
  // Keys of 12 chars or fewer are fully masked; revealing 6 + 4 chars of a
  // short key would expose most or all of it.
  if (key.length <= 12) return "••••••••";
  return key.slice(0, 6) + "••••••••" + key.slice(-4);
}

// ── Database key persistence ──────────────────────────────────────────────────

/**
 * Restore all persisted API keys and Ollama config from the database into the
 * runtime cache. Called once at server startup after the DB is initialised.
 *
 * Keys stored in the DB are applied only when no matching env var is already
 * set; env vars remain the canonical override, so Docker / K8s deployments
 * are unaffected.
 *
 * @returns {number} The number of providers successfully loaded from the database.
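 *
 * @example
 * // Illustrative: called once at server startup, after the DB is initialised.
 * const restored = loadKeysFromDatabase();
 * if (restored > 0) console.log(`Restored ${restored} provider config(s) from DB`);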
 */
export function loadKeysFromDatabase() {
  let loaded = 0;
  try {
    const entries = apiKeyRepo.getAll();
    for (const { provider, value } of entries) {
      if (provider === "local") {
        // Restore Ollama config only when env vars are not already set.
        const cfg = value;
        if (cfg && typeof cfg === "object") {
          if (!runtimeOllamaBaseUrl && !process.env.OLLAMA_BASE_URL) {
            runtimeOllamaBaseUrl = cfg.baseUrl || "";
          }
          if (!runtimeOllamaModel && !process.env.OLLAMA_MODEL) {
            runtimeOllamaModel = cfg.model || "";
          }
          runtimeOllamaDisabled = false;
          loaded += 1;
        }
      } else {
        const envName = CLOUD_KEY_MAP[provider];
        if (!envName) continue;
        // Only restore from DB when the env var is absent and cache is not already
        // populated — env vars always win.
        if (!process.env[envName] && !(envName in runtimeKeys)) {
          runtimeKeys[envName] = String(value);
          loaded += 1;
        }
      }
    }
    if (loaded > 0) {
      console.log(formatLogLine("info", null, `[aiProvider] Restored ${loaded} provider key(s) from database`));
    }
  } catch (err) {
    // Non-fatal: the server still works with env vars; log and continue.
    console.error(formatLogLine("error", null, `[aiProvider] Failed to load keys from database: ${err.message}`));
  }
  return loaded;
}

// ── Ollama connectivity check ─────────────────────────────────────────────────

/**
 * Check Ollama server connectivity and verify the configured model is available.
 *
 * @returns {Promise<Object>} Resolves to `{ok: boolean, model?: string, baseUrl?: string, availableModels?: string[], error?: string}`.
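 *
 * @example
 * // Illustrative: surface connectivity problems in a settings endpoint.
 * const status = await checkOllamaConnection();
 * if (!status.ok) console.warn(status.error);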
 */
export async function checkOllamaConnection() {
  const base = getOllamaBaseUrl();
  const model = getOllamaModel();
  try {
    const tagsRes = await fetch(`${base}/api/tags`, { signal: AbortSignal.timeout(4000) });
    if (!tagsRes.ok) return { ok: false, error: `Ollama /api/tags returned HTTP ${tagsRes.status}` };
    const { models = [] } = await tagsRes.json();
    const names = models.map(m => m.name.split(":")[0]);
    // Configured and installed names may carry tags (e.g. "mistral:7b" vs "mistral:latest"); compare base names.
    const modelBase = model.split(":")[0];
    const found = names.some(n => n === modelBase || n === model);
    if (!found) {
      return {
        ok: false,
        error: `Model "${model}" not found. Run: ollama pull ${model}\nAvailable: ${names.join(", ") || "(none)"}`,
        availableModels: models.map(m => m.name),
      };
    }
    return { ok: true, model, baseUrl: base, availableModels: models.map(m => m.name) };
  } catch (err) {
    return {
      ok: false,
      error: `Cannot reach Ollama at ${base}. Is it running? (ollama serve)\nDetail: ${err.message}`,
    };
  }
}

// ── Retry with exponential backoff ────────────────────────────────────────────

const RATE_LIMIT_CODES = [429, 529];
const RETRY_ERRORS     = ["rate_limit_error", "overloaded_error", "Too Many Requests"];
const MAX_RETRIES      = parseInt(process.env.LLM_MAX_RETRIES, 10)  || 3;
const BASE_DELAY_MS    = parseInt(process.env.LLM_BASE_DELAY_MS, 10) || 2000;
const MAX_BACKOFF_MS   = parseInt(process.env.LLM_MAX_BACKOFF_MS, 10) || 30000;

async function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Detect whether an error is a rate limit / quota exhaustion from any AI provider.
 * Used internally for retry decisions and exported for the pipeline to detect
 * rate limits that survived all retries.
 *
 * @param {Error} err
 * @returns {boolean}
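 *
 * @example
 * isRateLimitError(Object.assign(new Error("Too Many Requests"), { status: 429 })); // true
 * isRateLimitError(new Error("invalid_api_key"));                                   // false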
 */
export function isRateLimitError(err) {
  const msg = (err?.message || "").toLowerCase();
  const status = err?.status || err?.statusCode || 0;
  if (RATE_LIMIT_CODES.includes(status)) return true;
  // Use word-boundary-aware patterns to avoid false positives on port
  // numbers (e.g. "localhost:4290"), disk quota errors, etc.
  return /\brate.?limit/i.test(msg)
    || /\brate_limit/i.test(msg)
    || /\b429\b/.test(msg)
    || /\bquota\s*(exceeded|exhausted|limit)/i.test(msg)
    || /\btoo many requests\b/i.test(msg)
    || /\bresource.?exhausted\b/i.test(msg)
    || /\boverloaded/i.test(msg);
}

/**
 * Detect transient server-side failures that warrant retry + provider
 * fallback but aren't rate limits. Common examples:
 *   - Google Gemini 503 "This model is currently experiencing high demand"
 *   - Anthropic 503 "overloaded_error" (already matches isRateLimitError via /overloaded/)
 *   - OpenAI 500/502/504 transient backend errors
 *   - Provider-specific HTTP 5xx with "high demand", "service unavailable", "try again later"
 *
 * Distinct from isRateLimitError — these are not quota issues, they're
 * temporary server outages. We retry with backoff and fall back to other
 * providers, but don't trip the per-provider rate-limit circuit breaker
 * (the provider's key is fine; the backend is struggling).
 *
 * @param {Error} err
 * @returns {boolean}
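 *
 * @example
 * isTransientServerError(Object.assign(new Error("Service Unavailable"), { status: 503 })); // true
 * isTransientServerError(Object.assign(new Error("Not Implemented"), { status: 501 }));     // false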
 */
export function isTransientServerError(err) {
  const msg = (err?.message || "").toLowerCase();
  const status = err?.status || err?.statusCode || 0;
  // HTTP 5xx except 501 (Not Implemented) — 500/502/503/504 are retriable
  if (status >= 500 && status !== 501) return true;
  return /\b50[0234]\b/.test(msg)
    || /\bservice unavailable\b/i.test(msg)
    || /\bhigh demand\b/i.test(msg)
    || /\btry again later\b/i.test(msg)
    || /\binternal server error\b/i.test(msg)
    || /\bbad gateway\b/i.test(msg)
    || /\bgateway timeout\b/i.test(msg);
}

/**
 * True if the error should be retried — either a rate limit (quota issue)
 * or a transient server error (provider outage).
 */
function isRetryableError(err) {
  return isRateLimitError(err) || isTransientServerError(err);
}

function extractRetryAfter(err) {
  const match = (err?.message || "").match(/retry in (\d+(?:\.\d+)?)(s|ms)/i);
  if (match) {
    const val = parseFloat(match[1]);
    return match[2].toLowerCase() === "ms" ? val : val * 1000;
  }
  return null;
}

async function withRetry(fn, label = "") {
  for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
    try {
      return await fn();
    } catch (err) {
      if (attempt === MAX_RETRIES) throw err;
      if (!isRetryableError(err)) throw err;
      const retryAfter = extractRetryAfter(err);
      // Honor server-requested Retry-After delays (cap at 2× MAX_BACKOFF_MS to
      // prevent absurd waits). Only cap computed exponential backoff at MAX_BACKOFF_MS.
      const delay = retryAfter
        ? Math.min(retryAfter, MAX_BACKOFF_MS * 2)
        : Math.min(BASE_DELAY_MS * Math.pow(2, attempt), MAX_BACKOFF_MS);
      // Distinguish quota issues (rate limit) from backend overload (5xx) in logs
      // so operators can tell whether to add quota or wait out an outage.
      const reason = isRateLimitError(err) ? "Rate limit" : "Transient server error (5xx)";
      console.warn(formatLogLine("warn", null, `${reason} hit${label ? " for " + label : ""}: ${err.message?.slice(0, 120)}. Retrying in ${Math.round(delay / 1000)}s (attempt ${attempt + 1}/${MAX_RETRIES})`));
      await sleep(delay);
    }
  }
}

// ── Core constants ────────────────────────────────────────────────────────────

const DEFAULT_MAX_TOKENS = parseInt(process.env.LLM_MAX_TOKENS, 10) || 16384;

// Per-call timeout for cloud AI providers (GAP-08).
// Prevents a hung API call from blocking the pipeline indefinitely.
// Ollama has its own timeout (OLLAMA_TIMEOUT_MS, default 120s) so this only
// applies to Anthropic, OpenAI, and Google.  Override via LLM_TIMEOUT_MS.
const CLOUD_TIMEOUT_MS = parseInt(process.env.LLM_TIMEOUT_MS, 10) || 120_000;

/**
 * Compose an AbortSignal that fires on EITHER the external signal (user abort)
 * OR a per-call timeout — whichever comes first.  Returns the composite signal
 * and a cleanup function that MUST be called in a finally block to prevent the
 * timeout from leaking if the call completes before the deadline.
 *
 * @param {AbortSignal|undefined} external - Signal from runWithAbort (user abort).
 * @param {number}                timeoutMs - Per-call deadline.
 * @returns {Object} `{ signal: AbortSignal, cleanup: Function }`
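 *
 * @example
 * // Illustrative call pattern inside a provider branch (externalSignal, client, params are placeholders):
 * const { signal: composed, cleanup } = composeSignal(externalSignal, CLOUD_TIMEOUT_MS);
 * try {
 *   return await client.messages.create(params, { signal: composed });
 * } finally {
 *   cleanup();
 * }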
 */
function composeSignal(external, timeoutMs) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(new Error("AI call timed out")), timeoutMs);

  // Forward external abort
  let onExternal = null;
  if (external) {
    if (external.aborted) {
      clearTimeout(timer);
      controller.abort(external.reason);
    } else {
      onExternal = () => { clearTimeout(timer); controller.abort(external.reason); };
      external.addEventListener("abort", onExternal, { once: true });
    }
  }

  return {
    signal: controller.signal,
    cleanup: () => {
      clearTimeout(timer);
      if (onExternal && external) external.removeEventListener("abort", onExternal);
    },
  };
}

// ── Ollama caller ─────────────────────────────────────────────────────────────

async function callOllama(prompt, maxTokens, externalSignal, useJson = true) {
  const base  = getOllamaBaseUrl();
  const model = getOllamaModel();

  // Local models (especially 7B) have much smaller effective context windows.
  // Cap num_predict so the prompt + output don't exceed the model's limits.
  // Ollama returns HTTP 500 when the combined size overflows.
  const OLLAMA_MAX_PREDICT = parseInt(process.env.OLLAMA_MAX_PREDICT, 10) || 4096;
  const effectiveTokens = Math.min(maxTokens || DEFAULT_MAX_TOKENS, OLLAMA_MAX_PREDICT);

  const body = {
    model,
    prompt,
    stream: false,
    options: {
      // Ollama uses num_predict for max tokens
      num_predict: effectiveTokens,
      temperature: 0.2,
    },
  };
  // Only ask for JSON format when the caller needs structured output (pipeline).
  // Chat needs free-form text.
  if (useJson) body.format = "json";

  const controller = new AbortController();
  // Ollama can be slow for large prompts — give it generous time
  const timeoutMs = parseInt(process.env.OLLAMA_TIMEOUT_MS, 10) || 120_000;
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);

  // If an external abort signal is provided (e.g. user clicked "Stop Task"),
  // forward it to our internal controller so the fetch is cancelled immediately.
  // We keep a reference to the handler so we can remove it in `finally` —
  // without cleanup, 60+ sequential AI calls sharing one signal would trigger
  // a MaxListenersExceededWarning.
  let onExternalAbort = null;
  if (externalSignal) {
    if (externalSignal.aborted) {
      clearTimeout(timeoutId);
      throw new DOMException("Aborted", "AbortError");
    } else {
      onExternalAbort = () => controller.abort();
      externalSignal.addEventListener("abort", onExternalAbort, { once: true });
    }
  }

  try {
    const res = await fetch(`${base}/api/generate`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(body),
      signal: controller.signal,
    });

    if (!res.ok) {
      const text = await res.text().catch(() => "");
      throw new Error(`Ollama HTTP ${res.status}: ${text.slice(0, 300)}`);
    }

    // Ollama with stream:false should return a single JSON object, but some
    // versions return NDJSON (one JSON object per line). We read as text and
    // handle both formats.
    const raw = await res.text();
    let data;
    try {
      data = JSON.parse(raw);
    } catch {
      // NDJSON fallback — each line is a JSON object with a partial "response"
      // field (one token per line). Concatenate all response fields to
      // reconstruct the full output, since the final done:true line typically
      // has an empty response.
      const lines = raw.split("\n").map(l => l.trim()).filter(Boolean);
      let fullResponse = "";
      let foundAny = false;
      for (const line of lines) {
        try {
          const candidate = JSON.parse(line);
          if (candidate.response !== undefined) {
            fullResponse += candidate.response;
            foundAny = true;
          }
        } catch { /* skip unparseable lines */ }
      }
      if (!foundAny) throw new Error(`Ollama returned unparseable response: ${raw.slice(0, 300)}`);
      data = { response: fullResponse };
    }

    // Ollama returns { response: "..." } for non-streaming generate
    if (!data.response) throw new Error(`Unexpected Ollama response shape: ${JSON.stringify(data).slice(0, 200)}`);
    return data.response;
  } catch (err) {
    if (err.name === "AbortError") {
      // Distinguish user-initiated abort from internal timeout
      if (externalSignal?.aborted) {
        throw new DOMException("Aborted", "AbortError");
      }
      throw new Error(`Ollama request timed out after ${timeoutMs / 1000}s. Try a smaller/faster model or increase OLLAMA_TIMEOUT_MS.`);
    }
    throw err;
  } finally {
    clearTimeout(timeoutId);
    if (onExternalAbort && externalSignal) {
      externalSignal.removeEventListener("abort", onExternalAbort);
    }
  }
}

// ─── Structured message helpers ───────────────────────────────────────────────
// Prompt builders can pass either a plain string or { system, user } to
// generateText / streamText. These helpers normalise both shapes into the
// provider-specific message format.

function normaliseMessages(promptOrMessages) {
  if (typeof promptOrMessages === "string") {
    // Legacy: single string → user-only message (backwards compatible)
    return { system: null, user: promptOrMessages, combined: promptOrMessages };
  }
  const { system, user } = promptOrMessages;
  // Combined fallback for providers that don't support system messages (Ollama)
  const combined = system ? `${system}\n\n---\n\n${user}` : user;
  return { system: system || null, user, combined };
}

// ── FEA-003: Circuit breaker per provider ─────────────────────────────────────
// When a provider hits a rate-limit failure that survived all internal retries
// in withRetry(), disable it for 5 min.  Threshold is 1 (not 3) because
// withRetry() already retried MAX_RETRIES times internally — the error that
// reaches generateText() represents a confirmed, durable rate limit, not a
// transient blip.

/** @type {Object<string, {failures: number, disabledUntil: number}>} */
const circuitBreakers = {};

const CIRCUIT_BREAKER_THRESHOLD = 1;
const CIRCUIT_BREAKER_COOLDOWN_MS = 5 * 60 * 1000; // 5 minutes

/**
 * Record a rate-limit failure for a provider. If the threshold is reached,
 * the provider is disabled for CIRCUIT_BREAKER_COOLDOWN_MS.
 *
 * @param {string} provider
 */
function recordProviderFailure(provider) {
  if (!circuitBreakers[provider]) circuitBreakers[provider] = { failures: 0, disabledUntil: 0 };
  circuitBreakers[provider].failures += 1;
  if (circuitBreakers[provider].failures >= CIRCUIT_BREAKER_THRESHOLD) {
    circuitBreakers[provider].disabledUntil = Date.now() + CIRCUIT_BREAKER_COOLDOWN_MS;
    console.warn(formatLogLine("warn", null, `[aiProvider] Circuit breaker tripped for ${provider} — disabled for ${CIRCUIT_BREAKER_COOLDOWN_MS / 1000}s after ${CIRCUIT_BREAKER_THRESHOLD} consecutive rate-limit failures`));
  }
}

/**
 * Record a successful call — resets the failure counter.
 *
 * @param {string} provider
 */
function recordProviderSuccess(provider) {
  if (circuitBreakers[provider]) {
    circuitBreakers[provider].failures = 0;
  }
}

/**
 * Check whether a provider's circuit breaker is open (disabled).
 *
 * @param {string} provider
 * @returns {boolean} `true` if the provider is temporarily disabled.
 */
function isCircuitBreakerOpen(provider) {
  const cb = circuitBreakers[provider];
  if (!cb) return false;
  if (cb.disabledUntil > Date.now()) return true;
  // Cooldown expired — reset
  if (cb.disabledUntil > 0) {
    cb.disabledUntil = 0;
    cb.failures = 0;
  }
  return false;
}

/**
 * FEA-003: Get the ordered list of fallback providers to try when the primary
 * provider hits a rate limit or transient error.
 *
 * **Same-tier only** — cloud primary falls back to other cloud providers;
 * local primary has no fallback. This prevents cross-tier mismatches where
 * a prompt built for cloud (~1600 chars, 128K context assumed) gets
 * delivered to Ollama (4K context, needs >120s to process) and hits the
 * chat timeout. Ollama is never a cross-tier rescue — the prompt shape,
 * context window, and response latency are too different.
 *
 * To use Ollama as a primary, set `AI_PROVIDER=local` or pick it from
 * the provider dropdown — detectProvider() will route all calls to Ollama
 * with the correct tier-specific prompt.
 *
 * @param {string} primaryProvider - The provider that failed.
 * @returns {string[]} Ordered list of same-tier fallback provider IDs.
 */
function getFallbackProviders(primaryProvider) {
  // Local tier has only one provider (Ollama) — no fallback possible.
  if (primaryProvider === "local") return [];
  // Cloud tier: try other cloud providers in detection order.
  return CLOUD_DETECT_ORDER.filter(p =>
    p !== primaryProvider &&
    isProviderUsable(p) &&
    !isCircuitBreakerOpen(p),
  );
}

// ── Core API call ─────────────────────────────────────────────────────────────

async function callProvider(provider, promptOrMessages, maxTokens, signal, responseFormat) {
  const tokens = maxTokens || DEFAULT_MAX_TOKENS;
  const { system, user, combined } = normaliseMessages(promptOrMessages);
  // Default to JSON for backward compatibility (pipeline needs structured output).
  // Chat endpoint passes responseFormat: "text" for free-form conversation.
  const useJson = responseFormat !== "text";

  if (provider === "anthropic") {
    const client = new Anthropic({ apiKey: getKey("ANTHROPIC_API_KEY") });
    // composeSignal is created inside each retry attempt so that a per-call
    // timeout on attempt N does not leave the signal permanently aborted for
    // subsequent attempts.  The external (user-abort) signal is still checked
    // across all attempts — only the timeout is per-attempt.
    return await withRetry(async () => {
      const { signal: composedSignal, cleanup } = composeSignal(signal, CLOUD_TIMEOUT_MS);
      try {
        const params = {
          model: buildProviderMeta().anthropic.model,
          max_tokens: tokens,
          messages: [{ role: "user", content: user }],
        };
        // Anthropic natively supports a top-level "system" field
        if (system) params.system = system;
        const msg = await client.messages.create(params, { signal: composedSignal });
        return msg.content[0].text;
      } finally { cleanup(); }
    }, "Anthropic");
  }

  if (provider === "openai") {
    const client = new OpenAI({ apiKey: getKey("OPENAI_API_KEY") });
    return await withRetry(async () => {
      const { signal: composedSignal, cleanup } = composeSignal(signal, CLOUD_TIMEOUT_MS);
      try {
        const messages = [];
        if (system) messages.push({ role: "system", content: system });
        messages.push({ role: "user", content: user });
        const params = {
          model: buildProviderMeta().openai.model,
          max_tokens: tokens,
          messages,
        };
        if (useJson) params.response_format = { type: "json_object" };
        const res = await client.chat.completions.create(params, { signal: composedSignal });
        return res.choices[0].message.content;
      } finally { cleanup(); }
    }, "OpenAI");
  }

  if (provider === "google") {
    const genAI = new GoogleGenerativeAI(getKey("GOOGLE_API_KEY"));
    return await withRetry(async () => {
      const { signal: composedSignal, cleanup } = composeSignal(signal, CLOUD_TIMEOUT_MS);
      try {
        const generationConfig = { maxOutputTokens: tokens };
        if (useJson) generationConfig.responseMimeType = "application/json";
        const modelConfig = {
          model: buildProviderMeta().google.model,
          generationConfig,
        };
        // Gemini supports systemInstruction for system-level context
        if (system) modelConfig.systemInstruction = { parts: [{ text: system }] };
        const model = genAI.getGenerativeModel(modelConfig);
        const result = await model.generateContent({ contents: [{ role: "user", parts: [{ text: user }] }] }, { signal: composedSignal });
        return result.response.text();
      } finally { cleanup(); }
    }, "Google Gemini");
  }

  if (provider === "local") {
    // Ollama doesn't support system messages in /api/generate — use combined prompt
    for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
      try {
        return await callOllama(combined, tokens, signal, useJson);
      } catch (err) {
        // Don't retry if the user aborted
        if (err.name === "AbortError" || signal?.aborted) throw err;
        const isRetryable =
          err.message.includes("ECONNREFUSED") ||
          err.message.includes("fetch failed") ||
          err.message.includes("Ollama HTTP 500");
        if (attempt === MAX_RETRIES || !isRetryable) throw err;
        const delay = Math.min(BASE_DELAY_MS * Math.pow(2, attempt), MAX_BACKOFF_MS);
        console.warn(formatLogLine("warn", null, `[Ollama] ${err.message.slice(0, 80)}. Retrying in ${delay / 1000}s (attempt ${attempt + 1}/${MAX_RETRIES})`));
        await sleep(delay);
      }
    }
  }

  throw new Error(`Unknown provider: ${provider}`);
}

// ── Public API ────────────────────────────────────────────────────────────────

/**
 * Generate text from an AI provider (single-shot, non-streaming).
 * Automatically detects the active provider and routes the request.
 *
 * FEA-003: On rate-limit or transient 5xx errors, automatically falls back
 * to the next configured provider in CLOUD_DETECT_ORDER before giving up.
 * Each provider has a circuit breaker that disables it for 5 minutes after
 * a rate-limit failure that survived all internal retries.
 *
 * @param {string|{system: string, user: string}} prompt - Plain string or structured `{ system, user }` messages.
 * @param {Object}      [options]
 * @param {number}      [options.maxTokens] - Max output tokens (default 16384).
 * @param {AbortSignal} [options.signal]    - Abort signal for cancellation.
 * @param {string}      [options.responseFormat] - `"text"` for free-form output; any other value (default) requests JSON.
 * @returns {Promise<string>} The generated text response.
 * @throws {Error} If no AI provider is configured or all providers fail.
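 *
 * @example
 * // Illustrative: structured prompt with JSON output (the default), parsed via parseJSON().
 * const raw = await generateText({ system: "Respond with JSON only.", user: "List three risks." }, { maxTokens: 1024 });
 * const data = parseJSON(raw);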
 */
export async function generateText(prompt, options) {
  const provider = detectProvider();
  if (!provider) {
    throw new Error(
      "No AI provider configured. Options:\n" +
      "  Cloud: set ANTHROPIC_API_KEY / OPENAI_API_KEY / GOOGLE_API_KEY in backend/.env\n" +
      "  Local: set AI_PROVIDER=local (requires Ollama running at http://localhost:11434)\n" +
      "         Optionally: OLLAMA_MODEL=mistral:7b  OLLAMA_BASE_URL=http://localhost:11434"
    );
  }

  // ── FEA-003: Try primary provider, then fall back on rate-limit OR transient 5xx errors ──
  try {
    const result = await callProvider(provider, prompt, options?.maxTokens, options?.signal, options?.responseFormat);
    recordProviderSuccess(provider);
    return result;
  } catch (err) {
    // Only fall back on retriable errors (rate limits or transient server errors).
    // Auth errors, invalid prompts, etc. are programmer errors and should propagate.
    if (!isRetryableError(err)) throw err;

    // Ollama (local) doesn't have rate limits — its errors (HTTP 500, context
    // overflow, timeout) can match isRateLimitError() false positives (e.g.
    // "overloaded" in error messages). Don't circuit-break local models;
    // just rethrow so the caller's retry/error handling takes over.
    if (provider === "local") throw err;

    // Primary provider failed with a retriable error — record failure (rate limit only)
    // and try fallbacks. Transient 5xx errors don't trip the circuit breaker because
    // the quota is fine; the provider's backend is just temporarily overloaded.
    const errType = isRateLimitError(err) ? "rate-limited" : "transient server error (5xx)";
    if (isRateLimitError(err)) recordProviderFailure(provider);
    const fallbacks = getFallbackProviders(provider);

    if (fallbacks.length === 0) {
      // No fallbacks available — log why and rethrow so the caller (and user)
      // can tell this was a real "nothing more we can do" situation, not a
      // silent skip.
      console.warn(formatLogLine("warn", null, `[aiProvider] ${provider} ${errType} after ${MAX_RETRIES + 1} attempts — no other provider configured for fallback. Giving up. Configure a second provider in Settings to enable automatic fallback.`));
      throw err;
    }

    for (const fallbackProvider of fallbacks) {
      console.warn(formatLogLine("warn", null, `[aiProvider] ${provider} ${errType} — falling back to ${fallbackProvider}`));
      try {
        const result = await callProvider(fallbackProvider, prompt, options?.maxTokens, options?.signal, options?.responseFormat);
        recordProviderSuccess(fallbackProvider);
        // ── Sticky fallback: pin this provider so subsequent calls in the same
        // pipeline skip the failing primary entirely.  Expires after
        // STICKY_FALLBACK_TTL_MS so normal selection resumes once the
        // quota/outage window closes.
        _stickyFallbackProvider = fallbackProvider;
        _stickyFallbackExpiry   = Date.now() + STICKY_FALLBACK_TTL_MS;
        console.log(formatLogLine("info", null, `[aiProvider] Pinned ${fallbackProvider} as sticky fallback for ${STICKY_FALLBACK_TTL_MS / 1000}s`));
        return result;
      } catch (fallbackErr) {
        if (isRetryableError(fallbackErr)) {
          // Only trip the circuit breaker for rate-limit failures on non-local
          // providers. Transient 5xx errors don't disable the provider — the
          // backend is temporarily overloaded, not permanently broken.
          if (isRateLimitError(fallbackErr) && fallbackProvider !== "local") {
            recordProviderFailure(fallbackProvider);
          }
          const fallbackErrType = isRateLimitError(fallbackErr) ? "rate-limited" : "transient server error (5xx)";
          console.warn(formatLogLine("warn", null, `[aiProvider] Fallback ${fallbackProvider} ${fallbackErrType} — trying next`));
          continue;
        }
        // Non-retriable error from fallback — throw it
        throw fallbackErr;
      }
    }

    // All fallbacks exhausted — throw the original error
    throw err;
  }
}

/**
 * Parse AI response text as JSON. Strips markdown code fences if present.
 *
 * @param {string} text - Raw AI response text.
 * @returns {Object} Parsed JSON object.
 * @throws {SyntaxError} If the text is not valid JSON after cleanup.
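 *
 * @example
 * parseJSON('```json\n{"ok": true}\n```'); // → { ok: true }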
 */
export function parseJSON(text) {
  const clean = text.trim()
    .replace(/^```json\s*/i, "")
    .replace(/^```\s*/i, "")
    .replace(/\s*```$/i, "");
  return JSON.parse(clean);
}

/**
 * Token-streaming variant of {@link generateText}.
 * Calls `onToken(string)` for each token as it arrives.
 * Returns the full accumulated text when the stream completes.
 *
 * ### Error handling
 * If the streaming call fails with a retryable error (rate limit or
 * transient 5xx) BEFORE any tokens are emitted, we transparently retry
 * via `generateText()` — which applies the full FEA-003 retry + fallback
 * chain and emits the full response as a single synthetic "token". Once
 * tokens have started flowing we can't safely fall back (the user would
 * see two partial responses), so mid-stream failures propagate as-is.
 *
 * Google and Ollama providers never start a real stream — they always
 * delegate to `generateText()` (this module does not use their streaming
 * APIs), so they get fallback for free.
 *
 * @param {string|{system: string, user: string}} promptOrMessages - Plain string or structured messages.
 * @param {function(string): void} onToken - Callback invoked for each token.
 * @param {Object}      [options]
 * @param {number}      [options.maxTokens] - Max output tokens.
 * @param {AbortSignal} [options.signal]    - Abort signal for cancellation.
 * @param {string}      [options.responseFormat] - `"text"` for free-form output; any other value (default) requests JSON.
 * @returns {Promise<string>} The full accumulated response text.
 * @throws {Error} If no AI provider is configured.
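 *
 * @example
 * // Illustrative: stream a free-form chat reply (res is a placeholder for an SSE response stream).
 * const full = await streamText(
 *   { system: "Be concise.", user: "Hello" },
 *   token => res.write(token),
 *   { responseFormat: "text" },
 * );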
 */
export async function streamText(promptOrMessages, onToken, options = {}) {
  const provider = detectProvider();
  if (!provider) throw new Error("No AI provider configured.");

  const { signal, responseFormat } = options;
  const { system, user } = normaliseMessages(promptOrMessages);
  const useJson = responseFormat !== "text";

  // Helper — when a streaming call fails BEFORE any tokens have been emitted,
  // fall back to non-streaming generateText() which has full FEA-003 retry
  // + multi-provider fallback support. The result is delivered as one
  // synthetic "token" (same pattern as Google/Ollama below).
  async function fallbackToNonStreaming(err) {
    console.warn(formatLogLine("warn", null, `[aiProvider] streamText ${provider} failed before any tokens (${err.message?.slice(0, 120)}) — retrying via non-streaming path with provider fallback.`));
    const text = await generateText(promptOrMessages, { ...options, responseFormat });
    onToken(text);
    return text;
  }

  if (provider === "anthropic") {
    let tokensEmitted = 0;
    try {
      const client = new Anthropic({ apiKey: getKey("ANTHROPIC_API_KEY") });
      const params = {
        model: buildProviderMeta().anthropic.model,
        max_tokens: options.maxTokens ?? DEFAULT_MAX_TOKENS,
        messages: [{ role: "user", content: user }],
      };
      if (system) params.system = system;
      const stream = client.messages.stream(params, { signal });
      for await (const chunk of stream) {
        throwIfAborted(signal);
        if (chunk.type === "content_block_delta" && chunk.delta?.text) {
          onToken(chunk.delta.text);
          tokensEmitted++;
        }
      }
      return (await stream.finalMessage()).content[0].text;
    } catch (err) {
      if (err.name === "AbortError" || signal?.aborted) throw err;
      // Only fall back if no tokens were emitted — otherwise the user would see two partial responses.
      if (tokensEmitted === 0 && isRetryableError(err)) return fallbackToNonStreaming(err);
      throw err;
    }
  }

  if (provider === "openai") {
    let tokensEmitted = 0;
    try {
      const client = new OpenAI({ apiKey: getKey("OPENAI_API_KEY") });
      const messages = [];
      if (system) messages.push({ role: "system", content: system });
      messages.push({ role: "user", content: user });
      const params = {
        model: buildProviderMeta().openai.model,
        max_tokens: options.maxTokens ?? DEFAULT_MAX_TOKENS,
        stream: true,
        messages,
      };
      if (useJson) params.response_format = { type: "json_object" };
      const stream = await client.chat.completions.create(params, { signal });
      let full = "";
      for await (const chunk of stream) {
        throwIfAborted(signal);
        const token = chunk.choices[0]?.delta?.content ?? "";
        if (token) { full += token; onToken(token); tokensEmitted++; }
      }
      return full;
    } catch (err) {
      if (err.name === "AbortError" || signal?.aborted) throw err;
      if (tokensEmitted === 0 && isRetryableError(err)) return fallbackToNonStreaming(err);
      throw err;
    }
  }

  // Google / Ollama — no streaming SDK; deliver whole response as one token.
  // generateText() handles retry + fallback internally so these providers
  // get FEA-003 coverage for free.
  const text = await generateText(promptOrMessages, { ...options, responseFormat });
  onToken(text);
  return text;
}