Source code for scripts.ai_assistant.ui.providers

"""LLM provider configuration and Ollama model helpers."""

from __future__ import annotations

import json
import logging
import subprocess
import time
import urllib.error
import urllib.request
from typing import Any

import config
from scripts.ai_assistant.ollama_config import get_ollama_base_url

logger = logging.getLogger(__name__)

_OTHER_MODEL_OPTION = "Other (type below)"
_OLLAMA_FALLBACK_MODELS = ["qwen3:8b", "qwen3:4b", "qwen3:1.7b", "mistral:latest", "gemma3:12b"]
_OLLAMA_NON_CHAT_PREFIXES = (
    "all-minilm",
    "bge",
    "granite-embedding",
    "jina-embeddings",
    "mxbai",
    "nomic-embed",
    "snowflake-arctic-embed",
)
_OLLAMA_NON_CHAT_TOKENS = ("embed", "embedding", "rerank")
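
# Illustrative behaviour of the filters above (enforced by
# ``_is_ollama_chat_model`` below); the model names here are examples only:
#
#     _is_ollama_chat_model("qwen3:8b")           # True  - chat model
#     _is_ollama_chat_model("nomic-embed-text")   # False - prefix match
#     _is_ollama_chat_model("bge-m3")             # False - prefix match
#     _is_ollama_chat_model("qwen3-reranker:4b")  # False - "rerank" token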


def _normalise_ollama_model_name(model_name: str) -> str:
    """Normalise ``foo`` and ``foo:latest`` to the same logical model."""
    model = model_name.strip()
    if model.endswith(":latest"):
        return model[:-7]
    return model
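
# For example (illustrative):
#
#     _normalise_ollama_model_name("mistral:latest")  # -> "mistral"
#     _normalise_ollama_model_name(" qwen3:8b ")      # -> "qwen3:8b"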


# Pure downgrade ladder: the ordered list of qwen3 chat tags we prefer when
# the configured model is not installed locally. Memory budget drops
# left→right, so the first installed tag is the largest the host can
# actually serve. Consumed by :func:`preferred_or_installed_downgrade` —
# exposed so tests can extend the ladder without monkey-patching.
QWEN3_DOWNGRADE_LADDER: tuple[str, ...] = (
    "qwen3:235b",
    "qwen3:32b",
    "qwen3:30b",
    "qwen3:14b",
    "qwen3:8b",
    "qwen3:4b",
    "qwen3:1.7b",
)


def preferred_or_installed_downgrade(
    preferred: str,
    installed: list[str] | tuple[str, ...],
) -> str | None:
    """Resolve *preferred* against *installed*, downgrading when needed.

    Returns the preferred tag when it (or its ``:latest`` equivalent) is
    installed. Otherwise walks :data:`QWEN3_DOWNGRADE_LADDER` from the
    preferred size downward and returns the first installed tag; when
    nothing at or below the preferred size is installed, the largest
    installed tag above it is returned instead. Returns ``None`` when no
    qwen3 tag is installed at all; callers should treat that as "ask the
    operator" rather than silently picking a non-qwen3 tag.

    Hardware reality on the current dev box: ``qwen3:8b`` OOMs at ~3 GiB
    free, so a user configuring ``qwen3:8b`` gets silently downgraded to
    ``qwen3:1.7b`` rather than hitting an inference-time crash.
    """
    if not preferred or not installed:
        return None
    installed_set = {_normalise_ollama_model_name(m) for m in installed}
    preferred_norm = _normalise_ollama_model_name(preferred)
    if preferred_norm in installed_set:
        return preferred
    if preferred_norm not in QWEN3_DOWNGRADE_LADDER:
        return None
    start = QWEN3_DOWNGRADE_LADDER.index(preferred_norm)
    # Walk downward first: a true downgrade to the largest tag that fits.
    for candidate in QWEN3_DOWNGRADE_LADDER[start:]:
        if candidate in installed_set:
            return candidate
    # Nothing smaller is installed; accept a larger installed tag instead.
    for candidate in QWEN3_DOWNGRADE_LADDER[:start]:
        if candidate in installed_set:
            return candidate
    return None
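
# Illustrative resolution behaviour (the installed model sets below are
# hypothetical, not read from any real host):
#
#     preferred_or_installed_downgrade("qwen3:8b", ["qwen3:8b", "mistral:latest"])
#     # -> "qwen3:8b"   (preferred tag is installed)
#     preferred_or_installed_downgrade("qwen3:8b", ["qwen3:1.7b"])
#     # -> "qwen3:1.7b" (largest installed tag at or below the preferred size)
#     preferred_or_installed_downgrade("qwen3:4b", ["qwen3:8b"])
#     # -> "qwen3:8b"   (nothing smaller installed, so a larger tag is returned)
#     preferred_or_installed_downgrade("llama3:8b", ["qwen3:8b"])
#     # -> None         (preferred tag is not on the qwen3 ladder)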


def _ollama_models_match(candidate: str, desired: str) -> bool:
    """Return True when two Ollama names differ only by the implicit tag."""
    return bool(candidate and desired) and (
        _normalise_ollama_model_name(candidate) == _normalise_ollama_model_name(desired)
    )


def _is_ollama_chat_model(model_name: str) -> bool:
    """Hide obvious embedding/reranker tags from the chat-model selector."""
    lowered = _normalise_ollama_model_name(model_name).lower()
    if not lowered:
        return False
    if lowered.startswith(_OLLAMA_NON_CHAT_PREFIXES):
        return False
    return not any(token in lowered for token in _OLLAMA_NON_CHAT_TOKENS)


def _clean_ollama_model_names(raw_models: list[str]) -> list[str]:
    """Return deduplicated model names in a stable sorted order."""
    return sorted({name.strip() for name in raw_models if name and name.strip()})


def _build_ollama_selector_state(
    discovered_models: list[str],
    *,
    source: str,
    saved_model: str,
    configured_model: str,
    default_model: str,
) -> dict[str, Any]:
    """Build safe selector options for Ollama chat models.

    When model discovery falls back to static defaults, keep the selector on
    a known configured model if possible, otherwise force manual confirmation
    instead of auto-picking an arbitrary local tag.
    """
    real_models = [model for model in discovered_models if model != _OTHER_MODEL_OPTION]
    chat_models = [model for model in real_models if _is_ollama_chat_model(model)]
    hidden_models = [model for model in real_models if model not in chat_models]
    options = [*chat_models, _OTHER_MODEL_OPTION]

    sticky_preferences = [saved_model.strip(), configured_model.strip()]
    selected_model = next(
        (
            candidate
            for preferred in sticky_preferences
            if preferred
            for candidate in chat_models
            if _ollama_models_match(candidate, preferred)
        ),
        None,
    )
    if selected_model is None and source in {"api", "cli"} and chat_models:
        selected_model = chat_models[0]
    if selected_model is None:
        selected_model = _OTHER_MODEL_OPTION

    fallback_hint = next(
        (preferred for preferred in [*sticky_preferences, default_model] if preferred),
        default_model,
    )
    if selected_model != _OTHER_MODEL_OPTION:
        fallback_hint = selected_model

    return {
        "options": options,
        "index": options.index(selected_model),
        "fallback_hint": fallback_hint,
        "hidden_models": hidden_models,
    }


def _get_ollama_base_url() -> str:
    """Return the Ollama API base URL for UI model discovery."""
    return get_ollama_base_url()


def _try_start_ollama(base_url: str, *, max_wait: int = 10) -> bool:
    """Attempt to start Ollama if not reachable. Returns *True* if now reachable."""
    # Already reachable?
    try:
        with urllib.request.urlopen(f"{base_url}/api/tags", timeout=2) as resp:  # noqa: S310
            resp.read()
        return True
    except (urllib.error.URLError, OSError):
        pass

    # Try launching the server
    try:
        subprocess.Popen(
            ["ollama", "serve"],  # noqa: S607
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except FileNotFoundError:
        return False

    # Poll until ready
    for _ in range(max_wait):
        time.sleep(1)
        try:
            with urllib.request.urlopen(f"{base_url}/api/tags", timeout=2) as resp:  # noqa: S310
                resp.read()
            return True
        except (urllib.error.URLError, OSError):
            continue
    return False


def _get_ollama_models() -> tuple[list[str], str]:
    """Return models currently installed in the local Ollama server.

    Tries the Ollama REST API first (``/api/tags``), falls back to
    ``ollama list`` via subprocess. If both fail, attempts to auto-start
    Ollama and retries once before returning hardcoded defaults.
    """
    base_url = _get_ollama_base_url()
    fallback = [*_OLLAMA_FALLBACK_MODELS, _OTHER_MODEL_OPTION]

    # 1. REST API
    try:
        with urllib.request.urlopen(  # noqa: S310
            f"{base_url}/api/tags", timeout=2
        ) as resp:
            data = json.loads(resp.read())
        names = _clean_ollama_model_names(
            [
                m.get("name", "").strip() or m.get("model", "").strip()
                for m in data.get("models", [])
                if isinstance(m, dict)
            ]
        )
        if names:
            return [*names, _OTHER_MODEL_OPTION], "api"
    except (urllib.error.URLError, OSError, json.JSONDecodeError, KeyError):
        pass

    # 2. subprocess fallback
    try:
        ollama_bin = subprocess.run(
            ["ollama", "list"],  # noqa: S607
            capture_output=True,
            text=True,
            timeout=4,
            check=False,
        )
    except (FileNotFoundError, subprocess.TimeoutExpired):
        ollama_bin = None
    if ollama_bin is not None and ollama_bin.returncode == 0:
        names = _clean_ollama_model_names(
            [line.split()[0] for line in ollama_bin.stdout.splitlines()[1:] if line.strip()]
        )
        if names:
            return [*names, _OTHER_MODEL_OPTION], "cli"

    # 3. Auto-start Ollama and retry the API once
    if _try_start_ollama(base_url):
        try:
            with urllib.request.urlopen(  # noqa: S310
                f"{base_url}/api/tags", timeout=2
            ) as resp:
                data = json.loads(resp.read())
            names = _clean_ollama_model_names(
                [
                    m.get("name", "").strip() or m.get("model", "").strip()
                    for m in data.get("models", [])
                    if isinstance(m, dict)
                ]
            )
            if names:
                return [*names, _OTHER_MODEL_OPTION], "api"
        except (urllib.error.URLError, OSError, json.JSONDecodeError, KeyError):
            pass

    return fallback, "fallback"


_PROVIDER_CONFIG: dict[str, dict] = {
    # Ollama is listed first — works offline, no API key required.
    # Pull a model with: ollama pull qwen3:8b
    "Ollama (local)": {
        "provider": "ollama",
        "env_var": None,
        "default_model": "qwen3:8b",
        "needs_key": False,
        "models": [
            "qwen3:1.7b",
            "qwen3:4b",
            "qwen3:8b",
            "qwen3:14b",
            "qwen3:30b",
            "qwen3:32b",
            "qwen3:235b",
            "qwen3-coder:30b",
            "qwen3-next:80b",
            "mistral:latest",
            "gemma3:12b",
            "gemma3:27b",
            "deepseek-r1:8b",
            _OTHER_MODEL_OPTION,
        ],
    },
    "Anthropic": {
        "provider": "anthropic",
        "env_var": "ANTHROPIC_API_KEY",
        "default_model": "claude-opus-4-7",
        "needs_key": True,
        "models": [
            "claude-opus-4-7",
            "claude-opus-4-6",
            "claude-sonnet-4-6",
            "claude-opus-4-5-20251101",
            "claude-sonnet-4-5-20250929",
            "claude-haiku-4-5-20251001",
            _OTHER_MODEL_OPTION,
        ],
    },
    "OpenAI": {
        "provider": "openai",
        "env_var": "OPENAI_API_KEY",
        "default_model": "gpt-5.5",
        "needs_key": True,
        "models": [
            "gpt-5.5",
            "gpt-5.4",
            "gpt-5.4-mini",
            "gpt-5-nano",
            _OTHER_MODEL_OPTION,
        ],
    },
    "Google Gemini": {
        "provider": "google-genai",
        "env_var": "GOOGLE_API_KEY",
        "default_model": "gemini-3.1-pro-preview",
        "needs_key": True,
        "models": [
            "gemini-3.1-pro-preview",
            "gemini-3-flash-preview",
            "gemini-3.1-flash-lite-preview",
            "gemini-2.5-pro",
            "gemini-2.5-flash",
            "gemini-2.5-flash-lite",
            _OTHER_MODEL_OPTION,
        ],
    },
    "NVIDIA AI Endpoints": {
        "provider": "nvidia-ai-endpoints",
        "env_var": "NVIDIA_API_KEY",
        "default_model": "moonshotai/kimi-k2.5",
        "needs_key": True,
        "models": [
            "moonshotai/kimi-k2.5",
            "meta/llama-3.3-70b-instruct",
            "mistralai/mistral-large-2-instruct",
            "nvidia/llama-3.1-nemotron-ultra-253b-v1",
            "deepseek-ai/deepseek-r1",
            "qwen/qwen3-235b-a22b",
            _OTHER_MODEL_OPTION,
        ],
    },
}


def _default_provider_label() -> str:
    """Map the live config provider id back to the UI label."""
    for label, provider_cfg in _PROVIDER_CONFIG.items():
        if provider_cfg["provider"] == config.LLM_PROVIDER:
            return label
    return "Ollama (local)"
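

# Illustrative wiring (hypothetical caller, not part of this module): a UI
# layer would typically combine discovery with selector-state construction,
# then hand the result to whatever widget renders the dropdown:
#
#     models, source = _get_ollama_models()
#     state = _build_ollama_selector_state(
#         models,
#         source=source,
#         saved_model="",               # e.g. from session state
#         configured_model="qwen3:8b",  # e.g. from config
#         default_model="qwen3:8b",
#     )
#     # state["options"][state["index"]] is the tag to preselect;
#     # state["hidden_models"] lists embedding/reranker tags that were filtered.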