"""LLM provider configuration and Ollama model helpers."""
from __future__ import annotations
import json
import logging
import subprocess
import time
import urllib.error
import urllib.request
from typing import Any
import config
from scripts.ai_assistant.ollama_config import get_ollama_base_url
logger = logging.getLogger(__name__)
_OTHER_MODEL_OPTION = "Other (type below)"
_OLLAMA_FALLBACK_MODELS = ["qwen3:8b", "qwen3:4b", "qwen3:1.7b", "mistral:latest", "gemma3:12b"]
_OLLAMA_NON_CHAT_PREFIXES = (
"all-minilm",
"bge",
"granite-embedding",
"jina-embeddings",
"mxbai",
"nomic-embed",
"snowflake-arctic-embed",
)
_OLLAMA_NON_CHAT_TOKENS = ("embed", "embedding", "rerank")
def _normalise_ollama_model_name(model_name: str) -> str:
"""Normalise ``foo`` and ``foo:latest`` to the same logical model."""
model = model_name.strip()
if model.endswith(":latest"):
return model[:-7]
return model
# Pure downgrade ladder: the ordered list of qwen3 chat tags we prefer when
# the configured model is not installed locally. Memory budget drops left→
# right, so the first installed tag is the largest the host can actually
# serve. Consumed by :func:`preferred_or_installed_downgrade` — exposed so
# tests can extend the ladder without monkey-patching.
QWEN3_DOWNGRADE_LADDER: tuple[str, ...] = (
"qwen3:235b",
"qwen3:32b",
"qwen3:30b",
"qwen3:14b",
"qwen3:8b",
"qwen3:4b",
"qwen3:1.7b",
)
def preferred_or_installed_downgrade(
preferred: str,
installed: list[str] | tuple[str, ...],
) -> str | None:
"""Resolve *preferred* against *installed*, downgrading when needed.
    Returns the preferred tag when it (or its ``:latest`` equivalent) is
    installed. Otherwise walks :data:`QWEN3_DOWNGRADE_LADDER` from the
    preferred size downward and returns the first installed tag; if nothing
    at or below the preferred size is installed, it falls back to the
    largest installed tag above it. Returns ``None`` when the preferred
    model is not a qwen3 ladder tag or when no ladder tag is installed at
    all — callers should treat that as "ask the operator" rather than
    silently picking a non-qwen3 tag.

    Hardware reality on the current dev box: ``qwen3:8b`` OOMs at ~3 GiB
    free, so a user configuring ``qwen3:8b`` is silently downgraded to
    ``qwen3:1.7b`` instead of hitting an inference-time crash.
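
    Example (illustrative; assumes only ``qwen3:4b`` is pulled locally)::

        >>> preferred_or_installed_downgrade("qwen3:8b", ["qwen3:4b", "mistral:latest"])
        'qwen3:4b'
        >>> preferred_or_installed_downgrade("mistral", ["qwen3:4b"]) is None
        True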
"""
if not preferred or not installed:
return None
installed_set = {_normalise_ollama_model_name(m) for m in installed}
preferred_norm = _normalise_ollama_model_name(preferred)
if preferred_norm in installed_set:
return preferred
if preferred_norm not in QWEN3_DOWNGRADE_LADDER:
return None
start = QWEN3_DOWNGRADE_LADDER.index(preferred_norm)
for candidate in QWEN3_DOWNGRADE_LADDER[start:]:
if candidate in installed_set:
return candidate
for candidate in QWEN3_DOWNGRADE_LADDER[:start]:
if candidate in installed_set:
return candidate
return None
def _ollama_models_match(candidate: str, desired: str) -> bool:
"""Return True when two Ollama names differ only by the implicit tag."""
return bool(candidate and desired) and (
_normalise_ollama_model_name(candidate) == _normalise_ollama_model_name(desired)
)
def _is_ollama_chat_model(model_name: str) -> bool:
"""Hide obvious embedding/reranker tags from the chat-model selector."""
lowered = _normalise_ollama_model_name(model_name).lower()
if not lowered:
return False
if lowered.startswith(_OLLAMA_NON_CHAT_PREFIXES):
return False
return not any(token in lowered for token in _OLLAMA_NON_CHAT_TOKENS)
def _clean_ollama_model_names(raw_models: list[str]) -> list[str]:
"""Return deduplicated model names in a stable sorted order."""
names = sorted({name.strip() for name in raw_models if name and name.strip()})
return names
def _build_ollama_selector_state(
discovered_models: list[str],
*,
source: str,
saved_model: str,
configured_model: str,
default_model: str,
) -> dict[str, Any]:
"""Build safe selector options for Ollama chat models.
When model discovery falls back to static defaults, keep the selector on a
known configured model if possible, otherwise force manual confirmation
instead of auto-picking an arbitrary local tag.
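
    Example (illustrative; assumes ``qwen3:4b`` was discovered via the API)::

        >>> state = _build_ollama_selector_state(
        ...     ["qwen3:4b", "nomic-embed-text"],
        ...     source="api",
        ...     saved_model="",
        ...     configured_model="qwen3:4b",
        ...     default_model="qwen3:8b",
        ... )
        >>> state["options"]
        ['qwen3:4b', 'Other (type below)']
        >>> state["hidden_models"]
        ['nomic-embed-text']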
"""
real_models = [model for model in discovered_models if model != _OTHER_MODEL_OPTION]
chat_models = [model for model in real_models if _is_ollama_chat_model(model)]
hidden_models = [model for model in real_models if model not in chat_models]
options = [*chat_models, _OTHER_MODEL_OPTION]
sticky_preferences = [saved_model.strip(), configured_model.strip()]
selected_model = next(
(
candidate
for preferred in sticky_preferences
if preferred
for candidate in chat_models
if _ollama_models_match(candidate, preferred)
),
None,
)
if selected_model is None and source in {"api", "cli"} and chat_models:
selected_model = chat_models[0]
if selected_model is None:
selected_model = _OTHER_MODEL_OPTION
fallback_hint = next(
(preferred for preferred in [*sticky_preferences, default_model] if preferred),
default_model,
)
if selected_model != _OTHER_MODEL_OPTION:
fallback_hint = selected_model
return {
"options": options,
"index": options.index(selected_model),
"fallback_hint": fallback_hint,
"hidden_models": hidden_models,
}
def _get_ollama_base_url() -> str:
"""Return the Ollama API base URL for UI model discovery."""
return get_ollama_base_url()
def _try_start_ollama(base_url: str, *, max_wait: int = 10) -> bool:
"""Attempt to start Ollama if not reachable. Returns *True* if now reachable."""
# Already reachable?
try:
with urllib.request.urlopen(f"{base_url}/api/tags", timeout=2) as resp: # noqa: S310
resp.read()
return True
except (urllib.error.URLError, OSError):
pass
# Try launching the server
try:
subprocess.Popen(
["ollama", "serve"], # noqa: S607
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
except FileNotFoundError:
return False
# Poll until ready
for _ in range(max_wait):
time.sleep(1)
try:
with urllib.request.urlopen(f"{base_url}/api/tags", timeout=2) as resp: # noqa: S310
resp.read()
return True
except (urllib.error.URLError, OSError):
continue
return False
def _get_ollama_models() -> tuple[list[str], str]:
"""Return models currently installed in the local Ollama server.
    Tries the Ollama REST API (``/api/tags``) first, then falls back to
    ``ollama list`` via subprocess. If both fail, attempts to auto-start
    Ollama and retries the API once before returning hardcoded defaults.
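
    Illustrative return value (assuming a single locally pulled model)::

        (["qwen3:4b", "Other (type below)"], "api")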
"""
base_url = _get_ollama_base_url()
fallback = [*_OLLAMA_FALLBACK_MODELS, _OTHER_MODEL_OPTION]
# 1. REST API
try:
with urllib.request.urlopen( # noqa: S310
f"{base_url}/api/tags", timeout=2
) as resp:
data = json.loads(resp.read())
names = _clean_ollama_model_names(
[
m.get("name", "").strip() or m.get("model", "").strip()
for m in data.get("models", [])
if isinstance(m, dict)
]
)
if names:
return [*names, _OTHER_MODEL_OPTION], "api"
except (urllib.error.URLError, OSError, json.JSONDecodeError, KeyError):
pass
# 2. subprocess fallback
try:
ollama_bin = subprocess.run(
["ollama", "list"], # noqa: S607
capture_output=True,
text=True,
timeout=4,
check=False,
)
except (FileNotFoundError, subprocess.TimeoutExpired):
ollama_bin = None
if ollama_bin is not None and ollama_bin.returncode == 0:
names = _clean_ollama_model_names(
[line.split()[0] for line in ollama_bin.stdout.splitlines()[1:] if line.strip()]
)
if names:
return [*names, _OTHER_MODEL_OPTION], "cli"
# 3. Auto-start Ollama and retry the API once
if _try_start_ollama(base_url):
try:
with urllib.request.urlopen( # noqa: S310
f"{base_url}/api/tags", timeout=2
) as resp:
data = json.loads(resp.read())
names = _clean_ollama_model_names(
[
m.get("name", "").strip() or m.get("model", "").strip()
for m in data.get("models", [])
if isinstance(m, dict)
]
)
if names:
return [*names, _OTHER_MODEL_OPTION], "api"
except (urllib.error.URLError, OSError, json.JSONDecodeError, KeyError):
pass
return fallback, "fallback"
_PROVIDER_CONFIG: dict[str, dict[str, Any]] = {
# Ollama is listed first — works offline, no API key required.
# Pull a model with: ollama pull qwen3:8b
"Ollama (local)": {
"provider": "ollama",
"env_var": None,
"default_model": "qwen3:8b",
"needs_key": False,
"models": [
"qwen3:1.7b",
"qwen3:4b",
"qwen3:8b",
"qwen3:14b",
"qwen3:30b",
"qwen3:32b",
"qwen3:235b",
"qwen3-coder:30b",
"qwen3-next:80b",
"mistral:latest",
"gemma3:12b",
"gemma3:27b",
"deepseek-r1:8b",
_OTHER_MODEL_OPTION,
],
},
"Anthropic": {
"provider": "anthropic",
"env_var": "ANTHROPIC_API_KEY",
"default_model": "claude-opus-4-7",
"needs_key": True,
"models": [
"claude-opus-4-7",
"claude-opus-4-6",
"claude-sonnet-4-6",
"claude-opus-4-5-20251101",
"claude-sonnet-4-5-20250929",
"claude-haiku-4-5-20251001",
_OTHER_MODEL_OPTION,
],
},
"OpenAI": {
"provider": "openai",
"env_var": "OPENAI_API_KEY",
"default_model": "gpt-5.5",
"needs_key": True,
"models": [
"gpt-5.5",
"gpt-5.4",
"gpt-5.4-mini",
"gpt-5-nano",
_OTHER_MODEL_OPTION,
],
},
"Google Gemini": {
"provider": "google-genai",
"env_var": "GOOGLE_API_KEY",
"default_model": "gemini-3.1-pro-preview",
"needs_key": True,
"models": [
"gemini-3.1-pro-preview",
"gemini-3-flash-preview",
"gemini-3.1-flash-lite-preview",
"gemini-2.5-pro",
"gemini-2.5-flash",
"gemini-2.5-flash-lite",
_OTHER_MODEL_OPTION,
],
},
"NVIDIA AI Endpoints": {
"provider": "nvidia-ai-endpoints",
"env_var": "NVIDIA_API_KEY",
"default_model": "moonshotai/kimi-k2.5",
"needs_key": True,
"models": [
"moonshotai/kimi-k2.5",
"meta/llama-3.3-70b-instruct",
"mistralai/mistral-large-2-instruct",
"nvidia/llama-3.1-nemotron-ultra-253b-v1",
"deepseek-ai/deepseek-r1",
"qwen/qwen3-235b-a22b",
_OTHER_MODEL_OPTION,
],
},
}
def _default_provider_label() -> str:
"""Map the live config provider id back to the UI label."""
for label, provider_cfg in _PROVIDER_CONFIG.items():
if provider_cfg["provider"] == config.LLM_PROVIDER:
return label
return "Ollama (local)"