Source code for hypotestx.core.llm.backends.huggingface

"""
HuggingFace backend — two modes:

1. Inference API  (``use_local=False``, default)
   - Free tier with rate limits
   - No GPU needed — model runs on HF servers
   - Get token: https://huggingface.co/settings/tokens

2. Local transformers  (``use_local=True``)
   - Requires:  pip install transformers torch
   - Model downloaded once to ~/.cache/huggingface
   - Runs on your CPU/GPU, completely offline after download

Recommended free models for routing:
    microsoft/Phi-3.5-mini-instruct   ~7 GB  best quality locally
    Qwen/Qwen2.5-3B-Instruct          ~6 GB  good balance
    TinyLlama/TinyLlama-1.1B-Chat-v1.0  ~2 GB  very fast, lower quality

Usage:
    # Inference API (cloud, free)
    result = hx.analyze(df, "...",
                         backend=HuggingFaceBackend(token="hf_..."))

    # Local inference
    result = hx.analyze(df, "...",
                         backend=HuggingFaceBackend(
                             model="microsoft/Phi-3.5-mini-instruct",
                             use_local=True))
"""

from __future__ import annotations

import json
import urllib.error
import urllib.request
from typing import Any, Dict, List, Optional

from ..base import LLMBackend

_HF_API_URL = "https://api-inference.huggingface.co/models"
_DEFAULT_API_MODEL = "HuggingFaceH4/zephyr-7b-beta"
_DEFAULT_LOCAL_MODEL = "microsoft/Phi-3.5-mini-instruct"



[docs]
class HuggingFaceBackend(LLMBackend):
    """
    HuggingFace backend (Inference API or local transformers).

    Args:
        token:       HF access token. When non-empty it is sent as a
                     ``Bearer`` header to the Inference API, or passed as
                     ``token=`` to the local pipeline.
        model:       Model repo ID. When empty, falls back to
                     ``_DEFAULT_LOCAL_MODEL`` (``use_local=True``) or
                     ``_DEFAULT_API_MODEL`` (``use_local=False``).
        use_local:   If True, load the model locally via ``transformers``.
        timeout:     HTTP timeout in seconds for the Inference API (default: 60).
        max_tokens:  Maximum new tokens (default: 512).
        device:      PyTorch device for local inference (``"cpu"`` or ``"cuda"``).
        load_kwargs: Extra keyword arguments forwarded to
                     ``transformers.pipeline()``. A ``device`` entry here
                     overrides the ``device`` argument; a ``device_map``
                     entry suppresses it.
    """

    name = "huggingface"

    def __init__(
        self,
        token: str = "",
        model: str = "",
        use_local: bool = False,
        timeout: int = 60,
        max_tokens: int = 512,
        device: str = "cpu",
        load_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        self.token = token
        self.use_local = use_local
        # An empty model name selects the mode-specific default.
        self.model = model or (_DEFAULT_LOCAL_MODEL if use_local else _DEFAULT_API_MODEL)
        self.timeout = timeout
        self.max_tokens = max_tokens
        self.device = device
        self.load_kwargs = load_kwargs or {}
        self._pipeline: Any = None  # lazy-loaded local pipeline

    # ------------------------------------------------------------------ #
    # LLMBackend interface                                                 #
    # ------------------------------------------------------------------ #

    def chat(self, messages: List[Dict[str, str]]) -> str:
        """
        Generate a reply for *messages*.

        Dispatches to the local pipeline when ``use_local`` is set and to
        the Inference API otherwise.

        Args:
            messages: Chat turns as dicts with ``"role"`` and ``"content"`` keys.

        Returns:
            The generated text.
        """
        if self.use_local:
            return self._local_chat(messages)
        return self._api_chat(messages)

    # ------------------------------------------------------------------ #
    # Inference API path                                                   #
    # ------------------------------------------------------------------ #

    def _api_chat(self, messages: List[Dict[str, str]]) -> str:
        """
        Use the HuggingFace Inference API (text-generation task).

        Raises:
            RuntimeError: On HTTP errors (503 is reported as "model is
                loading"), on connection failures or timeouts, and when the
                response body is not valid JSON.
        """
        # Flatten messages into a single prompt (chat template best-effort)
        prompt = _format_messages_as_prompt(messages)

        payload = json.dumps(
            {
                "inputs": prompt,
                "parameters": {
                    "max_new_tokens": self.max_tokens,
                    "return_full_text": False,
                },
            }
        ).encode("utf-8")

        url = f"{_HF_API_URL}/{self.model}"
        headers: Dict[str, str] = {"Content-Type": "application/json"}
        if self.token:
            headers["Authorization"] = f"Bearer {self.token}"

        req = urllib.request.Request(url, data=payload, headers=headers)
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                raw = resp.read()
        except urllib.error.HTTPError as exc:
            body = exc.read().decode("utf-8", errors="replace")
            if exc.code == 503:
                raise RuntimeError(
                    f"[HuggingFace] Model '{self.model}' is loading. Wait ~20s and retry."
                ) from exc
            raise RuntimeError(f"[HuggingFace] HTTP {exc.code}: {body}") from exc
        except urllib.error.URLError as exc:
            raise RuntimeError(f"[HuggingFace] Connection error: {exc.reason}") from exc
        except OSError as exc:
            # Timeouts or resets while reading the body are plain OSError
            # subclasses, not URLError; report them the same way.
            raise RuntimeError(f"[HuggingFace] Connection error: {exc}") from exc

        try:
            # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
            data = json.loads(raw.decode("utf-8"))
        except ValueError as exc:
            raise RuntimeError("[HuggingFace] Response was not valid JSON.") from exc

        if isinstance(data, list) and data:
            first = data[0]
            if isinstance(first, dict):
                return first.get("generated_text", "")
            # Unexpected element type: fall back to the raw payload text
            # instead of failing on a missing ``.get``.
            return str(data)
        if isinstance(data, dict):
            return data.get("generated_text", str(data))
        return str(data)

    # ------------------------------------------------------------------ #
    # Local transformers path                                              #
    # ------------------------------------------------------------------ #

    def _local_chat(self, messages: List[Dict[str, str]]) -> str:
        """
        Run inference locally via the transformers library.

        The message list is passed to the pipeline first; if that call
        raises ``TypeError`` the messages are flattened to a single ChatML
        prompt and the call is retried.
        """
        pipe = self._get_pipeline()
        gen_kwargs: Dict[str, Any] = {
            "max_new_tokens": self.max_tokens,
            "do_sample": False,
            "return_full_text": False,
        }

        # Only the pipeline call is guarded, so a TypeError while reading the
        # result cannot trigger a second (expensive) generation.
        try:
            # Modern chat-template API
            output = pipe(messages, **gen_kwargs)
        except TypeError:
            # Older models: flatten to text
            output = pipe(_format_messages_as_prompt(messages), **gen_kwargs)

        generated = output[0]["generated_text"]
        if isinstance(generated, list):
            # Defensive: if the pipeline hands back the conversation as a list
            # of message dicts, the reply is the last entry's content.
            last = generated[-1] if generated else {}
            return last.get("content", "") if isinstance(last, dict) else str(last)
        return generated

    def _get_pipeline(self):
        """
        Lazy-load (and cache) the local transformers pipeline.

        Raises:
            ImportError: If ``transformers`` cannot be imported.
        """
        if self._pipeline is not None:
            return self._pipeline
        try:
            from transformers import pipeline  # type: ignore
        except ImportError as exc:
            raise ImportError(
                "Local HuggingFace inference requires transformers + torch:\n"
                "  pip install transformers torch"
            ) from exc

        print(
            f"[HypoTestX] Loading '{self.model}' locally on {self.device}. "
            "First run downloads the model weights (~GB)."
        )
        kwargs: Dict[str, Any] = {"device": self.device, **self.load_kwargs}
        if "device_map" in self.load_kwargs and "device" not in self.load_kwargs:
            # transformers documents ``device`` and ``device_map`` as
            # conflicting; let the caller's explicit device_map win.
            del kwargs["device"]
        if self.token:
            kwargs["token"] = self.token
        self._pipeline = pipeline("text-generation", model=self.model, **kwargs)
        return self._pipeline

    def __repr__(self) -> str:
        mode = "local" if self.use_local else "api"
        return f"<HuggingFaceBackend model='{self.model}' mode='{mode}'>"



# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _format_messages_as_prompt(messages: List[Dict[str, str]]) -> str:
    """
    Render a message list as a single ChatML prompt string.

    Fallback for models that cannot take a message list directly. Each
    message with role ``system``, ``user`` or ``assistant`` becomes one
    ``<|im_start|>role ... <|im_end|>`` segment; messages with any other
    role are skipped. An opening assistant tag is appended last so the
    model continues with its reply.
    """
    recognised_roles = ("system", "user", "assistant")
    # Both keys are read for every message, so a missing "role" or
    # "content" raises KeyError even when the role would be skipped.
    turns = ((entry["role"], entry["content"]) for entry in messages)
    segments = [
        f"<|im_start|>{speaker}\n{text}<|im_end|>"
        for speaker, text in turns
        if speaker in recognised_roles
    ]
    segments.append("<|im_start|>assistant\n")
    return "\n".join(segments)