# Source code for hypotestx.core.llm.backends.fallback

"""
Regex/rule-based fallback backend — zero dependencies, zero API calls.

Used automatically when no LLM backend is specified.
Handles the most common question patterns with reasonable accuracy.
For production use or complex questions, switch to a real LLM backend.
"""

from __future__ import annotations

import re
from typing import Dict, List, Optional, Tuple

from ..base import LLMBackend, RoutingResult, SchemaInfo

# ---------------------------------------------------------------------------
# Pattern tables
# ---------------------------------------------------------------------------

_TESTS_BY_KEYWORD = [
    # (regex, test_key, default_alternative)
    #
    # Rows are searched in order against the lower-cased question and the
    # FIRST match wins.  Narrowly-worded tests therefore have to come before
    # the broad patterns that would otherwise shadow them.
    #
    # Rank correlation — must precede Pearson, whose "correlat" also matches
    # "spearman correlation" / "rank correlation".  "monoton" covers both
    # "monotone" and "monotonic".
    (r"rank.*corr|spearman|monoton|ordinal corr", "spearman", "two-sided"),
    # Correlations
    (
        r"correlat|related to|linear relationship|predict.*from|"
        r"association.*numeric|scatter",
        "pearson",
        "two-sided",
    ),
    # Fisher's exact test — must precede chi-square, whose "association" /
    # "independent" keywords also appear in Fisher questions.
    (r"fisher", "fisher", "two-sided"),
    # Categorical association.  The lookahead keeps "independent samples" /
    # "independent groups" (a two-sample phrasing) from being routed here.
    (
        r"association|independent(?![\s-]*(?:samples?|groups?))|"
        r"chi.square|chi2|contingency|"
        r"relationship.*categor|categor.*relationship",
        "chi_square",
        "two-sided",
    ),
    # One-sample
    (
        r"mean.*equal|equal.*mean|mean.*differ.*\d|differ.*\d.*mean|"
        r"average.*\d|\d.*average|test.*mean|mean.*test|"
        r"is.*mean|population mean",
        "one_sample_ttest",
        "two-sided",
    ),
    # Paired
    (
        r"before.*after|pre.*post|paired|repeated.*measure|"
        r"within.*subject|change over time|same.*subject",
        "paired_ttest",
        "two-sided",
    ),
    # Kruskal-Wallis — must precede ANOVA, whose "across.*group" style
    # patterns also match "non-parametric ... across groups".
    (r"kruskal|non.param.*group|group.*non.param", "kruskal_wallis", "two-sided"),
    # ANOVA / multi-group
    (
        r"anova|more than two|three.*group|multiple.*group|"
        r"several.*group|across.*group|among.*group",
        "anova",
        "two-sided",
    ),
    # Two-sample
    (
        r"compar.*mean|mean.*compar|differ.*group|group.*differ|"
        r"between.*group|higher.*than|lower.*than|more.*than.*less.*than|"
        r"male.*female|female.*male|group.*a.*group.*b|"
        r"two.*group|independen.*sample",
        "two_sample_ttest",
        "two-sided",
    ),
]

# Alternative-hypothesis direction -> regex of cue words, matched against the
# lower-cased question.  Entries are tried in insertion order, so "greater"
# wins when a question contains cues for both directions.
_DIRECTION_KEYWORDS = {
    # "exceed" accepts its inflected forms; a bare \bexceed\b would reject
    # "exceeds" / "exceeded" because of the trailing word boundary.
    "greater": (
        r"\bhigher\b|\bmore\b|\bgreater\b|\blarger\b|"
        r"\bexceed(?:s|ed|ing)?\b|\babove\b"
    ),
    "less": r"\blower\b|\bless\b|\bsmaller\b|\bbelow\b|\bunder\b|\bfewer\b",
}

# Group-comparison test keys.  route() leaves the alternative untouched for
# these instead of inferring "greater" / "less" from the question wording.
_TWO_SAMPLE_KEYS = {
    "anova",
    "kruskal_wallis",
    "mann_whitney",
    "two_sample_ttest",
}

# ---------------------------------------------------------------------------
# Backend
# ---------------------------------------------------------------------------



# [docs]
class FallbackBackend(LLMBackend):
    """
    Pure regex routing — no LLM, no internet, no dependencies.

    Accuracy is lower than an LLM but it always works offline and is
    extremely fast.  Use it for quick experiments or when no LLM is
    available.
    """

    name = "fallback"

    # Test keys for which both selected columns are numeric (an x/y pair).
    _CORRELATION_TESTS = frozenset({"pearson", "spearman", "point_biserial"})
    # Test keys for which both selected columns are categorical.
    _CATEGORICAL_TESTS = frozenset({"chi_square", "fisher"})

    # First standalone number in the question: optional sign, then "12",
    # "12.", "12.5" or ".5".  The lookbehind rejects digits glued to a word
    # or to a preceding decimal point (the "1" in "q1_score") and stops a
    # word-joining hyphen ("group-5") from being read as a minus sign.
    _MU_PATTERN = re.compile(r"(?<![\w.])[-+]?(?:\d+(?:\.\d*)?|\.\d+)")

    def chat(self, messages: List[Dict[str, str]]) -> str:
        """
        The fallback backend does not call an LLM.
        ``route()`` is overridden directly instead.

        Returns:
            Always the empty string; ``messages`` is ignored.
        """
        return ""

    @staticmethod
    def _pick_pair(
        mentioned: List[str],
        available: List[str],
    ) -> Tuple[Optional[str], Optional[str]]:
        """
        Choose two columns, preferring the ones named in the question.

        Args:
            mentioned: Columns found in the question text, in schema order.
            available: Every candidate column of the required kind.

        Returns:
            ``(first, second)``; either element is ``None`` when there are
            not enough candidates to fill it.
        """
        if len(mentioned) >= 2:
            return mentioned[0], mentioned[1]
        if len(mentioned) == 1:
            first = mentioned[0]
            # Complete the pair with the first candidate that is not `first`.
            return first, next((c for c in available if c != first), None)
        first = available[0] if available else None
        second = available[1] if len(available) > 1 else None
        return first, second

    def route(
        self,
        question: str,
        schema: SchemaInfo,
        extra_context: str = "",
        warn_fallback: bool = True,
    ) -> RoutingResult:
        """
        Bypass LLM and route via regex rules.

        Args:
            question: Natural-language question to route.
            schema: Dataset schema; its ``columns``, ``numerics`` and
                ``categoricals`` are used to pick column names.
            extra_context: Not used by the regex router.
            warn_fallback: When true, emit a ``UserWarning`` explaining that
                the regex fallback is in use.

        Returns:
            A ``RoutingResult`` with ``test``, ``alternative``, the column
            fields, ``reasoning``, ``confidence`` (fixed at 0.6) and
            ``routing_source`` ("fallback") filled in.  ``mu`` is set only
            for a one-sample t-test whose question contains a number.
        """
        import warnings

        if warn_fallback:
            warnings.warn(
                f'\n[HypoTestX] Using built-in regex fallback to route: "{question}"\n'
                "  Confidence is limited (~0.6). For better accuracy use a real LLM backend:\n"
                '    hx.analyze(df, question, backend="gemini", api_key="...")\n'
                '    hx.analyze(df, question, backend="ollama")  # free, offline\n'
                "  Suppress this with: warn_fallback=False",
                UserWarning,
                # NOTE(review): presumably chosen so the warning is attributed
                # to the user's call site — confirm against the call chain.
                stacklevel=4,
            )
        r = RoutingResult()
        lower = question.lower()

        # ── Detect test type ────────────────────────────────────────────
        # First matching row of the keyword table wins.
        for pattern, test, default_alt in _TESTS_BY_KEYWORD:
            if re.search(pattern, lower):
                r.test = test
                r.alternative = default_alt
                break

        if not r.test:
            r.test = "two_sample_ttest"  # safest default

        # ── Detect direction ────────────────────────────────────────────
        # Skip for two-sample tests: group ordering is alphabetical (from
        # sorted unique values), so "greater" / "less" would be misleading
        # unless the first-mentioned group happens to be first alphabetically.
        if r.test not in _TWO_SAMPLE_KEYS:
            for direction, regex in _DIRECTION_KEYWORDS.items():
                if re.search(regex, lower):
                    r.alternative = direction
                    break

        # ── Detect mu for one-sample ────────────────────────────────────
        if r.test == "one_sample_ttest":
            m = self._MU_PATTERN.search(question)
            if m:
                r.mu = float(m.group(0))

        # ── Map question words to schema columns ────────────────────────
        r.value_column, r.group_column = _match_columns(lower, schema)

        if r.test in self._CORRELATION_TESTS:
            # Both columns must be numeric: take them from the numerics named
            # in the question, topping up from the schema when fewer than two
            # were named.
            r.x_column, r.y_column = self._pick_pair(
                _mentioned_numerics(lower, schema),
                list(schema.numerics.keys()),
            )
            r.value_column = r.x_column
            r.group_column = r.y_column
        elif r.test in self._CATEGORICAL_TESTS:
            # Both columns must be categorical; same selection rule.
            r.x_column, r.y_column = self._pick_pair(
                _mentioned_categoricals(lower, schema),
                list(schema.categoricals.keys()),
            )
            r.group_column = r.x_column
            r.value_column = r.y_column
        else:
            # x is the grouping column when there is one, otherwise the value
            # column; y is the value column unless x already holds it.
            r.x_column = r.group_column or r.value_column
            r.y_column = r.value_column if r.x_column != r.value_column else None

        r.reasoning = "(routed by regex fallback — no LLM used)"
        r.confidence = 0.6
        r.routing_source = "fallback"
        return r




# ---------------------------------------------------------------------------
# Column matching helpers
# ---------------------------------------------------------------------------


def _column_mentioned(column: str, question_lower: str) -> bool:
    """
    Return True when *column* appears in the question as a whole token.

    Both the lower-cased name and its spelling with underscores replaced by
    spaces are tried.  A plain substring test is deliberately avoided: it
    makes a column called ``age`` match the word "average".  A trailing
    "s" / "es" is tolerated so that "incomes" still matches ``income``.

    Args:
        column: Schema column name.  Coerced with ``str()`` so that
            non-string labels do not raise.
        question_lower: The question text, already lower-cased.
    """
    lowered = str(column).lower()
    for variant in (lowered, lowered.replace("_", " ")):
        if not variant.strip():
            # An empty/blank name would match everywhere; treat as absent.
            continue
        pattern = rf"(?<!\w){re.escape(variant)}(?:e?s)?(?!\w)"
        if re.search(pattern, question_lower):
            return True
    return False


def _match_columns(
    question_lower: str,
    schema: SchemaInfo,
) -> Tuple[Optional[str], Optional[str]]:
    """
    Heuristically assign value_column (numeric) and group_column (categorical)
    by finding schema column names that appear in the question text.

    Args:
        question_lower: The question text, already lower-cased.
        schema: Object exposing ``columns`` (iterable of names) plus
            ``numerics`` and ``categoricals`` (mappings keyed by name).

    Returns:
        ``(value_column, group_column)``.  Each is the first mentioned column
        of its kind, falling back to the first numeric / first categorical
        column of the schema, or ``None`` when the schema has none.
    """
    value_col: Optional[str] = None
    group_col: Optional[str] = None

    for col in schema.columns:
        if not _column_mentioned(col, question_lower):
            continue
        # A column listed as both numeric and categorical counts as numeric.
        if col in schema.numerics:
            if value_col is None:
                value_col = col
        elif col in schema.categoricals:
            if group_col is None:
                group_col = col

    # If nothing matched explicitly, use first numeric / first categorical
    if value_col is None and schema.numerics:
        value_col = next(iter(schema.numerics.keys()))
    if group_col is None and schema.categoricals:
        group_col = next(iter(schema.categoricals.keys()))

    return value_col, group_col


def _mentioned_numerics(question_lower: str, schema: SchemaInfo) -> List[str]:
    """
    Return all numeric schema columns whose name appears in the question.

    Results follow the order of ``schema.columns``.
    """
    return [
        col
        for col in schema.columns
        if col in schema.numerics and _column_mentioned(col, question_lower)
    ]


def _mentioned_categoricals(question_lower: str, schema: SchemaInfo) -> List[str]:
    """
    Return all categorical schema columns whose name appears in the question.

    Results follow the order of ``schema.columns``.
    """
    return [
        col
        for col in schema.columns
        if col in schema.categoricals and _column_mentioned(col, question_lower)
    ]