Source code for hypotestx.core.llm.backends.fallback

"""
Regex/rule-based fallback backend — zero dependencies, zero API calls.

Used automatically when no LLM backend is specified.
Handles the most common question patterns with reasonable accuracy.
For production use or complex questions, switch to a real LLM backend.
"""

from __future__ import annotations

import re
from typing import Dict, List, Optional, Tuple

from ..base import LLMBackend, RoutingResult, SchemaInfo

# ---------------------------------------------------------------------------
# Pattern tables
# ---------------------------------------------------------------------------

_TESTS_BY_KEYWORD = [
    # (regex, test_key, default_alternative)
    #
    # Searched in order against the lower-cased question; the FIRST pattern
    # that matches wins.  Narrow patterns (explicit test names) therefore have
    # to be listed before broader patterns that would also match the same text.
    #
    # Rank correlation — must precede Pearson: "spearman correlation" and
    # "rank correlation" both contain "correlat".
    (r"rank.*corr|spearman|monotone|ordinal corr", "spearman", "two-sided"),
    # Fisher's exact test — must precede the generic "related to" /
    # "association" patterns below, which typical Fisher questions also contain.
    (r"fisher", "fisher", "two-sided"),
    # Linear correlation
    (
        r"correlat|related to|linear relationship|predict.*from|"
        r"association.*numeric|scatter",
        "pearson",
        "two-sided",
    ),
    # Categorical association
    # "independent" is not allowed to match when it is directly followed by
    # "sample(s)": "independent samples" describes a two-sample comparison and
    # is handled by the two-sample entry at the bottom of the table.
    (
        r"association|independent(?!\W+samples?\b)|chi.square|chi2|contingency|"
        r"relationship.*categor|categor.*relationship",
        "chi_square",
        "two-sided",
    ),
    # One-sample
    (
        r"mean.*equal|equal.*mean|mean.*differ.*\d|differ.*\d.*mean|"
        r"average.*\d|\d.*average|test.*mean|mean.*test|"
        r"is.*mean|population mean",
        "one_sample_ttest",
        "two-sided",
    ),
    # Paired
    (
        r"before.*after|pre.*post|paired|repeated.*measure|"
        r"within.*subject|change over time|same.*subject",
        "paired_ttest",
        "two-sided",
    ),
    # Kruskal-Wallis — must precede ANOVA, whose "across.*group" /
    # "among.*group" alternatives also match most multi-group questions.
    (r"kruskal|non.param.*group|group.*non.param", "kruskal_wallis", "two-sided"),
    # ANOVA / multi-group
    (
        r"anova|more than two|three.*group|multiple.*group|"
        r"several.*group|across.*group|among.*group",
        "anova",
        "two-sided",
    ),
    # Two-sample (broadest patterns, so it stays last)
    (
        r"compar.*mean|mean.*compar|differ.*group|group.*differ|"
        r"between.*group|higher.*than|lower.*than|more.*than.*less.*than|"
        r"male.*female|female.*male|group.*a.*group.*b|"
        r"two.*group|independen.*sample",
        "two_sample_ttest",
        "two-sided",
    ),
]

# One-sided alternative label -> regex of whole-word cues for that direction.
# Each cue word is wrapped in \b...\b and the cues are OR-ed together.
# Insertion order matters to callers that iterate and stop at the first hit:
# "greater" is checked before "less".
_DIRECTION_KEYWORDS = {
    label: "|".join(rf"\b{cue}\b" for cue in cues)
    for label, cues in (
        ("greater", ("higher", "more", "greater", "larger", "exceed", "above")),
        ("less", ("lower", "less", "smaller", "below", "under", "fewer")),
    )
}

# Test keys that compare groups; used for membership checks only.
_TWO_SAMPLE_KEYS = {
    "two_sample_ttest",
    "mann_whitney",
    "anova",
    "kruskal_wallis",
}

# ---------------------------------------------------------------------------
# Backend
# ---------------------------------------------------------------------------


class FallbackBackend(LLMBackend):
    """
    Pure regex routing — no LLM, no internet, no dependencies.

    Accuracy is lower than an LLM but it always works offline and is
    extremely fast. Use it for quick experiments or when no LLM is available.
    """

    name = "fallback"

    # Test keys whose x/y columns must both be numeric / both be categorical.
    _CORRELATION_TESTS = frozenset({"pearson", "spearman", "point_biserial"})
    _CATEGORICAL_TESTS = frozenset({"chi_square", "fisher"})

    # First standalone number in a question: optional sign, integer or
    # decimal (including ".5").  The look-behind rejects digits glued to a
    # word character or dot, so "score_2" or "v1.5" are not read as a number,
    # and a hyphen only counts as a minus sign when it does not follow a word.
    _NUMBER_RE = re.compile(r"(?<![\w.])[-+]?(?:\d+\.?\d*|\.\d+)")

    def chat(self, messages: List[Dict[str, str]]) -> str:
        """
        The fallback backend does not call an LLM.

        ``route()`` is overridden directly instead, so this method ignores
        ``messages`` and always returns an empty string.
        """
        return ""

    def route(
        self,
        question: str,
        schema: SchemaInfo,
        extra_context: str = "",
        warn_fallback: bool = True,
    ) -> RoutingResult:
        """
        Bypass LLM and route via regex rules.

        Args:
            question: Natural-language question; matched case-insensitively.
            schema: Column description; ``columns``, ``numerics`` and
                ``categoricals`` are read to choose columns.
            extra_context: Accepted for signature compatibility; not used here.
            warn_fallback: When true, emit a ``UserWarning`` explaining that
                the regex fallback is in use.

        Returns:
            A ``RoutingResult`` with ``test``, ``alternative``, the column
            fields, ``reasoning``, ``confidence`` (fixed at 0.6) and
            ``routing_source`` filled in; ``mu`` is set only for a one-sample
            t-test whose question contains a number.
        """
        import warnings

        if warn_fallback:
            warnings.warn(
                f'\n[HypoTestX] Using built-in regex fallback to route: "{question}"\n'
                " Confidence is limited (~0.6). For better accuracy use a real LLM backend:\n"
                ' hx.analyze(df, question, backend="gemini", api_key="...")\n'
                ' hx.analyze(df, question, backend="ollama") # free, offline\n'
                " Suppress this with: warn_fallback=False",
                UserWarning,
                stacklevel=4,
            )

        r = RoutingResult()
        lower = question.lower()

        # ── Detect test type ────────────────────────────────────────────
        # The table is ordered; the first matching pattern decides.
        for pattern, test, default_alt in _TESTS_BY_KEYWORD:
            if re.search(pattern, lower):
                r.test = test
                r.alternative = default_alt
                break
        if not r.test:
            r.test = "two_sample_ttest"  # safest default

        # ── Detect direction ────────────────────────────────────────────
        # Skip for two-sample tests: group ordering is alphabetical (from
        # sorted unique values), so "greater" / "less" would be misleading
        # unless the first-mentioned group happens to be first alphabetically.
        if r.test not in _TWO_SAMPLE_KEYS:
            for direction, regex in _DIRECTION_KEYWORDS.items():
                if re.search(regex, lower):
                    r.alternative = direction
                    break

        # ── Detect mu for one-sample ────────────────────────────────────
        if r.test == "one_sample_ttest":
            mu = self._extract_mu(question)
            if mu is not None:
                r.mu = mu

        # ── Map question words to schema columns ────────────────────────
        r.value_column, r.group_column = _match_columns(lower, schema)

        if r.test in self._CORRELATION_TESTS:
            # For correlation, x and y should both be numeric columns:
            # prefer the ones named in the question, then fall back to the
            # schema's numeric columns.
            r.x_column, r.y_column = self._pick_pair(
                _mentioned_numerics(lower, schema),
                list(schema.numerics.keys()),
            )
            r.value_column = r.x_column
            r.group_column = r.y_column
        elif r.test in self._CATEGORICAL_TESTS:
            # Both columns should be categorical.
            r.x_column, r.y_column = self._pick_pair(
                _mentioned_categoricals(lower, schema),
                list(schema.categoricals.keys()),
            )
            r.group_column = r.x_column
            r.value_column = r.y_column
        else:
            r.x_column = r.group_column or r.value_column
            r.y_column = r.value_column if r.x_column != r.value_column else None

        r.reasoning = "(routed by regex fallback — no LLM used)"
        r.confidence = 0.6
        r.routing_source = "fallback"
        return r

    @classmethod
    def _extract_mu(cls, question: str) -> Optional[float]:
        """Return the first standalone (optionally signed) number in *question*, or None."""
        found = cls._NUMBER_RE.search(question)
        return float(found.group(0)) if found else None

    @staticmethod
    def _pick_pair(
        mentioned: List[str],
        available: List[str],
    ) -> Tuple[Optional[str], Optional[str]]:
        """
        Choose an (x, y) column pair.

        Uses the first two *mentioned* columns when there are at least two;
        with exactly one, pairs it with the first *available* column that
        differs from it; with none, takes the first two *available* columns.
        Missing slots are ``None``.
        """
        if len(mentioned) >= 2:
            return mentioned[0], mentioned[1]
        if mentioned:
            first = mentioned[0]
            return first, next((c for c in available if c != first), None)
        first = available[0] if available else None
        second = available[1] if len(available) > 1 else None
        return first, second
# ---------------------------------------------------------------------------
# Column matching helpers
# ---------------------------------------------------------------------------


def _is_mentioned(col, question_lower: str) -> bool:
    """
    Return True when column *col* is named in *question_lower*.

    The column name is lower-cased and tried both as written and with
    underscores replaced by spaces.  A match must START at a word boundary,
    so column "age" is found in "age" or "ages" but not inside "average" or
    "percentage"; prefixes and plurals of the name still match.  Empty names
    never match, and non-string labels are compared via ``str()``.
    """
    name = str(col).lower()
    for variant in (name, name.replace("_", " ")):
        if not variant.strip():
            continue
        if re.search(r"(?<!\w)" + re.escape(variant), question_lower):
            return True
    return False


def _match_columns(
    question_lower: str,
    schema: SchemaInfo,
) -> Tuple[Optional[str], Optional[str]]:
    """
    Heuristically assign value_column (numeric) and group_column (categorical)
    by finding schema column names that appear in the question text.

    Returns ``(value_column, group_column)``.  Each is the first mentioned
    column of its kind in schema order; when none is mentioned it falls back
    to the first numeric / first categorical column, or ``None`` if the
    schema has none.
    """
    mentioned_numeric = []
    mentioned_categ = []
    for col in schema.columns:
        if not _is_mentioned(col, question_lower):
            continue
        # A column listed under both kinds is treated as numeric only.
        if col in schema.numerics:
            mentioned_numeric.append(col)
        elif col in schema.categoricals:
            mentioned_categ.append(col)

    value_col = mentioned_numeric[0] if mentioned_numeric else None
    group_col = mentioned_categ[0] if mentioned_categ else None

    # If nothing matched explicitly, use first numeric / first categorical
    if value_col is None and schema.numerics:
        value_col = next(iter(schema.numerics.keys()))
    if group_col is None and schema.categoricals:
        group_col = next(iter(schema.categoricals.keys()))
    return value_col, group_col


def _mentioned_numerics(question_lower: str, schema: SchemaInfo) -> List[str]:
    """Return all numeric schema columns whose name appears in the question (schema order)."""
    return [
        col
        for col in schema.columns
        if col in schema.numerics and _is_mentioned(col, question_lower)
    ]


def _mentioned_categoricals(question_lower: str, schema: SchemaInfo) -> List[str]:
    """Return all categorical schema columns whose name appears in the question (schema order)."""
    return [
        col
        for col in schema.columns
        if col in schema.categoricals and _is_mentioned(col, question_lower)
    ]