# Source code for hypotestx.tests.categorical

"""
Categorical statistical tests — pure Python implementations.

Tests
-----
- Chi-Square Test of Independence / Goodness-of-Fit
- Fisher's Exact Test  (2×2 tables)
"""

from typing import List, Optional, Union

from ..core.exceptions import DataFormatError
from ..core.result import HypoResult
from ..core.validators import validate_alpha, validate_alternative, validate_contingency_table
from ..math.basic import sqrt
from ..math.distributions import ChiSquare

# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _build_expected(observed: List[List[float]]) -> List[List[float]]:
    """Compute expected cell frequencies from row/column marginals."""
    nrows = len(observed)
    ncols = len(observed[0])
    row_sums = [sum(observed[r]) for r in range(nrows)]
    col_sums = [sum(observed[r][c] for r in range(nrows)) for c in range(ncols)]
    total = sum(row_sums)
    if total == 0:
        raise DataFormatError(
            "Contingency table is all zeros — cannot compute expected frequencies"
        )
    return [[row_sums[r] * col_sums[c] / total for c in range(ncols)] for r in range(nrows)]


def _hypergeom_pmf(k: int, N: int, K: int, n: int) -> float:
    """
    Hypergeometric PMF: P(X = k)

    Drawing *n* items from a population of *N* where *K* are successes.
    Uses log-space arithmetic to avoid overflow with large factorials.
    """

    def log_binom(a: int, b: int) -> float:
        """log C(a, b) using log-gamma"""
        if b < 0 or b > a:
            return float("-inf")
        if b == 0 or b == a:
            return 0.0
        # log C(a,b) = lgamma(a+1) - lgamma(b+1) - lgamma(a-b+1)
        import math

        return math.lgamma(a + 1) - math.lgamma(b + 1) - math.lgamma(a - b + 1)

    import math

    log_p = log_binom(K, k) + log_binom(N - K, n - k) - log_binom(N, n)
    if log_p == float("-inf"):
        return 0.0
    return math.exp(log_p)


# ---------------------------------------------------------------------------
# Chi-Square Test
# ---------------------------------------------------------------------------


def chi_square_test(
    observed: Union[List[List[float]], List[float]],
    expected: Optional[List[float]] = None,
    alpha: float = 0.05,
    correction: bool = False,
) -> HypoResult:
    """
    Chi-square test of independence (2-D table) or goodness-of-fit (1-D).

    For a 2-D contingency table the test checks whether row and column
    variables are independent.  For a 1-D array of observed counts it checks
    whether they follow the specified (or uniform) expected distribution.

    Args:
        observed:   2-D contingency table *or* 1-D list of observed counts.
                    The input is treated as 1-D when its first element is
                    not a list or tuple.
        expected:   Expected counts for each category (1-D case only).
                    If None, a uniform expected distribution is assumed.
                    If the expected counts do not sum to the observed total
                    they are rescaled proportionally to match it.
        alpha:      Significance level
        correction: Apply Yates' continuity correction (2×2 tables only).
                    Each |observed - expected| is reduced by 0.5 but never
                    below zero, so the correction can only shrink chi2.

    Returns:
        HypoResult with statistic=chi2 and effect_size=Cohen's w
        (goodness-of-fit) or Cramer's V / phi for 2×2 (independence).

    Raises:
        DataFormatError: in the 1-D case, if there are fewer than two
            categories, the observed total is not positive, ``expected`` has
            the wrong length, or ``expected`` does not sum to a positive
            value.

    Warns:
        UserWarning: if any expected frequency is below 5.

    Examples:
        >>> # 2-D contingency table
        >>> table = [[30, 10], [20, 40]]
        >>> result = chi_square_test(table)

        >>> # Goodness-of-fit
        >>> result = chi_square_test([50, 30, 20], expected=[40, 30, 30])
    """
    import warnings

    validate_alpha(alpha)

    # Determine if 1-D or 2-D input
    is_1d = bool(observed) and not isinstance(observed[0], (list, tuple))

    if is_1d:
        # ── Goodness-of-fit ────────────────────────────────────────────────
        observed_1d = [float(v) for v in observed]
        k = len(observed_1d)
        if k < 2:
            raise DataFormatError("Goodness-of-fit test requires at least 2 categories")
        total = sum(observed_1d)
        if total <= 0:
            raise DataFormatError("Total observed count must be positive")

        if expected is None:
            expected_1d = [total / k] * k
        else:
            if len(expected) != k:
                raise DataFormatError(
                    f"expected must have the same length as observed ({k}), got {len(expected)}"
                )
            expected_1d = [float(v) for v in expected]
            exp_total = sum(expected_1d)
            # A non-positive sum cannot be rescaled: zero would divide by
            # zero and a negative sum would flip the sign of every count.
            if exp_total <= 0:
                raise DataFormatError("Total expected count must be positive")
            if abs(exp_total - total) > 1e-6:
                # re-scale expected to match observed total
                expected_1d = [e * total / exp_total for e in expected_1d]

        # Check expected frequencies
        if any(e < 5 for e in expected_1d):
            warnings.warn(
                "Some expected frequencies are less than 5; chi-square approximation may be inaccurate."  # noqa: E501
            )

        # Categories with a non-positive expected count are skipped.
        chi2 = sum((o - e) ** 2 / e for o, e in zip(observed_1d, expected_1d) if e > 0)
        df = k - 1
        n = total
        nrows, ncols = 1, k

        # Effect size: w = sqrt(chi2 / n)
        effect_size = sqrt(chi2 / n) if n > 0 else None
        effect_name = "Cohen's w"

        observed_summary = observed_1d
        expected_summary = expected_1d
    else:
        # ── Test of independence ───────────────────────────────────────────
        table_2d = validate_contingency_table(observed)
        nrows = len(table_2d)
        ncols = len(table_2d[0])
        expected_2d = _build_expected(table_2d)
        n = sum(table_2d[r][c] for r in range(nrows) for c in range(ncols))
        cells = [(r, c) for r in range(nrows) for c in range(ncols)]

        # Check expected frequencies
        small_cells = sum(1 for r, c in cells if expected_2d[r][c] < 5)
        if small_cells > 0:
            warnings.warn(
                f"{small_cells} cell(s) have expected frequency < 5; "
                "consider Fisher's exact test for 2×2 tables."
            )

        # Yates' continuity correction (2×2 only)
        if correction and nrows == 2 and ncols == 2:
            # Clamp at zero: a deviation smaller than 0.5 must shrink to 0,
            # not turn into a positive squared term.
            chi2 = sum(
                max(0.0, abs(table_2d[r][c] - expected_2d[r][c]) - 0.5) ** 2
                / expected_2d[r][c]
                for r, c in cells
                if expected_2d[r][c] > 0
            )
        else:
            chi2 = sum(
                (table_2d[r][c] - expected_2d[r][c]) ** 2 / expected_2d[r][c]
                for r, c in cells
                if expected_2d[r][c] > 0
            )

        df = (nrows - 1) * (ncols - 1)

        # Effect size: Cramer's V (phi for 2×2), capped at 1.0
        min_dim = min(nrows - 1, ncols - 1)
        if n > 0 and min_dim > 0:
            effect_size = min(sqrt(chi2 / (n * min_dim)), 1.0)
        else:
            effect_size = None
        effect_name = "phi" if (nrows == 2 and ncols == 2) else "Cramer's V"

        observed_summary = table_2d
        expected_summary = expected_2d

    # p-value from chi-square distribution (upper tail), clamped to [0, 1]
    chi2_dist = ChiSquare(df)
    p_value = max(0.0, min(1.0, 1 - chi2_dist.cdf(chi2)))

    data_summary = {
        "observed": observed_summary,
        "expected": expected_summary,
        "n_total": n,
        "n_rows": nrows,
        "n_cols": ncols,
    }

    is_significant = p_value < alpha
    significance = "significant" if is_significant else "not significant"
    if is_1d:
        interpretation = (
            f"The chi-square goodness-of-fit test is {significance} "
            f"(chi2({df}) = {chi2:.4f}, p = {p_value:.4f}). "
            + (
                "The observed distribution differs significantly from expected."
                if is_significant
                else "No significant deviation from expected distribution."
            )
        )
    else:
        interpretation = (
            f"The chi-square test of independence is {significance} "
            f"(chi2({df}) = {chi2:.4f}, p = {p_value:.4f}). "
            + (
                "The row and column variables are significantly associated."
                if is_significant
                else "No significant association found between the variables."
            )
        )

    return HypoResult(
        test_name="Chi-Square Test" + (" of Independence" if not is_1d else " (Goodness-of-Fit)"),
        statistic=chi2,
        p_value=p_value,
        effect_size=effect_size,
        effect_size_name=effect_name,
        confidence_interval=None,
        degrees_of_freedom=df,
        sample_sizes=int(n),
        alpha=alpha,
        alternative="two-sided",
        interpretation=interpretation,
        data_summary=data_summary,
    )
# ---------------------------------------------------------------------------
# Fisher's Exact Test
# ---------------------------------------------------------------------------
def fisher_exact_test(
    table: List[List[float]],
    alpha: float = 0.05,
    alternative: str = "two-sided",
) -> HypoResult:
    """
    Fisher's Exact Test for a 2×2 contingency table.

    Computes the exact p-value using the hypergeometric distribution.
    Preferred over the chi-square test when any expected cell count is < 5.

    Args:
        table:       2×2 contingency table [[a, b], [c, d]].  Cell values
                     are converted with ``int()``, so any fractional part
                     is discarded.
        alpha:       Significance level
        alternative: 'two-sided', 'greater' (more association than expected),
                     or 'less'

    Returns:
        HypoResult with statistic=odds_ratio.  When ``b`` or ``c`` is zero
        the odds ratio is computed with 0.5 added to every cell to avoid
        division by zero.

    Raises:
        DataFormatError: if the table is not 2×2 or its total count is zero.

    Examples:
        >>> table = [[8, 2], [1, 5]]
        >>> result = fisher_exact_test(table)
        >>> print(result.p_value)
    """
    table = validate_contingency_table(table)
    validate_alpha(alpha)
    validate_alternative(alternative)

    if len(table) != 2 or len(table[0]) != 2:
        raise DataFormatError(
            "Fisher's Exact Test requires a 2×2 contingency table, "
            f"got {len(table)}×{len(table[0])}"
        )

    a, b = int(table[0][0]), int(table[0][1])
    c, d = int(table[1][0]), int(table[1][1])

    R1 = a + b  # row 1 total
    R2 = c + d  # row 2 total
    C1 = a + c  # col 1 total
    C2 = b + d  # col 2 total
    N = R1 + R2

    if N == 0:
        raise DataFormatError("Contingency table total count is zero")

    # Possible values for cell (0,0) given fixed marginals
    a_min = max(0, R1 - C2)
    a_max = min(R1, C1)

    # Probability of every table that shares the observed marginals
    pmf = {k: _hypergeom_pmf(k, N, C1, R1) for k in range(a_min, a_max + 1)}

    p_observed = pmf.get(a, 0.0)

    if alternative == "two-sided":
        # Sum all tables no more likely than the observed one.  The
        # tolerance is relative: an absolute epsilon would wrongly include
        # every table whose probability is merely small when p_observed
        # itself is far below that epsilon.
        threshold = p_observed * (1.0 + 1e-7)
        p_value = sum(p for p in pmf.values() if p <= threshold)
    elif alternative == "greater":
        p_value = sum(p for k, p in pmf.items() if k >= a)
    else:  # less
        p_value = sum(p for k, p in pmf.items() if k <= a)

    # Guard against floating-point drift outside [0, 1]
    p_value = max(0.0, min(1.0, p_value))

    # Odds ratio (with Haldane-Anscombe correction for zeros)
    if b == 0 or c == 0:
        # Apply small continuity correction to avoid division by zero
        odds_ratio = ((a + 0.5) * (d + 0.5)) / ((b + 0.5) * (c + 0.5))
    else:
        odds_ratio = (a * d) / (b * c)

    data_summary = {
        "table": [[a, b], [c, d]],
        "row1_total": R1,
        "row2_total": R2,
        "col1_total": C1,
        "col2_total": C2,
        "N_total": N,
        "odds_ratio": odds_ratio,
        "p_observed_table": p_observed,
    }

    is_significant = p_value < alpha
    significance = "significant" if is_significant else "not significant"
    interpretation = (
        f"Fisher's Exact test is {significance} "
        f"(odds ratio = {odds_ratio:.4f}, p = {p_value:.4f}). "
        + (
            "A significant association exists between the two categorical variables."
            if is_significant
            else "No significant association found between the two categorical variables."
        )
    )

    return HypoResult(
        test_name="Fisher's Exact Test",
        statistic=odds_ratio,
        p_value=p_value,
        effect_size=odds_ratio,
        effect_size_name="odds ratio",
        confidence_interval=None,
        degrees_of_freedom=None,
        sample_sizes=N,
        alpha=alpha,
        alternative=alternative,
        interpretation=interpretation,
        data_summary=data_summary,
    )