Source code for hypotestx.tests.correlation

"""
Correlation tests — pure Python implementations.

Tests
-----
- Pearson product-moment correlation
- Spearman rank-order correlation
- Point-biserial correlation
"""

from typing import List

from ..core.exceptions import DataFormatError, InsufficientDataError
from ..core.result import HypoResult
from ..core.validators import (
    validate_alpha,
    validate_alternative,
    validate_data,
    validate_paired_data,
)
from ..math.basic import abs_value, sqrt
from ..math.distributions import StudentT
from ..math.statistics import mean, std

# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _rank_data(data: List[float]) -> List[float]:
    """Assign average ranks to a list (ties get mean rank)."""
    n = len(data)
    indexed = sorted(enumerate(data), key=lambda x: x[1])
    ranks = [0.0] * n
    i = 0
    while i < n:
        j = i
        while j < n - 1 and indexed[j][1] == indexed[j + 1][1]:
            j += 1
        avg_rank = (i + j + 2) / 2.0
        for k in range(i, j + 1):
            ranks[indexed[k][0]] = avg_rank
        i = j + 1
    return ranks


def _pearson_r(x: List[float], y: List[float]) -> float:
    """Compute Pearson r between two equal-length lists of floats."""
    len(x)
    x_mean = mean(x)
    y_mean = mean(y)

    num = sum((xi - x_mean) * (yi - y_mean) for xi, yi in zip(x, y))
    den_x = sum((xi - x_mean) ** 2 for xi in x)
    den_y = sum((yi - y_mean) ** 2 for yi in y)

    denom = sqrt(den_x * den_y)
    if denom == 0.0:
        return 0.0
    return num / denom


def _r_to_pvalue(r: float, n: int, alternative: str) -> tuple:
    """Convert Pearson r to a p-value using the t-distribution."""
    if abs_value(r) >= 1.0:
        return 0.0, float("inf") * (1 if r >= 0 else -1)

    df = n - 2
    t_stat = r * sqrt(df) / sqrt(1 - r**2)
    t_dist = StudentT(df)

    if alternative == "two-sided":
        p = 2 * (1 - t_dist.cdf(abs_value(t_stat)))
    elif alternative == "greater":
        p = 1 - t_dist.cdf(t_stat)
    else:  # less
        p = t_dist.cdf(t_stat)

    return max(0.0, min(1.0, p)), t_stat


# ---------------------------------------------------------------------------
# Pearson Correlation
# ---------------------------------------------------------------------------


[docs] def pearson_correlation( x: List[float], y: List[float], alpha: float = 0.05, alternative: str = "two-sided", ) -> HypoResult: """ Pearson product-moment correlation coefficient and test. Tests whether the linear relationship between *x* and *y* is significantly different from zero. Assumes both variables are approximately continuous and bivariate normal. Args: x: First variable y: Second variable (same length as *x*) alpha: Significance level alternative: 'two-sided', 'greater' (positive correlation), or 'less' Returns: HypoResult with statistic=t, effect_size=r (Pearson r) Examples: >>> result = pearson_correlation([1,2,3,4,5], [2,4,5,4,5]) >>> print(f"r = {result.effect_size:.3f}, p = {result.p_value:.4f}") """ x, y = validate_paired_data(x, y) validate_alpha(alpha) validate_alternative(alternative) n = len(x) if n < 3: raise InsufficientDataError("Pearson correlation requires at least 3 data points") r = _pearson_r(x, y) p_value, t_stat = _r_to_pvalue(r, n, alternative) df = n - 2 df = n - 2 # Fisher's z transformation for confidence interval of r import math if abs_value(r) < 1.0: z_r = 0.5 * math.log((1 + r) / (1 - r)) se_z = 1.0 / sqrt(n - 3) if n > 3 else float("inf") z_crit = 1.96 if alternative == "two-sided" else 1.645 ci_z_low = z_r - z_crit * se_z ci_z_high = z_r + z_crit * se_z r_low = (math.exp(2 * ci_z_low) - 1) / (math.exp(2 * ci_z_low) + 1) r_high = (math.exp(2 * ci_z_high) - 1) / (math.exp(2 * ci_z_high) + 1) ci = (r_low, r_high) else: ci = (r, r) data_summary = { "n": n, "pearson_r": r, "t_statistic": t_stat, "df": df, "x_mean": mean(x), "y_mean": mean(y), "x_std": std(x), "y_std": std(y), } significance = "significant" if p_value < alpha else "not significant" interpretation = ( f"The Pearson correlation is {significance} " f"(r({df}) = {r:.4f}, t = {t_stat:.4f}, p = {p_value:.4f}). " + ( f"There is a {'positive' if r > 0 else 'negative'} linear relationship between the variables." # noqa: E501 if p_value < alpha else "No significant linear relationship detected." ) ) return HypoResult( test_name="Pearson Correlation", statistic=t_stat, p_value=p_value, effect_size=r, effect_size_name="r", confidence_interval=ci, degrees_of_freedom=df, sample_sizes=n, alpha=alpha, alternative=alternative, interpretation=interpretation, data_summary=data_summary, )
# --------------------------------------------------------------------------- # Spearman Rank Correlation # ---------------------------------------------------------------------------
[docs] def spearman_correlation( x: List[float], y: List[float], alpha: float = 0.05, alternative: str = "two-sided", ) -> HypoResult: """ Spearman rank-order correlation coefficient and test. A non-parametric measure of monotonic association. Computed by ranking both variables and calculating their Pearson correlation. Args: x: First variable y: Second variable (same length as *x*) alpha: Significance level alternative: 'two-sided', 'greater', or 'less' Returns: HypoResult with statistic=t, effect_size=rho (Spearman rho) Examples: >>> result = spearman_correlation([3,1,4,1,5], [9,2,6,5,3]) >>> print(f"rho = {result.effect_size:.3f}") """ x, y = validate_paired_data(x, y) validate_alpha(alpha) validate_alternative(alternative) n = len(x) if n < 3: raise InsufficientDataError("Spearman correlation requires at least 3 data points") # Compute ranks x_ranks = _rank_data(x) y_ranks = _rank_data(y) # Spearman rho is Pearson r on the ranks rho = _pearson_r(x_ranks, y_ranks) p_value, t_stat = _r_to_pvalue(rho, n, alternative) df = n - 2 data_summary = { "n": n, "spearman_rho": rho, "t_statistic": t_stat, "df": df, } significance = "significant" if p_value < alpha else "not significant" interpretation = ( f"The Spearman correlation is {significance} " f"(rho({df}) = {rho:.4f}, t = {t_stat:.4f}, p = {p_value:.4f}). " + ( f"There is a {'positive' if rho > 0 else 'negative'} monotonic relationship between the variables." # noqa: E501 if p_value < alpha else "No significant monotonic relationship detected." ) ) return HypoResult( test_name="Spearman Rank Correlation", statistic=t_stat, p_value=p_value, effect_size=rho, effect_size_name="r", confidence_interval=None, degrees_of_freedom=df, sample_sizes=n, alpha=alpha, alternative=alternative, interpretation=interpretation, data_summary=data_summary, )
# --------------------------------------------------------------------------- # Point-Biserial Correlation # ---------------------------------------------------------------------------
[docs] def point_biserial_correlation( continuous: List[float], binary: List, alpha: float = 0.05, alternative: str = "two-sided", ) -> HypoResult: """ Point-biserial correlation between a continuous and a dichotomous variable. Equivalent to Pearson correlation when one variable is binary (0/1 coded). Tests whether the mean of the continuous variable differs across the two groups. Args: continuous: Continuous-scale measurements binary: Binary group indicator (must contain exactly 2 unique values) alpha: Significance level alternative: 'two-sided', 'greater', or 'less' Returns: HypoResult with statistic=t, effect_size=r_pb Examples: >>> result = point_biserial_correlation( ... [5.5, 6.1, 7.2, 4.8, 5.9], ... [0, 1, 1, 0, 0] ... ) """ continuous = validate_data(continuous, 2, "continuous") validate_alpha(alpha) validate_alternative(alternative) if len(binary) != len(continuous): raise DataFormatError( f"continuous and binary must have the same length, " f"got {len(continuous)} and {len(binary)}" ) unique_vals = list(set(binary)) if len(unique_vals) != 2: raise DataFormatError( f"binary must contain exactly 2 unique values, got {len(unique_vals)}: {unique_vals}" ) # Encode binary variable as 0/1 lo, hi = sorted(unique_vals) binary_coded = [0.0 if v == lo else 1.0 for v in binary] n = len(continuous) n1 = sum(binary_coded) n0 = n - n1 if n1 == 0 or n0 == 0: raise DataFormatError("Both groups must have at least one observation") group1 = [continuous[i] for i in range(n) if binary_coded[i] == 1.0] group0 = [continuous[i] for i in range(n) if binary_coded[i] == 0.0] M1 = mean(group1) M0 = mean(group0) S_y = std(continuous, ddof=0) # population std for r_pb formula if S_y == 0.0: raise ValueError("Standard deviation of 'continuous' is zero") # Point-biserial r formula r_pb = ((M1 - M0) / S_y) * sqrt((n1 * n0) / (n * n)) p_value, t_stat = _r_to_pvalue(r_pb, n, alternative) df = n - 2 data_summary = { "n": n, "n_group0": int(n0), "n_group1": int(n1), "mean_group0": M0, "mean_group1": M1, "overall_std": S_y, "r_pb": r_pb, "t_statistic": t_stat, "df": df, "group0_label": lo, "group1_label": hi, } significance = "significant" if p_value < alpha else "not significant" interpretation = ( f"The point-biserial correlation is {significance} " f"(r_pb({df}) = {r_pb:.4f}, t = {t_stat:.4f}, p = {p_value:.4f}). " + ( f"The continuous variable differs significantly across the two groups " f"(mean_{hi} = {M1:.3f}, mean_{lo} = {M0:.3f})." if p_value < alpha else "No significant difference in the continuous variable across the two groups." ) ) return HypoResult( test_name="Point-Biserial Correlation", statistic=t_stat, p_value=p_value, effect_size=r_pb, effect_size_name="r", confidence_interval=None, degrees_of_freedom=df, sample_sizes=n, alpha=alpha, alternative=alternative, interpretation=interpretation, data_summary=data_summary, )