Source code for hypotestx.tests.correlation

"""
Correlation tests — pure Python implementations.

Tests
-----
- Pearson product-moment correlation
- Spearman rank-order correlation
- Point-biserial correlation
"""

from typing import List

from ..core.exceptions import DataFormatError, InsufficientDataError
from ..core.result import HypoResult
from ..core.validators import (
    validate_alpha,
    validate_alternative,
    validate_data,
    validate_paired_data,
)
from ..math.basic import abs_value, sqrt
from ..math.distributions import StudentT
from ..math.statistics import mean, std

# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _rank_data(data: List[float]) -> List[float]:
    """Assign average ranks to a list (ties get mean rank)."""
    n = len(data)
    indexed = sorted(enumerate(data), key=lambda x: x[1])
    ranks = [0.0] * n
    i = 0
    while i < n:
        j = i
        while j < n - 1 and indexed[j][1] == indexed[j + 1][1]:
            j += 1
        avg_rank = (i + j + 2) / 2.0
        for k in range(i, j + 1):
            ranks[indexed[k][0]] = avg_rank
        i = j + 1
    return ranks


def _pearson_r(x: List[float], y: List[float]) -> float:
    """Compute Pearson r between two equal-length lists of floats."""
    len(x)
    x_mean = mean(x)
    y_mean = mean(y)

    num = sum((xi - x_mean) * (yi - y_mean) for xi, yi in zip(x, y))
    den_x = sum((xi - x_mean) ** 2 for xi in x)
    den_y = sum((yi - y_mean) ** 2 for yi in y)

    denom = sqrt(den_x * den_y)
    if denom == 0.0:
        return 0.0
    return num / denom


def _r_to_pvalue(r: float, n: int, alternative: str) -> tuple:
    """Convert Pearson r to a p-value using the t-distribution."""
    if abs_value(r) >= 1.0:
        return 0.0, float("inf") * (1 if r >= 0 else -1)

    df = n - 2
    t_stat = r * sqrt(df) / sqrt(1 - r**2)
    t_dist = StudentT(df)

    if alternative == "two-sided":
        p = 2 * (1 - t_dist.cdf(abs_value(t_stat)))
    elif alternative == "greater":
        p = 1 - t_dist.cdf(t_stat)
    else:  # less
        p = t_dist.cdf(t_stat)

    return max(0.0, min(1.0, p)), t_stat


# ---------------------------------------------------------------------------
# Pearson Correlation
# ---------------------------------------------------------------------------



[docs]
def pearson_correlation(
    x: List[float],
    y: List[float],
    alpha: float = 0.05,
    alternative: str = "two-sided",
) -> HypoResult:
    """
    Pearson product-moment correlation coefficient and test.

    Tests whether the linear relationship between *x* and *y* is significantly
    different from zero.  Assumes both variables are approximately continuous
    and bivariate normal.

    Args:
        x: First variable
        y: Second variable (same length as *x*)
        alpha: Significance level
        alternative: 'two-sided', 'greater' (positive correlation), or 'less'

    Returns:
        HypoResult with statistic=t, effect_size=r (Pearson r)

    Examples:
        >>> result = pearson_correlation([1,2,3,4,5], [2,4,5,4,5])
        >>> print(f"r = {result.effect_size:.3f}, p = {result.p_value:.4f}")
    """
    x, y = validate_paired_data(x, y)
    validate_alpha(alpha)
    validate_alternative(alternative)
    n = len(x)

    if n < 3:
        raise InsufficientDataError("Pearson correlation requires at least 3 data points")

    r = _pearson_r(x, y)
    p_value, t_stat = _r_to_pvalue(r, n, alternative)

    df = n - 2
    df = n - 2
    # Fisher's z transformation for confidence interval of r
    import math

    if abs_value(r) < 1.0:
        z_r = 0.5 * math.log((1 + r) / (1 - r))
        se_z = 1.0 / sqrt(n - 3) if n > 3 else float("inf")
        z_crit = 1.96 if alternative == "two-sided" else 1.645
        ci_z_low = z_r - z_crit * se_z
        ci_z_high = z_r + z_crit * se_z
        r_low = (math.exp(2 * ci_z_low) - 1) / (math.exp(2 * ci_z_low) + 1)
        r_high = (math.exp(2 * ci_z_high) - 1) / (math.exp(2 * ci_z_high) + 1)
        ci = (r_low, r_high)
    else:
        ci = (r, r)

    data_summary = {
        "n": n,
        "pearson_r": r,
        "t_statistic": t_stat,
        "df": df,
        "x_mean": mean(x),
        "y_mean": mean(y),
        "x_std": std(x),
        "y_std": std(y),
    }

    significance = "significant" if p_value < alpha else "not significant"
    interpretation = (
        f"The Pearson correlation is {significance} "
        f"(r({df}) = {r:.4f}, t = {t_stat:.4f}, p = {p_value:.4f}). "
        + (
            f"There is a {'positive' if r > 0 else 'negative'} linear relationship between the variables."  # noqa: E501
            if p_value < alpha
            else "No significant linear relationship detected."
        )
    )

    return HypoResult(
        test_name="Pearson Correlation",
        statistic=t_stat,
        p_value=p_value,
        effect_size=r,
        effect_size_name="r",
        confidence_interval=ci,
        degrees_of_freedom=df,
        sample_sizes=n,
        alpha=alpha,
        alternative=alternative,
        interpretation=interpretation,
        data_summary=data_summary,
    )



# ---------------------------------------------------------------------------
# Spearman Rank Correlation
# ---------------------------------------------------------------------------



[docs]
def spearman_correlation(
    x: List[float],
    y: List[float],
    alpha: float = 0.05,
    alternative: str = "two-sided",
) -> HypoResult:
    """
    Spearman rank-order correlation coefficient and test.

    A non-parametric measure of monotonic association.  Computed by ranking
    both variables and calculating their Pearson correlation.

    Args:
        x: First variable
        y: Second variable (same length as *x*)
        alpha: Significance level
        alternative: 'two-sided', 'greater', or 'less'

    Returns:
        HypoResult with statistic=t, effect_size=rho (Spearman rho)

    Examples:
        >>> result = spearman_correlation([3,1,4,1,5], [9,2,6,5,3])
        >>> print(f"rho = {result.effect_size:.3f}")
    """
    x, y = validate_paired_data(x, y)
    validate_alpha(alpha)
    validate_alternative(alternative)
    n = len(x)

    if n < 3:
        raise InsufficientDataError("Spearman correlation requires at least 3 data points")

    # Compute ranks
    x_ranks = _rank_data(x)
    y_ranks = _rank_data(y)

    # Spearman rho is Pearson r on the ranks
    rho = _pearson_r(x_ranks, y_ranks)
    p_value, t_stat = _r_to_pvalue(rho, n, alternative)

    df = n - 2

    data_summary = {
        "n": n,
        "spearman_rho": rho,
        "t_statistic": t_stat,
        "df": df,
    }

    significance = "significant" if p_value < alpha else "not significant"
    interpretation = (
        f"The Spearman correlation is {significance} "
        f"(rho({df}) = {rho:.4f}, t = {t_stat:.4f}, p = {p_value:.4f}). "
        + (
            f"There is a {'positive' if rho > 0 else 'negative'} monotonic relationship between the variables."  # noqa: E501
            if p_value < alpha
            else "No significant monotonic relationship detected."
        )
    )

    return HypoResult(
        test_name="Spearman Rank Correlation",
        statistic=t_stat,
        p_value=p_value,
        effect_size=rho,
        effect_size_name="r",
        confidence_interval=None,
        degrees_of_freedom=df,
        sample_sizes=n,
        alpha=alpha,
        alternative=alternative,
        interpretation=interpretation,
        data_summary=data_summary,
    )



# ---------------------------------------------------------------------------
# Point-Biserial Correlation
# ---------------------------------------------------------------------------



[docs]
def point_biserial_correlation(
    continuous: List[float],
    binary: List,
    alpha: float = 0.05,
    alternative: str = "two-sided",
) -> HypoResult:
    """
    Point-biserial correlation between a continuous and a dichotomous variable.

    Equivalent to Pearson correlation when one variable is binary (0/1 coded).
    Tests whether the mean of the continuous variable differs across the two groups.

    Args:
        continuous: Continuous-scale measurements
        binary: Binary group indicator (must contain exactly 2 unique values)
        alpha: Significance level
        alternative: 'two-sided', 'greater', or 'less'

    Returns:
        HypoResult with statistic=t, effect_size=r_pb

    Examples:
        >>> result = point_biserial_correlation(
        ...     [5.5, 6.1, 7.2, 4.8, 5.9],
        ...     [0, 1, 1, 0, 0]
        ... )
    """
    continuous = validate_data(continuous, 2, "continuous")
    validate_alpha(alpha)
    validate_alternative(alternative)

    if len(binary) != len(continuous):
        raise DataFormatError(
            f"continuous and binary must have the same length, "
            f"got {len(continuous)} and {len(binary)}"
        )

    unique_vals = list(set(binary))
    if len(unique_vals) != 2:
        raise DataFormatError(
            f"binary must contain exactly 2 unique values, got {len(unique_vals)}: {unique_vals}"
        )

    # Encode binary variable as 0/1
    lo, hi = sorted(unique_vals)
    binary_coded = [0.0 if v == lo else 1.0 for v in binary]

    n = len(continuous)
    n1 = sum(binary_coded)
    n0 = n - n1

    if n1 == 0 or n0 == 0:
        raise DataFormatError("Both groups must have at least one observation")

    group1 = [continuous[i] for i in range(n) if binary_coded[i] == 1.0]
    group0 = [continuous[i] for i in range(n) if binary_coded[i] == 0.0]

    M1 = mean(group1)
    M0 = mean(group0)
    S_y = std(continuous, ddof=0)  # population std for r_pb formula

    if S_y == 0.0:
        raise ValueError("Standard deviation of 'continuous' is zero")

    # Point-biserial r formula
    r_pb = ((M1 - M0) / S_y) * sqrt((n1 * n0) / (n * n))

    p_value, t_stat = _r_to_pvalue(r_pb, n, alternative)
    df = n - 2

    data_summary = {
        "n": n,
        "n_group0": int(n0),
        "n_group1": int(n1),
        "mean_group0": M0,
        "mean_group1": M1,
        "overall_std": S_y,
        "r_pb": r_pb,
        "t_statistic": t_stat,
        "df": df,
        "group0_label": lo,
        "group1_label": hi,
    }

    significance = "significant" if p_value < alpha else "not significant"
    interpretation = (
        f"The point-biserial correlation is {significance} "
        f"(r_pb({df}) = {r_pb:.4f}, t = {t_stat:.4f}, p = {p_value:.4f}). "
        + (
            f"The continuous variable differs significantly across the two groups "
            f"(mean_{hi} = {M1:.3f}, mean_{lo} = {M0:.3f})."
            if p_value < alpha
            else "No significant difference in the continuous variable across the two groups."
        )
    )

    return HypoResult(
        test_name="Point-Biserial Correlation",
        statistic=t_stat,
        p_value=p_value,
        effect_size=r_pb,
        effect_size_name="r",
        confidence_interval=None,
        degrees_of_freedom=df,
        sample_sizes=n,
        alpha=alpha,
        alternative=alternative,
        interpretation=interpretation,
        data_summary=data_summary,
    )