Skip to content

Statistical Tests

This module contains various statistical tests for AB testing.

AbsoluteIndependentTTest

Bases: BaseTest

Source code in aboba/tests/absolute_ttest.py
class AbsoluteIndependentTTest(BaseTest):
    def __init__(
        self,
        value_column="target",
        equal_var=True,
        random_state=None,
        alternative="two-sided",
        alpha=0.05
    ):
        """
        Independent t-test for absolute difference between two groups.

        This test compares the means of two independent groups to determine if there
        is a statistically significant difference between them in absolute terms.

        Args:
            value_column (str): Name of the column containing the values to test.
            equal_var (bool): If True, perform a standard independent 2 sample test
                that assumes equal population variances. If False, perform Welch's
                t-test, which does not assume equal population variances.
            random_state (int or None): Seed for the random number generator.
            alternative (str): Defines the alternative hypothesis. Options are:
                'two-sided' (default), 'less', or 'greater'.
            alpha (float): Significance level for confidence intervals (default: 0.05).

        Examples:
            ```python
            import pandas as pd
            import numpy as np
            from aboba.tests.absolute_ttest import AbsoluteIndependentTTest

            # Create sample data
            np.random.seed(42)
            group_a = pd.DataFrame({'target': np.random.normal(10, 2, 100)})
            group_b = pd.DataFrame({'target': np.random.normal(12, 2, 100)})

            # Perform the test
            test = AbsoluteIndependentTTest(value_column='target')
            result = test.test([group_a, group_b], {})
            print(f"P-value: {result.pvalue:.4f}")
            print(f"Effect: {result.effect:.4f}")
            ```
        """

        super().__init__()
        self.value_column = value_column
        self.equal_var = equal_var
        self.random_state = random_state
        self.alternative = alternative
        self.alpha = alpha

    def test(self, groups: List[pd.DataFrame], artefacts: Dict = {}) -> TestResult:
        """
        Perform the independent t-test on the provided groups.

        Args:
            groups (List[pd.DataFrame]): Exactly two DataFrames; the effect is
                computed as mean(groups[1]) - mean(groups[0]).
            artefacts (Dict): Unused; kept for interface compatibility.

        Returns:
            TestResult: p-value, absolute effect and its confidence interval.
        """
        assert len(groups) == 2, "Expected exactly two groups"

        a_group, b_group = groups
        a = np.asarray(a_group[self.value_column], dtype=float)
        b = np.asarray(b_group[self.value_column], dtype=float)
        n1 = a.size
        n2 = b.size
        mean1 = a.mean()
        mean2 = b.mean()
        var1 = a.var(ddof=1)
        var2 = b.var(ddof=1)

        if self.equal_var:
            # Classic pooled-variance Student t-test.
            df = n1 + n2 - 2
            sp2 = ((n1 - 1) * var1 + (n2 - 1) * var2) / df
            se = np.sqrt(sp2 * (1.0 / n1 + 1.0 / n2))
        else:
            # Welch's t-test with Welch–Satterthwaite degrees of freedom.
            se1 = var1 / n1
            se2 = var2 / n2
            se = np.sqrt(se1 + se2)
            num = (se1 + se2) ** 2
            den = (se1 ** 2) / (n1 - 1) + (se2 ** 2) / (n2 - 1)
            df = num / den

        effect = mean2 - mean1

        if se == 0.0:
            # Both samples are constant. The previous implementation reported
            # t=0 / p=1 here even when the two constants differed; a non-zero
            # difference with zero variance is a deterministic effect, so use
            # an infinite statistic instead — consistent with the
            # zero-variance handling in AbsoluteRelatedTTest.
            if effect == 0.0:
                t_stat = 0.0
                pvalue = 1.0
            else:
                t_stat = float("inf") if effect > 0 else float("-inf")
                pvalue = compute_pvalue(t_stat, df, self.alternative)
        else:
            t_stat = effect / se
            pvalue = compute_pvalue(t_stat, df, self.alternative)

        # Two-sided (1 - alpha) confidence interval for the mean difference.
        q = sps.t.ppf(1 - self.alpha / 2, df)
        left_bound, right_bound = (effect - q * se, effect + q * se)

        return TestResult(pvalue=pvalue, effect=effect, effect_interval=(left_bound, right_bound))

__init__

__init__(value_column='target', equal_var=True, random_state=None, alternative='two-sided', alpha=0.05)

Independent t-test for absolute difference between two groups.

This test compares the means of two independent groups to determine if there is a statistically significant difference between them in absolute terms.

PARAMETER DESCRIPTION
value_column

Name of the column containing the values to test.

TYPE: str DEFAULT: 'target'

equal_var

If True, perform a standard independent 2 sample test that assumes equal population variances. If False, perform Welch's t-test, which does not assume equal population variances.

TYPE: bool DEFAULT: True

random_state

Seed for the random number generator.

TYPE: int or None DEFAULT: None

alternative

Defines the alternative hypothesis. Options are: 'two-sided' (default), 'less', or 'greater'.

TYPE: str DEFAULT: 'two-sided'

alpha

Significance level for confidence intervals.

TYPE: float DEFAULT: 0.05

Examples:

import pandas as pd
import numpy as np
from aboba.tests.absolute_ttest import AbsoluteIndependentTTest

# Create sample data
np.random.seed(42)
group_a = pd.DataFrame({'target': np.random.normal(10, 2, 100)})
group_b = pd.DataFrame({'target': np.random.normal(12, 2, 100)})

# Perform the test
test = AbsoluteIndependentTTest(value_column='target')
result = test.test([group_a, group_b], {})
print(f"P-value: {result.pvalue:.4f}")
print(f"Effect: {result.effect:.4f}")
Source code in aboba/tests/absolute_ttest.py
def __init__(
    self,
    value_column="target",
    equal_var=True,
    random_state=None,
    alternative="two-sided",
    alpha=0.05
):
    """
    Independent t-test for absolute difference between two groups.

    This test compares the means of two independent groups to determine if there
    is a statistically significant difference between them in absolute terms.

    Args:
        value_column (str): Name of the column containing the values to test.
        equal_var (bool): If True, perform a standard independent 2 sample test
            that assumes equal population variances. If False, perform Welch's
            t-test, which does not assume equal population variances.
        random_state (int or None): Seed for the random number generator.
        alternative (str): Defines the alternative hypothesis. Options are:
            'two-sided' (default), 'less', or 'greater'.
        alpha (float): Significance level for confidence intervals (default: 0.05).

    Examples:
        ```python
        import pandas as pd
        import numpy as np
        from aboba.tests.absolute_ttest import AbsoluteIndependentTTest

        # Create sample data
        np.random.seed(42)
        group_a = pd.DataFrame({'target': np.random.normal(10, 2, 100)})
        group_b = pd.DataFrame({'target': np.random.normal(12, 2, 100)})

        # Perform the test
        test = AbsoluteIndependentTTest(value_column='target')
        result = test.test([group_a, group_b], {})
        print(f"P-value: {result.pvalue:.4f}")
        print(f"Effect: {result.effect:.4f}")
        ```
    """

    super().__init__()
    self.value_column = value_column
    self.equal_var = equal_var
    self.random_state = random_state
    self.alternative = alternative
    self.alpha = alpha

AbsoluteRelatedTTest

Bases: BaseTest

Performs a paired (related) two-sample t-test on absolute data.

ATTRIBUTE DESCRIPTION
value_column

Name of the column containing the values to test.

TYPE: str

alternative

Defines the alternative hypothesis. The following options are available (default is 'two-sided'): - 'two-sided': the means of the distributions underlying the samples are unequal. - 'greater': the mean of the distribution underlying the first sample is greater. - 'less': the mean of the distribution underlying the first sample is smaller.

TYPE: str

alpha

Significance level for confidence intervals

TYPE: float DEFAULT: 0.05

Source code in aboba/tests/absolute_ttest.py
class AbsoluteRelatedTTest(BaseTest):
    """
    Performs a paired (related) two-sample t-test on absolute data.

    Attributes:
        value_column (str): Name of the column containing the values to test.
        alternative (str): Defines the alternative hypothesis. The following options are
            available (default is 'two-sided'):
            - 'two-sided': the means of the distributions underlying the samples are unequal.
            - 'greater': the mean of the distribution underlying the first sample is greater.
            - 'less': the mean of the distribution underlying the first sample is smaller.
        alpha (float): Significance level for confidence intervals (default: 0.05).
    """

    def __init__(
        self,
        value_column="target",
        alternative="two-sided",
        alpha = 0.05
    ):
        """
        Related (paired) t-test for absolute difference between two groups.

        This test compares the means of two related groups to determine if there
        is a statistically significant difference between them in absolute terms.
        It is typically used when the same subjects are measured twice (before/after).

        Args:
            value_column (str): Name of the column containing the values to test.
            alternative (str): Defines the alternative hypothesis. Options are:
                'two-sided' (default), 'less', or 'greater'.
            alpha (float): Significance level for confidence intervals (default: 0.05).

        Examples:
            ```python
            import pandas as pd
            import numpy as np
            from aboba.tests.absolute_ttest import AbsoluteRelatedTTest

            # Create sample paired data
            np.random.seed(42)
            before = np.random.normal(10, 2, 50)
            after = before + np.random.normal(0.5, 1, 50)  # Adding effect
            group_a = pd.DataFrame({'target': before})
            group_b = pd.DataFrame({'target': after})

            # Perform the test
            test = AbsoluteRelatedTTest(value_column='target')
            result = test.test([group_a, group_b], {})
            print(f"P-value: {result.pvalue:.4f}")
            print(f"Effect: {result.effect:.4f}")
            ```
        """

        super().__init__()
        self.value_column = value_column
        self.alternative = alternative
        self.alpha = alpha

    def test(self, groups: List[pd.DataFrame], artefacts: Dict = {}) -> TestResult:
        """
        Perform the paired t-test on the provided groups.

        Args:
            groups (List[pd.DataFrame]): Exactly two DataFrames of equal length;
                observations are paired by position. The effect is
                mean(groups[1] - groups[0]) over pairs with finite values.
            artefacts (Dict): Unused; kept for interface compatibility.

        Returns:
            TestResult: p-value, absolute effect and its confidence interval.
        """
        assert len(groups) == 2, "Expected exactly two groups"

        a_group, b_group = groups

        a = np.asarray(a_group[self.value_column], dtype=float)
        b = np.asarray(b_group[self.value_column], dtype=float)

        # Guard against silent NumPy broadcasting: without this check, a
        # length-1 group paired with a longer one would broadcast instead of
        # failing, producing meaningless "pairwise" differences.
        assert a.size == b.size, "Paired test requires groups of equal length"

        # Keep only pairs where both measurements are finite.
        mask = np.isfinite(a) & np.isfinite(b)
        d = b[mask] - a[mask]
        n = d.size
        mean_diff = d.mean()
        std_diff = d.std(ddof=1)
        df = n - 1

        if std_diff == 0.0:
            # All differences are identical: the paired effect is deterministic.
            se = 0.0
            if mean_diff == 0.0:
                t_stat = 0.0
                pvalue = 1.0
            else:
                t_stat = float("inf") if mean_diff > 0 else float("-inf")
                pvalue = compute_pvalue(t_stat, df, self.alternative)
        else:
            se = std_diff / np.sqrt(n)
            t_stat = mean_diff / se
            pvalue = compute_pvalue(t_stat, df, self.alternative)

        effect = mean_diff

        # Two-sided (1 - alpha) confidence interval for the mean difference.
        q = sps.t.ppf(1 - self.alpha / 2, df)
        left_bound, right_bound = effect - q * se, effect + q * se

        return TestResult(pvalue=pvalue, effect=effect, effect_type="absolute", effect_interval=(left_bound, right_bound))

__init__

__init__(value_column='target', alternative='two-sided', alpha=0.05)

Related (paired) t-test for absolute difference between two groups.

This test compares the means of two related groups to determine if there is a statistically significant difference between them in absolute terms. It is typically used when the same subjects are measured twice (before/after).

PARAMETER DESCRIPTION
value_column

Name of the column containing the values to test.

TYPE: str DEFAULT: 'target'

alternative

Defines the alternative hypothesis. Options are: 'two-sided' (default), 'less', or 'greater'.

TYPE: str DEFAULT: 'two-sided'

alpha

Significance level for confidence intervals.

TYPE: float DEFAULT: 0.05

Examples:

import pandas as pd
import numpy as np
from aboba.tests.absolute_ttest import AbsoluteRelatedTTest

# Create sample paired data
np.random.seed(42)
before = np.random.normal(10, 2, 50)
after = before + np.random.normal(0.5, 1, 50)  # Adding effect
group_a = pd.DataFrame({'target': before})
group_b = pd.DataFrame({'target': after})

# Perform the test
test = AbsoluteRelatedTTest(value_column='target')
result = test.test([group_a, group_b], {})
print(f"P-value: {result.pvalue:.4f}")
print(f"Effect: {result.effect:.4f}")
Source code in aboba/tests/absolute_ttest.py
def __init__(
    self,
    value_column="target",
    alternative="two-sided",
    alpha = 0.05
):
    """
    Related (paired) t-test for absolute difference between two groups.

    This test compares the means of two related groups to determine if there
    is a statistically significant difference between them in absolute terms.
    It is typically used when the same subjects are measured twice (before/after).

    Args:
        value_column (str): Name of the column containing the values to test.
        alternative (str): Defines the alternative hypothesis. Options are:
            'two-sided' (default), 'less', or 'greater'.
        alpha (float): Significance level for confidence intervals (default: 0.05).

    Examples:
        ```python
        import pandas as pd
        import numpy as np
        from aboba.tests.absolute_ttest import AbsoluteRelatedTTest

        # Create sample paired data
        np.random.seed(42)
        before = np.random.normal(10, 2, 50)
        after = before + np.random.normal(0.5, 1, 50)  # Adding effect
        group_a = pd.DataFrame({'target': before})
        group_b = pd.DataFrame({'target': after})

        # Perform the test
        test = AbsoluteRelatedTTest(value_column='target')
        result = test.test([group_a, group_b], {})
        print(f"P-value: {result.pvalue:.4f}")
        print(f"Effect: {result.effect:.4f}")
        ```
    """

    super().__init__()
    self.value_column = value_column
    self.alternative = alternative
    self.alpha = alpha

RelativeIndependentTTest

Bases: BaseTest

Performs an independent t-test using a ratio-based measure for effect size relative to the control group.

Compatible with CUPED preprocessing: when used with CupedProcessor, automatically uses the original (pre-CUPED) control mean for denominator calculation, ensuring correct interpretation of relative effects while benefiting from variance reduction.

ATTRIBUTE DESCRIPTION
value_column

Name of the column containing the values to test.

TYPE: str

alternative

Defines the alternative hypothesis. Must be one of {'two-sided', 'less', 'greater'}.

TYPE: str

alpha

Significance level for confidence intervals (default: 0.05).

TYPE: float

Source code in aboba/tests/relative_ttest.py
class RelativeIndependentTTest(BaseTest):
    """
    Performs an independent t-test using a ratio-based measure for effect size relative
    to the control group.

    Compatible with CUPED preprocessing: when used with CupedProcessor, automatically
    uses the original (pre-CUPED) control mean for denominator calculation, ensuring
    correct interpretation of relative effects while benefiting from variance reduction.

    Attributes:
        value_column (str): Name of the column containing the values to test.
        alternative (str): Defines the alternative hypothesis. Must be one of
            {'two-sided', 'less', 'greater'}.
        alpha (float): Significance level for confidence intervals (default: 0.05).
    """

    def __init__(
        self,
        value_column="target",
        alternative="two-sided",
        alpha=0.05,
    ):
        """
        Independent t-test for relative difference between two groups.

        This test compares the means of two independent groups to determine if there
        is a statistically significant relative difference between them. The relative
        difference is calculated as (test_mean - control_mean) / control_mean.

        When used after CUPED preprocessing, automatically uses the original control
        mean (before CUPED transformation) for correct relative effect calculation.
        This provides the best of both worlds: variance reduction from CUPED and
        correct relative effect interpretation.

        Args:
            value_column (str): Name of the column containing the values to test.
            alternative (str): Defines the alternative hypothesis. Options are:
                'two-sided' (default), 'less', or 'greater'.
            alpha (float): Significance level for confidence intervals (default: 0.05).

        Examples:
            ```python
            import pandas as pd
            import numpy as np
            from aboba.tests.relative_ttest import RelativeIndependentTTest

            # Example 1: Basic usage without CUPED
            np.random.seed(42)
            control = pd.DataFrame({'target': np.random.normal(100, 10, 100)})
            test = pd.DataFrame({'target': np.random.normal(105, 10, 100)})  # 5% increase

            test_instance = RelativeIndependentTTest(value_column='target')
            result = test_instance.test([control, test])
            print(f"P-value: {result.pvalue:.4f}")
            print(f"Relative Effect: {result.effect:.4f} ({result.effect*100:.2f}%)")

            # Example 2: Usage with CUPED (artifacts passed automatically by pipeline)
            # The test will automatically detect and use original control mean from artifacts
            ```
        """
        super().__init__()
        self.value_column = value_column
        self.alternative = alternative
        self.alpha = alpha
        # Validation intentionally keeps construction failing fast on typos.
        assert alternative in {"two-sided", "less", "greater"}

    def test(self, groups: List[pd.DataFrame], artefacts: Optional[Dict] = None) -> TestResult:
        """
        Perform the relative independent t-test on the provided groups.

        Args:
            groups (List[pd.DataFrame]): List of two DataFrames representing the groups to compare.
                The first group is treated as the control group, the second as the test group.
            artefacts (Optional[Dict]): Artifacts from preprocessing pipeline. 
                If 'cuped_original_control_mean' is present, uses it as denominator for relative effect.
                This enables correct relative effect calculation when using CUPED.

        Returns:
            TestResult: Object containing the p-value and relative effect size.
                - pvalue: Statistical significance
                - effect: Relative effect (test - control) / control_mean
                - effect_type: "relative_control"
                - effect_interval: Confidence interval for the effect
        """

        control_group, test_group = groups

        # Naming convention: X / suffix 1 refer to the test group,
        # Y / suffix 2 to the control group.
        Y = control_group[self.value_column].to_numpy(float)
        X = test_group[self.value_column].to_numpy(float)

        var_1, var_2 = np.var(X, ddof=1), np.var(Y, ddof=1)
        a_1, a_2 = np.mean(X), np.mean(Y)

        # Check if we have CUPED artifacts with original control mean
        if artefacts is not None and 'cuped_original_control_mean' in artefacts:
            original_control_mean = artefacts['cuped_original_control_mean']
            R = (a_1 - a_2) / original_control_mean
            # var_R = var_1 / (a_2**2) + (a_1**2) / (a_2**4) * var_2
            var_R = (var_1 + var_2) / (original_control_mean ** 2)
        else:
            # Delta-method variance of the ratio R = (a_1 - a_2) / a_2.
            R = (a_1 - a_2) / a_2
            var_R = var_1 / (a_2**2) + (a_1**2) / (a_2**4) * var_2

        # NOTE(review): n is taken from the test group only; the normal
        # approximation below implicitly assumes comparable group sizes —
        # confirm behaviour for strongly unbalanced designs.
        n = len(test_group)
        stat = np.sqrt(n) * R / np.sqrt(var_R)

        # Normal (z) approximation rather than the t distribution.
        if self.alternative == "two-sided":
            pvalue = 2 * min(sps.norm.cdf(stat), sps.norm.sf(stat))
            pvalue = min(pvalue, 1.0)
        elif self.alternative == "less":
            pvalue = sps.norm.cdf(stat)
        elif self.alternative == "greater":
            pvalue = sps.norm.sf(stat)
        else:
            # Defensive: unreachable because __init__ validates `alternative`.
            raise ValueError(f"Unknown alternative: {self.alternative}")

        # Two-sided (1 - alpha) confidence interval, regardless of `alternative`.
        q = sps.norm.ppf(1 - self.alpha/2)
        left_bound = R - q * np.sqrt(var_R / n)
        right_bound = R + q * np.sqrt(var_R / n)

        return TestResult(
            pvalue=pvalue, 
            effect=R, 
            effect_type="relative_control", 
            effect_interval=(left_bound, right_bound)
        )

__init__

__init__(value_column='target', alternative='two-sided', alpha=0.05)

Independent t-test for relative difference between two groups.

This test compares the means of two independent groups to determine if there is a statistically significant relative difference between them. The relative difference is calculated as (test_mean - control_mean) / control_mean.

When used after CUPED preprocessing, automatically uses the original control mean (before CUPED transformation) for correct relative effect calculation. This provides the best of both worlds: variance reduction from CUPED and correct relative effect interpretation.

PARAMETER DESCRIPTION
value_column

Name of the column containing the values to test.

TYPE: str DEFAULT: 'target'

alternative

Defines the alternative hypothesis. Options are: 'two-sided' (default), 'less', or 'greater'.

TYPE: str DEFAULT: 'two-sided'

alpha

Significance level for confidence intervals (default: 0.05).

TYPE: float DEFAULT: 0.05

Examples:

import pandas as pd
import numpy as np
from aboba.tests.relative_ttest import RelativeIndependentTTest

# Example 1: Basic usage without CUPED
np.random.seed(42)
control = pd.DataFrame({'target': np.random.normal(100, 10, 100)})
test = pd.DataFrame({'target': np.random.normal(105, 10, 100)})  # 5% increase

test_instance = RelativeIndependentTTest(value_column='target')
result = test_instance.test([control, test])
print(f"P-value: {result.pvalue:.4f}")
print(f"Relative Effect: {result.effect:.4f} ({result.effect*100:.2f}%)")

# Example 2: Usage with CUPED (artifacts passed automatically by pipeline)
# The test will automatically detect and use original control mean from artifacts
Source code in aboba/tests/relative_ttest.py
def __init__(
    self,
    value_column="target",
    alternative="two-sided",
    alpha=0.05,
):
    """
    Independent t-test for relative difference between two groups.

    This test compares the means of two independent groups to determine if there
    is a statistically significant relative difference between them. The relative
    difference is calculated as (test_mean - control_mean) / control_mean.

    When used after CUPED preprocessing, automatically uses the original control
    mean (before CUPED transformation) for correct relative effect calculation.
    This provides the best of both worlds: variance reduction from CUPED and
    correct relative effect interpretation.

    Args:
        value_column (str): Name of the column containing the values to test.
        alternative (str): Defines the alternative hypothesis. Options are:
            'two-sided' (default), 'less', or 'greater'.
        alpha (float): Significance level for confidence intervals (default: 0.05).

    Examples:
        ```python
        import pandas as pd
        import numpy as np
        from aboba.tests.relative_ttest import RelativeIndependentTTest

        # Example 1: Basic usage without CUPED
        np.random.seed(42)
        control = pd.DataFrame({'target': np.random.normal(100, 10, 100)})
        test = pd.DataFrame({'target': np.random.normal(105, 10, 100)})  # 5% increase

        test_instance = RelativeIndependentTTest(value_column='target')
        result = test_instance.test([control, test])
        print(f"P-value: {result.pvalue:.4f}")
        print(f"Relative Effect: {result.effect:.4f} ({result.effect*100:.2f}%)")

        # Example 2: Usage with CUPED (artifacts passed automatically by pipeline)
        # The test will automatically detect and use original control mean from artifacts
        ```
    """
    super().__init__()
    self.value_column = value_column
    self.alternative = alternative
    self.alpha = alpha
    # NOTE(review): validation runs after the attributes are assigned; on an
    # invalid `alternative` the AssertionError still propagates, so the
    # half-initialised instance never reaches the caller.
    assert alternative in {"two-sided", "less", "greater"}

test

test(groups: List[DataFrame], artefacts: Optional[Dict] = None) -> TestResult

Perform the relative independent t-test on the provided groups.

PARAMETER DESCRIPTION
groups

List of two DataFrames representing the groups to compare. The first group is treated as the control group, the second as the test group.

TYPE: List[DataFrame]

artefacts

Artifacts from preprocessing pipeline. If 'cuped_original_control_mean' is present, uses it as denominator for relative effect. This enables correct relative effect calculation when using CUPED.

TYPE: Optional[Dict] DEFAULT: None

RETURNS DESCRIPTION
TestResult

Object containing the p-value and relative effect size. - pvalue: Statistical significance - effect: Relative effect (test - control) / control_mean - effect_type: "relative_control" - effect_interval: Confidence interval for the effect

TYPE: TestResult

Source code in aboba/tests/relative_ttest.py
def test(self, groups: List[pd.DataFrame], artefacts: Optional[Dict] = None) -> TestResult:
    """
    Perform the relative independent t-test on the provided groups.

    Args:
        groups (List[pd.DataFrame]): List of two DataFrames representing the groups to compare.
            The first group is treated as the control group, the second as the test group.
        artefacts (Optional[Dict]): Artifacts from preprocessing pipeline. 
            If 'cuped_original_control_mean' is present, uses it as denominator for relative effect.
            This enables correct relative effect calculation when using CUPED.

    Returns:
        TestResult: Object containing the p-value and relative effect size.
            - pvalue: Statistical significance
            - effect: Relative effect (test - control) / control_mean
            - effect_type: "relative_control"
            - effect_interval: Confidence interval for the effect
    """

    control_group, test_group = groups

    # Naming convention: X / suffix 1 refer to the test group,
    # Y / suffix 2 to the control group.
    Y = control_group[self.value_column].to_numpy(float)
    X = test_group[self.value_column].to_numpy(float)

    var_1, var_2 = np.var(X, ddof=1), np.var(Y, ddof=1)
    a_1, a_2 = np.mean(X), np.mean(Y)

    # Check if we have CUPED artifacts with original control mean
    if artefacts is not None and 'cuped_original_control_mean' in artefacts:
        original_control_mean = artefacts['cuped_original_control_mean']
        R = (a_1 - a_2) / original_control_mean
        # var_R = var_1 / (a_2**2) + (a_1**2) / (a_2**4) * var_2
        var_R = (var_1 + var_2) / (original_control_mean ** 2)
    else:
        # Delta-method variance of the ratio R = (a_1 - a_2) / a_2.
        R = (a_1 - a_2) / a_2
        var_R = var_1 / (a_2**2) + (a_1**2) / (a_2**4) * var_2

    # NOTE(review): n is taken from the test group only; the normal
    # approximation below implicitly assumes comparable group sizes —
    # confirm behaviour for strongly unbalanced designs.
    n = len(test_group)
    stat = np.sqrt(n) * R / np.sqrt(var_R)

    # Normal (z) approximation rather than the t distribution.
    if self.alternative == "two-sided":
        pvalue = 2 * min(sps.norm.cdf(stat), sps.norm.sf(stat))
        pvalue = min(pvalue, 1.0)
    elif self.alternative == "less":
        pvalue = sps.norm.cdf(stat)
    elif self.alternative == "greater":
        pvalue = sps.norm.sf(stat)
    else:
        # Defensive: unreachable because __init__ validates `alternative`.
        raise ValueError(f"Unknown alternative: {self.alternative}")

    # Two-sided (1 - alpha) confidence interval, regardless of `alternative`.
    q = sps.norm.ppf(1 - self.alpha/2)
    left_bound = R - q * np.sqrt(var_R / n)
    right_bound = R + q * np.sqrt(var_R / n)

    return TestResult(
        pvalue=pvalue, 
        effect=R, 
        effect_type="relative_control", 
        effect_interval=(left_bound, right_bound)
    )

StratifiedTTest

Bases: BaseTest

Source code in aboba/tests/stratified_ttest.py
class StratifiedTTest(BaseTest):
    """
    Stratified two-sample test with caller-supplied global strata weights.

    Group means and variances are combined per stratum according to `method`
    ('random', 'stratified' or 'post_stratified') and the difference of the
    two weighted means is compared using a normal (z) approximation.
    """

    def __init__(
        self,
        group_column: str,
        group_size: int,
        method: str,
        strata_columns: List[str],
        strata_weights: Union[pd.Series, dict],
        col_name: str = "target",
        alpha: float = 0.05,
    ):
        """
        Performs a stratified t-test on the data.

        This test performs a t-test while accounting for stratification in the data.
        Strata weights must be provided by the caller and represent global (population)
        proportions for each stratum — they are not inferred from the sample.

        Args:
            group_column (str): Name of the column containing group identifiers.
            group_size (int): Size of groups to sample (used externally by splitters).
            method (str): Weighting method. One of 'random', 'stratified', 'post_stratified'.
            strata_columns (List[str]): List of columns to stratify by.
            strata_weights (Union[pd.Series, dict]): Global (population) weights for each
                stratum. Keys/index must match the unique values of the strata columns.
                Will be normalised to sum to 1.
            col_name (str): Name of the column to test.
            alpha (float): Significance level for confidence interval.

        Examples:
            ```python
            import pandas as pd
            import numpy as np
            from aboba.tests.stratified_ttest import StratifiedTTest

            np.random.seed(42)
            data = pd.DataFrame({
                'group': np.repeat(['A', 'B'], 100),
                'strata': np.tile(['X', 'Y'], 100),
                'target': np.concatenate([
                    np.random.normal(10, 2, 100),
                    np.random.normal(12, 2, 100)
                ])
            })

            strata_weights = pd.Series({'X': 0.4, 'Y': 0.6})

            test = StratifiedTTest(
                group_column='group',
                group_size=50,
                method='stratified',
                strata_columns=['strata'],
                strata_weights=strata_weights,
                col_name='target',
            )

            groups = [data[data['group'] == 'A'], data[data['group'] == 'B']]
            result = test.test(groups, {})
            print(f"P-value: {result.pvalue:.4f}")
            print(f"Effect: {result.effect:.4f}")
            ```
        """
        assert method in [
            "random",
            "stratified",
            "post_stratified",
        ], f"Invalid {method = }. Must be one of 'random', 'stratified', 'post_stratified'"
        assert len(strata_columns) > 0, "Must have at least one strata column"

        super().__init__()
        self.col_name = col_name
        self.method = method
        self.strata_columns = strata_columns
        self.alpha = alpha
        self.group_column = group_column
        self.group_size = group_size

        # Normalise the supplied population weights so they sum to 1.
        weights = pd.Series(strata_weights) if isinstance(strata_weights, dict) else strata_weights
        self.strata_weights = weights / weights.sum()

    def test(self, groups: List[pd.DataFrame], artefacts: Dict = {}) -> TestResult:
        """
        Perform the stratified t-test on the provided groups.

        Args:
            groups (List[pd.DataFrame]): List of two DataFrames representing the groups.
            artefacts: Unused; kept for interface compatibility.

        Returns:
            TestResult: Object containing the p-value and effect size.
        """
        assert len(groups) == 2

        # Dispatch the mean/variance estimators by weighting method.
        if self.method == "random":
            mean_function = self._simple_mean
            var_function = self._simple_var
        elif self.method == "stratified":
            mean_function = self._weighted_mean
            var_function = self._weighted_var
        elif self.method == "post_stratified":
            mean_function = self._weighted_mean
            var_function = self._weighted_post_var
        else:
            # Defensive: unreachable because __init__ validates `method`.
            raise RuntimeError(f"method not supported {self.method = }")

        x_mean = mean_function(groups[0])
        y_mean = mean_function(groups[1])
        x_var = var_function(groups[0])
        y_var = var_function(groups[1])

        # NOTE(review): effect is groups[0] - groups[1] — the opposite sign
        # convention to the absolute t-tests in this package (which report
        # mean(groups[1]) - mean(groups[0])); confirm downstream expectations.
        effect = x_mean - y_mean
        std = np.sqrt(x_var + y_var)
        t_stat = effect / std
        # Normal (z) quantiles are used despite the class name.
        pvalue = 2 * sps.norm.sf(np.abs(t_stat))
        q = sps.norm.ppf(1 - self.alpha / 2)
        left_bound, right_bound = effect - q * std, effect + q * std

        return TestResult(pvalue, effect, effect_interval=(left_bound, right_bound))

    def _weighted_mean(self, data):
        # Per-stratum means combined with the global weights. pandas aligns
        # on the strata index; strata present in the weights but absent from
        # `data` become NaN and are silently skipped by .sum() — TODO confirm
        # that dropping such strata is intended.
        strata_means = data.groupby(by=self.strata_columns)[self.col_name].mean()
        return (strata_means * self.strata_weights).sum()

    def _weighted_var(self, data):
        # Variance of the stratified mean estimate. NOTE(review): the form
        # sum(w_h * s_h^2) / n equals sum(w_h^2 * s_h^2 / n_h) only under
        # proportional allocation (n_h = w_h * n) — confirm intended design.
        strata_vars = data.groupby(by=self.strata_columns)[self.col_name].var()
        return (strata_vars * self.strata_weights).sum() / len(data)

    def _weighted_post_var(self, data):
        # Post-stratification variance: proportional-allocation term plus a
        # second-order O(1/n^2) correction for random stratum sizes.
        strata_vars = data.groupby(by=self.strata_columns)[self.col_name].var()
        weighted_var = (strata_vars * self.strata_weights).sum() / len(data)
        post_addition = (strata_vars * (1 - self.strata_weights)).sum() / (
            len(data) ** 2
        )
        return weighted_var + post_addition

    def _simple_mean(self, data):
        # Unweighted sample mean (method='random').
        return data[self.col_name].mean()

    def _simple_var(self, data):
        # Variance of the unweighted sample mean (method='random').
        return data[self.col_name].var() / len(data)
__init__

__init__(group_column: str, group_size: int, method: str, strata_columns: List[str], strata_weights: Union[Series, dict], col_name: str = 'target', alpha: float = 0.05)

Performs a stratified t-test on the data.

This test performs a t-test while accounting for stratification in the data. Strata weights must be provided by the caller and represent global (population) proportions for each stratum — they are not inferred from the sample.

PARAMETER DESCRIPTION
group_column

Name of the column containing group identifiers.

TYPE: str

group_size

Size of groups to sample (used externally by splitters).

TYPE: int

method

Weighting method. One of 'random', 'stratified', 'post_stratified'.

TYPE: str

strata_columns

List of columns to stratify by.

TYPE: List[str]

strata_weights

Global (population) weights for each stratum. Keys/index must match the unique values of the strata columns. Will be normalised to sum to 1.

TYPE: Union[Series, dict]

col_name

Name of the column to test.

TYPE: str DEFAULT: 'target'

alpha

Significance level for confidence interval.

TYPE: float DEFAULT: 0.05

Examples:

import pandas as pd
import numpy as np
from aboba.tests.stratified_ttest import StratifiedTTest

np.random.seed(42)
data = pd.DataFrame({
    'group': np.repeat(['A', 'B'], 100),
    'strata': np.tile(['X', 'Y'], 100),
    'target': np.concatenate([
        np.random.normal(10, 2, 100),
        np.random.normal(12, 2, 100)
    ])
})

strata_weights = pd.Series({'X': 0.4, 'Y': 0.6})

test = StratifiedTTest(
    group_column='group',
    group_size=50,
    method='stratified',
    strata_columns=['strata'],
    strata_weights=strata_weights,
    col_name='target',
)

groups = [data[data['group'] == 'A'], data[data['group'] == 'B']]
result = test.test(groups, {})
print(f"P-value: {result.pvalue:.4f}")
print(f"Effect: {result.effect:.4f}")
Source code in aboba/tests/stratified_ttest.py
def __init__(
    self,
    group_column: str,
    group_size: int,
    method: str,
    strata_columns: List[str],
    strata_weights: Union[pd.Series, dict],
    col_name: str = "target",
    alpha: float = 0.05,
):
    """
    Configure a stratified t-test.

    The test accounts for stratification in the data. Strata weights must be
    supplied by the caller and represent global (population) proportions for
    each stratum — they are not inferred from the sample.

    Args:
        group_column (str): Column holding group identifiers.
        group_size (int): Group size to sample (used externally by splitters).
        method (str): Weighting method; one of 'random', 'stratified',
            'post_stratified'.
        strata_columns (List[str]): Columns that define the strata.
        strata_weights (Union[pd.Series, dict]): Global (population) weight per
            stratum. Keys/index must match the unique values of the strata
            columns; normalised here to sum to 1.
        col_name (str): Column to test.
        alpha (float): Significance level for the confidence interval.
    """
    assert method in [
        "random",
        "stratified",
        "post_stratified",
    ], f"Invalid {method = }. Must be one of 'random', 'stratified', 'post_stratified'"
    assert len(strata_columns) > 0, "Must have at least one strata column"

    super().__init__()
    # Plain configuration storage; assignment order is immaterial.
    self.group_column = group_column
    self.group_size = group_size
    self.method = method
    self.strata_columns = strata_columns
    self.col_name = col_name
    self.alpha = alpha

    # Accept either a dict or a Series, then normalise so the weights sum to 1.
    if isinstance(strata_weights, dict):
        raw_weights = pd.Series(strata_weights)
    else:
        raw_weights = strata_weights
    self.strata_weights = raw_weights / raw_weights.sum()

test

test(groups: List[DataFrame], artefacts: Dict = {}) -> TestResult

Perform the stratified t-test on the provided groups.

PARAMETER DESCRIPTION
groups

List of two DataFrames representing the groups.

TYPE: List[DataFrame]

artefacts

Unused; kept for interface compatibility.

TYPE: Dict DEFAULT: {}

RETURNS DESCRIPTION
TestResult

Object containing the p-value and effect size.

TYPE: TestResult

Source code in aboba/tests/stratified_ttest.py
def test(self, groups: List[pd.DataFrame], artefacts: Dict = {}) -> TestResult:
    """
    Perform the stratified t-test on the provided groups.

    Args:
        groups (List[pd.DataFrame]): List of two DataFrames representing the groups.
        artefacts: Unused; kept for interface compatibility.

    Returns:
        TestResult: Object containing the p-value and effect size.
    """
    assert len(groups) == 2

    # Map each weighting method to its (mean, variance) estimator pair.
    estimators = {
        "random": (self._simple_mean, self._simple_var),
        "stratified": (self._weighted_mean, self._weighted_var),
        "post_stratified": (self._weighted_mean, self._weighted_post_var),
    }
    if self.method not in estimators:
        raise RuntimeError(f"method not supported {self.method = }")
    mean_function, var_function = estimators[self.method]

    first, second = groups

    effect = mean_function(first) - mean_function(second)
    std = np.sqrt(var_function(first) + var_function(second))

    # Two-sided p-value from the normal approximation of the t statistic.
    z_stat = effect / std
    pvalue = 2 * sps.norm.sf(np.abs(z_stat))

    # Symmetric normal-approximation confidence interval at level 1 - alpha.
    half_width = sps.norm.ppf(1 - self.alpha / 2) * std
    left_bound = effect - half_width
    right_bound = effect + half_width

    return TestResult(pvalue, effect, effect_interval=(left_bound, right_bound))

CupedLinearRegressionTTest

Bases: BaseTest

Source code in aboba/tests/cuped_lreg.py
class CupedLinearRegressionTTest(BaseTest):
    """CUPED via linear regression: covariate adjustment with HC3-robust OLS/WLS.

    See ``__init__`` for the full parameter description and a usage example.
    """

    def __init__(
        self,
        covariate_names: Optional[List[str]] = None,
        group_column: str = "group",
        value_column: str = "target",
        alpha: float = 0.05,
        center_on_control: bool = True,
        weight_column: Optional[str] = None,
        include_extra: bool = False,
        strata_column: Optional[str] = None,
        strata_weights: Optional[Union[Dict, pd.Series]] = None,
    ) -> None:
        """
        CUPED (Controlled-experiment Using Pre-Experiment Data) via linear regression.

        This test uses linear regression to adjust for pre-experiment covariates, reducing
        variance and increasing statistical power. The method centers covariates on the
        control group mean and estimates the treatment effect using OLS or WLS regression
        with heteroscedasticity-robust standard errors (HC3).

        Args:
            covariate_names (List[str], optional): List of pre-experiment covariates to adjust for.
                These should be variables measured before the experiment that correlate with
                the outcome metric.
            group_column (str): Name of column containing group assignment (A/B). Default "group".
            value_column (str): Name of column containing metric values to test. Default "target".
            alpha (float): Significance level for confidence interval. Default 0.05.
            center_on_control (bool): If True, covariates are centered by their mean in control
                group. This is recommended for variance reduction. Default True.
            weight_column (Optional[str]): Column with observation weights for weighted least
                squares regression. If None, uses ordinary least squares.
            include_extra (bool): If True, includes additional regression artifacts (parameters,
                design matrix, residuals) in TestResult.extra. Default False.
            strata_column (Optional[str]): Column containing stratum identifiers. Used together
                with strata_weights to derive per-observation WLS weights when weight_column
                is not provided.
            strata_weights (Optional[Union[Dict, pd.Series]]): Global (population) weight for
                each stratum. When strata_column is set and weight_column is None, the
                per-observation weight is computed as:
                    w_i = strata_weight[stratum_i] / count(stratum_i in group_i)

        Examples:
            ```python
            import pandas as pd
            import numpy as np
            from aboba.tests.cuped_lreg import CupedLinearRegressionTTest

            # Create sample data with pre-experiment covariate
            np.random.seed(42)
            n = 200
            pre_metric = np.random.normal(100, 15, n)

            # Control group
            group_a = pd.DataFrame({
                'target': pre_metric[:100] + np.random.normal(0, 10, 100),
                'pre_metric': pre_metric[:100],
                'group': 0
            })

            # Treatment group with effect
            group_b = pd.DataFrame({
                'target': pre_metric[100:] + np.random.normal(5, 10, 100),
                'pre_metric': pre_metric[100:],
                'group': 1
            })

            # Perform CUPED test
            test = CupedLinearRegressionTTest(
                covariate_names=['pre_metric'],
                value_column='target',
                group_column='group'
            )
            result = test.test([group_a, group_b], {})
            print(f"P-value: {result.pvalue:.4f}")
            print(f"Effect: {result.effect:.4f}")
            print(f"CI: [{result.effect_interval[0]:.4f}, {result.effect_interval[1]:.4f}]")
            ```
        """
        super().__init__()
        self.value_column = value_column
        self.group_column = group_column
        self.covariate_names = covariate_names or []
        self.alpha = alpha
        self.center_on_control = center_on_control
        self.weight_column = weight_column
        self.include_extra = include_extra
        self.strata_column = strata_column
        # Accept a dict or a Series; store as a Series so .map() works in test().
        self.strata_weights = (
            pd.Series(strata_weights) if isinstance(strata_weights, dict) else strata_weights
        )

    def test(self, groups: List[pd.DataFrame], artefacts: Dict = {}) -> TestResult:
        """Run the CUPED regression on two groups.

        Args:
            groups (List[pd.DataFrame]): Exactly two DataFrames; groups[0] is treated
                as the control group for covariate centering.
            artefacts (Dict): Unused; kept for interface compatibility.

        Returns:
            TestResult: p-value and effect estimate for the group coefficient, a
            normal-approximation confidence interval at level 1 - alpha, and —
            when include_extra is True — regression artifacts in ``extra``.
        """
        assert len(groups) == 2, "CupedLinearRegressionTTest expects exactly two groups"

        a_group, b_group = groups[0].copy(), groups[1].copy()

        # Derive per-observation WLS weights from strata when weight_column is absent:
        # w_i = strata_weight[stratum_i] / count(stratum_i within the observation's group).
        effective_weight_column = self.weight_column
        if (
            self.strata_column is not None
            and self.strata_weights is not None
            and self.weight_column is None
        ):
            for group in (a_group, b_group):
                stratum_counts = group[self.strata_column].value_counts()
                group["_strata_weight"] = (
                    group[self.strata_column].map(self.strata_weights)
                    / group[self.strata_column].map(stratum_counts)
                )
            effective_weight_column = "_strata_weight"

        data = pd.concat([a_group, b_group], ignore_index=True)

        # Build the regressor list: group indicator plus (optionally centered) covariates.
        feature_names: List[str] = [self.group_column]
        for name in self.covariate_names:
            if self.center_on_control:
                # Center each covariate on its control-group mean (weighted when WLS
                # weights exist) so the group coefficient is the adjusted effect.
                if effective_weight_column is None:
                    mean_control = a_group[name].mean()
                else:
                    assert effective_weight_column in a_group.columns, (
                        f"Weight column '{effective_weight_column}' not found in control group"
                    )
                    w = a_group[effective_weight_column].to_numpy(float)
                    x = a_group[name].to_numpy(float)
                    mean_control = float(np.average(x, weights=w))
                cname = f"{name}_c"
                data[cname] = data[name] - mean_control
                feature_names.append(cname)
            else:
                feature_names.append(name)

        formula = f"{self.value_column} ~ " + " + ".join(feature_names)

        # Fit with HC3 heteroscedasticity-robust standard errors.
        if effective_weight_column is None:
            model = ols(formula, data=data).fit(cov_type="HC3")
        else:
            assert effective_weight_column in data.columns, (
                f"Weight column '{effective_weight_column}' not found in data"
            )
            model = wls(
                formula,
                data=data,
                weights=data[effective_weight_column],
            ).fit(cov_type="HC3")

        ef = model.params[self.group_column]
        se = model.bse[self.group_column]
        pvalue = model.pvalues[self.group_column]

        # Normal-approximation confidence interval for the effect.
        q = sps.norm.ppf(1 - self.alpha / 2)
        left_bound, right_bound = ef - q * se, ef + q * se

        extra = None
        if self.include_extra:
            extra = {
                "params": model.params,
                "design_matrix": model.model.exog,
                "resid": model.resid,
            }

        # `extra` is already None when include_extra is False, so pass it directly
        # (the original `extra if self.include_extra else None` was redundant).
        return TestResult(
            pvalue=pvalue,
            effect=ef,
            effect_interval=(left_bound, right_bound),
            extra=extra,
        )

__init__

__init__(covariate_names: Optional[List[str]] = None, group_column: str = 'group', value_column: str = 'target', alpha: float = 0.05, center_on_control: bool = True, weight_column: Optional[str] = None, include_extra: bool = False, strata_column: Optional[str] = None, strata_weights: Optional[Union[Dict, Series]] = None) -> None

CUPED (Controlled-experiment Using Pre-Experiment Data) via linear regression.

This test uses linear regression to adjust for pre-experiment covariates, reducing variance and increasing statistical power. The method centers covariates on the control group mean and estimates the treatment effect using OLS or WLS regression with heteroscedasticity-robust standard errors (HC3).

PARAMETER DESCRIPTION
covariate_names

List of pre-experiment covariates to adjust for. These should be variables measured before the experiment that correlate with the outcome metric.

TYPE: List[str] DEFAULT: None

group_column

Name of column containing group assignment (A/B). Default "group".

TYPE: str DEFAULT: 'group'

value_column

Name of column containing metric values to test. Default "target".

TYPE: str DEFAULT: 'target'

alpha

Significance level for confidence interval. Default 0.05.

TYPE: float DEFAULT: 0.05

center_on_control

If True, covariates are centered by their mean in control group. This is recommended for variance reduction. Default True.

TYPE: bool DEFAULT: True

weight_column

Column with observation weights for weighted least squares regression. If None, uses ordinary least squares.

TYPE: Optional[str] DEFAULT: None

include_extra

If True, includes additional regression artifacts (parameters, design matrix, residuals) in TestResult.extra. Default False.

TYPE: bool DEFAULT: False

strata_column

Column containing stratum identifiers. Used together with strata_weights to derive per-observation WLS weights when weight_column is not provided.

TYPE: Optional[str] DEFAULT: None

strata_weights

Global (population) weight for each stratum. When strata_column is set and weight_column is None, the per-observation weight is computed as: w_i = strata_weight[stratum_i] / count(stratum_i in group_i)

TYPE: Optional[Union[Dict, Series]] DEFAULT: None

Examples:

import pandas as pd
import numpy as np
from aboba.tests.cuped_lreg import CupedLinearRegressionTTest

# Create sample data with pre-experiment covariate
np.random.seed(42)
n = 200
pre_metric = np.random.normal(100, 15, n)

# Control group
group_a = pd.DataFrame({
    'target': pre_metric[:100] + np.random.normal(0, 10, 100),
    'pre_metric': pre_metric[:100],
    'group': 0
})

# Treatment group with effect
group_b = pd.DataFrame({
    'target': pre_metric[100:] + np.random.normal(5, 10, 100),
    'pre_metric': pre_metric[100:],
    'group': 1
})

# Perform CUPED test
test = CupedLinearRegressionTTest(
    covariate_names=['pre_metric'],
    value_column='target',
    group_column='group'
)
result = test.test([group_a, group_b], {})
print(f"P-value: {result.pvalue:.4f}")
print(f"Effect: {result.effect:.4f}")
print(f"CI: [{result.effect_interval[0]:.4f}, {result.effect_interval[1]:.4f}]")
Source code in aboba/tests/cuped_lreg.py
def __init__(
    self,
    covariate_names: Optional[List[str]] = None,
    group_column: str = "group",
    value_column: str = "target",
    alpha: float = 0.05,
    center_on_control: bool = True,
    weight_column: Optional[str] = None,
    include_extra: bool = False,
    strata_column: Optional[str] = None,
    strata_weights: Optional[Union[Dict, pd.Series]] = None,
) -> None:
    """
    CUPED (Controlled-experiment Using Pre-Experiment Data) via linear regression.

    Adjusts for pre-experiment covariates with OLS/WLS regression using
    heteroscedasticity-robust (HC3) standard errors, reducing variance and
    increasing statistical power. Covariates may be centered on their mean in
    the control group.

    Args:
        covariate_names (List[str], optional): Pre-experiment covariates to
            adjust for; variables measured before the experiment that correlate
            with the outcome metric.
        group_column (str): Column with the group assignment (A/B).
        value_column (str): Column with the metric values to test.
        alpha (float): Significance level for the confidence interval.
        center_on_control (bool): If True, center covariates on their
            control-group mean (recommended for variance reduction).
        weight_column (Optional[str]): Observation weights for weighted least
            squares; ordinary least squares when None.
        include_extra (bool): If True, attach regression artifacts (parameters,
            design matrix, residuals) to TestResult.extra.
        strata_column (Optional[str]): Stratum identifier column; combined with
            strata_weights to derive per-observation WLS weights when
            weight_column is not provided.
        strata_weights (Optional[Union[Dict, pd.Series]]): Global (population)
            weight per stratum. With strata_column set and weight_column None,
            the per-observation weight is
            strata_weight[stratum_i] / count(stratum_i in group_i).
    """
    super().__init__()
    # Plain configuration storage; assignment order is immaterial.
    self.group_column = group_column
    self.value_column = value_column
    self.alpha = alpha
    self.center_on_control = center_on_control
    self.weight_column = weight_column
    self.include_extra = include_extra
    self.strata_column = strata_column

    # An absent/empty covariate list degrades to a plain A/B regression.
    self.covariate_names = covariate_names if covariate_names else []

    # Accept a dict or a Series; store as a Series so .map() works later.
    if isinstance(strata_weights, dict):
        self.strata_weights = pd.Series(strata_weights)
    else:
        self.strata_weights = strata_weights