Skip to content

Statistical Tests

This module contains various statistical tests for AB testing.

AbsoluteIndependentTTest

Bases: BaseTest

Source code in aboba/tests/absolute_ttest.py
class AbsoluteIndependentTTest(BaseTest):
    def __init__(
        self,
        value_column="target",
        equal_var=True,
        random_state=None,
        alternative="two-sided",
        alpha=0.05
    ):
        """
        Independent t-test for absolute difference between two groups.

        This test compares the means of two independent groups to determine if there
        is a statistically significant difference between them in absolute terms.

        Args:
            value_column (str): Name of the column containing the values to test.
            equal_var (bool): If True, perform a standard independent 2 sample test
                that assumes equal population variances. If False, perform Welch's
                t-test, which does not assume equal population variances.
            random_state (int or None): Seed for the random number generator.
            alternative (str): Defines the alternative hypothesis. Options are:
                'two-sided' (default), 'less', or 'greater'.
            alpha (float): Significance level for confidence intervals (default: 0.05).

        Examples:
            ```python
            import pandas as pd
            import numpy as np
            from aboba.tests.absolute_ttest import AbsoluteIndependentTTest

            # Create sample data
            np.random.seed(42)
            group_a = pd.DataFrame({'target': np.random.normal(10, 2, 100)})
            group_b = pd.DataFrame({'target': np.random.normal(12, 2, 100)})

            # Perform the test
            test = AbsoluteIndependentTTest(value_column='target')
            result = test.test([group_a, group_b], {})
            print(f"P-value: {result.pvalue:.4f}")
            print(f"Effect: {result.effect:.4f}")
            ```
        """

        super().__init__()
        self.value_column = value_column
        self.equal_var = equal_var
        self.random_state = random_state
        self.alternative = alternative
        self.alpha = alpha

    def test(self, groups: List[pd.DataFrame], artefacts: Dict = {}) -> TestResult:
        """
        Perform the independent t-test on the provided groups.

        Args:
            groups (List[pd.DataFrame]): Exactly two DataFrames; the effect is
                computed as mean(groups[1]) - mean(groups[0]).
            artefacts (Dict): Unused; kept for interface compatibility.

        Returns:
            TestResult: p-value, absolute effect and its confidence interval.
        """
        assert len(groups) == 2, "Expected exactly two groups"

        a_group, b_group = groups
        a = np.asarray(a_group[self.value_column], dtype=float)
        b = np.asarray(b_group[self.value_column], dtype=float)
        n1 = a.size
        n2 = b.size
        mean1 = a.mean()
        mean2 = b.mean()
        var1 = a.var(ddof=1)
        var2 = b.var(ddof=1)

        if self.equal_var:
            # Classic pooled-variance Student t-test.
            df = n1 + n2 - 2
            sp2 = ((n1 - 1) * var1 + (n2 - 1) * var2) / df
            se = np.sqrt(sp2 * (1.0 / n1 + 1.0 / n2))
        else:
            # Welch's t-test with Welch–Satterthwaite degrees of freedom.
            se1 = var1 / n1
            se2 = var2 / n2
            se = np.sqrt(se1 + se2)
            num = (se1 + se2) ** 2
            den = (se1 ** 2) / (n1 - 1) + (se2 ** 2) / (n2 - 1)
            df = num / den

        effect = mean2 - mean1

        if se == 0.0:
            # Both samples are constant. The previous implementation reported
            # t=0 / p=1 here even when the two constants differed; a non-zero
            # difference with zero variance is a deterministic effect, so use
            # an infinite statistic instead — consistent with the
            # zero-variance handling in AbsoluteRelatedTTest.
            if effect == 0.0:
                t_stat = 0.0
                pvalue = 1.0
            else:
                t_stat = float("inf") if effect > 0 else float("-inf")
                pvalue = compute_pvalue(t_stat, df, self.alternative)
        else:
            t_stat = effect / se
            pvalue = compute_pvalue(t_stat, df, self.alternative)

        # Two-sided (1 - alpha) confidence interval for the mean difference.
        q = sps.t.ppf(1 - self.alpha / 2, df)
        left_bound, right_bound = (effect - q * se, effect + q * se)

        return TestResult(pvalue=pvalue, effect=effect, effect_interval=(left_bound, right_bound))

__init__

__init__(value_column='target', equal_var=True, random_state=None, alternative='two-sided', alpha=0.05)

Independent t-test for absolute difference between two groups.

This test compares the means of two independent groups to determine if there is a statistically significant difference between them in absolute terms.

PARAMETER DESCRIPTION
value_column

Name of the column containing the values to test.

TYPE: str DEFAULT: 'target'

equal_var

If True, perform a standard independent 2 sample test that assumes equal population variances. If False, perform Welch's t-test, which does not assume equal population variances.

TYPE: bool DEFAULT: True

random_state

Seed for the random number generator.

TYPE: int or None DEFAULT: None

alternative

Defines the alternative hypothesis. Options are: 'two-sided' (default), 'less', or 'greater'.

TYPE: str DEFAULT: 'two-sided'

alpha

Significance level for confidence intervals.

TYPE: float DEFAULT: 0.05

Examples:

import pandas as pd
import numpy as np
from aboba.tests.absolute_ttest import AbsoluteIndependentTTest

# Create sample data
np.random.seed(42)
group_a = pd.DataFrame({'target': np.random.normal(10, 2, 100)})
group_b = pd.DataFrame({'target': np.random.normal(12, 2, 100)})

# Perform the test
test = AbsoluteIndependentTTest(value_column='target')
result = test.test([group_a, group_b], {})
print(f"P-value: {result.pvalue:.4f}")
print(f"Effect: {result.effect:.4f}")
Source code in aboba/tests/absolute_ttest.py
def __init__(
    self,
    value_column="target",
    equal_var=True,
    random_state=None,
    alternative="two-sided",
    alpha=0.05
):
    """
    Independent t-test for absolute difference between two groups.

    This test compares the means of two independent groups to determine if there
    is a statistically significant difference between them in absolute terms.

    Args:
        value_column (str): Name of the column containing the values to test.
        equal_var (bool): If True, perform a standard independent 2 sample test
            that assumes equal population variances. If False, perform Welch's
            t-test, which does not assume equal population variances.
        random_state (int or None): Seed for the random number generator.
        alternative (str): Defines the alternative hypothesis. Options are:
            'two-sided' (default), 'less', or 'greater'.
        alpha (float): Significance level for confidence intervals (default: 0.05).

    Examples:
        ```python
        import pandas as pd
        import numpy as np
        from aboba.tests.absolute_ttest import AbsoluteIndependentTTest

        # Create sample data
        np.random.seed(42)
        group_a = pd.DataFrame({'target': np.random.normal(10, 2, 100)})
        group_b = pd.DataFrame({'target': np.random.normal(12, 2, 100)})

        # Perform the test
        test = AbsoluteIndependentTTest(value_column='target')
        result = test.test([group_a, group_b], {})
        print(f"P-value: {result.pvalue:.4f}")
        print(f"Effect: {result.effect:.4f}")
        ```
    """

    super().__init__()
    self.value_column = value_column
    self.equal_var = equal_var
    self.random_state = random_state
    self.alternative = alternative
    self.alpha = alpha

AbsoluteRelatedTTest

Bases: BaseTest

Performs a paired (related) two-sample t-test on absolute data.

ATTRIBUTE DESCRIPTION
value_column

Name of the column containing the values to test.

TYPE: str

alternative

Defines the alternative hypothesis. The following options are available (default is 'two-sided'): - 'two-sided': the means of the distributions underlying the samples are unequal. - 'greater': the mean of the distribution underlying the first sample is greater. - 'less': the mean of the distribution underlying the first sample is smaller.

TYPE: str

alpha

Significance level for confidence intervals

TYPE: float DEFAULT: 0.05

Source code in aboba/tests/absolute_ttest.py
class AbsoluteRelatedTTest(BaseTest):
    """
    Performs a paired (related) two-sample t-test on absolute data.

    Attributes:
        value_column (str): Name of the column containing the values to test.
        alternative (str): Defines the alternative hypothesis. The following options are
            available (default is 'two-sided'):
            - 'two-sided': the means of the distributions underlying the samples are unequal.
            - 'greater': the mean of the distribution underlying the first sample is greater.
            - 'less': the mean of the distribution underlying the first sample is smaller.
        alpha (float): Significance level for confidence intervals (default: 0.05).
    """

    def __init__(
        self,
        value_column="target",
        alternative="two-sided",
        alpha = 0.05
    ):
        """
        Related (paired) t-test for absolute difference between two groups.

        This test compares the means of two related groups to determine if there
        is a statistically significant difference between them in absolute terms.
        It is typically used when the same subjects are measured twice (before/after).

        Args:
            value_column (str): Name of the column containing the values to test.
            alternative (str): Defines the alternative hypothesis. Options are:
                'two-sided' (default), 'less', or 'greater'.
            alpha (float): Significance level for confidence intervals (default: 0.05).

        Examples:
            ```python
            import pandas as pd
            import numpy as np
            from aboba.tests.absolute_ttest import AbsoluteRelatedTTest

            # Create sample paired data
            np.random.seed(42)
            before = np.random.normal(10, 2, 50)
            after = before + np.random.normal(0.5, 1, 50)  # Adding effect
            group_a = pd.DataFrame({'target': before})
            group_b = pd.DataFrame({'target': after})

            # Perform the test
            test = AbsoluteRelatedTTest(value_column='target')
            result = test.test([group_a, group_b], {})
            print(f"P-value: {result.pvalue:.4f}")
            print(f"Effect: {result.effect:.4f}")
            ```
        """

        super().__init__()
        self.value_column = value_column
        self.alternative = alternative
        self.alpha = alpha

    def test(self, groups: List[pd.DataFrame], artefacts: Dict = {}) -> TestResult:
        """
        Perform the paired t-test on the provided groups.

        Args:
            groups (List[pd.DataFrame]): Exactly two DataFrames of equal length;
                observations are paired by position. The effect is
                mean(groups[1] - groups[0]) over pairs with finite values.
            artefacts (Dict): Unused; kept for interface compatibility.

        Returns:
            TestResult: p-value, absolute effect and its confidence interval.
        """
        assert len(groups) == 2, "Expected exactly two groups"

        a_group, b_group = groups

        a = np.asarray(a_group[self.value_column], dtype=float)
        b = np.asarray(b_group[self.value_column], dtype=float)

        # Guard against silent NumPy broadcasting: without this check, a
        # length-1 group paired with a longer one would broadcast instead of
        # failing, producing meaningless "pairwise" differences.
        assert a.size == b.size, "Paired test requires groups of equal length"

        # Keep only pairs where both measurements are finite.
        mask = np.isfinite(a) & np.isfinite(b)
        d = b[mask] - a[mask]
        n = d.size
        mean_diff = d.mean()
        std_diff = d.std(ddof=1)
        df = n - 1

        if std_diff == 0.0:
            # All differences are identical: the paired effect is deterministic.
            se = 0.0
            if mean_diff == 0.0:
                t_stat = 0.0
                pvalue = 1.0
            else:
                t_stat = float("inf") if mean_diff > 0 else float("-inf")
                pvalue = compute_pvalue(t_stat, df, self.alternative)
        else:
            se = std_diff / np.sqrt(n)
            t_stat = mean_diff / se
            pvalue = compute_pvalue(t_stat, df, self.alternative)

        effect = mean_diff

        # Two-sided (1 - alpha) confidence interval for the mean difference.
        q = sps.t.ppf(1 - self.alpha / 2, df)
        left_bound, right_bound = effect - q * se, effect + q * se

        return TestResult(pvalue=pvalue, effect=effect, effect_type="absolute", effect_interval=(left_bound, right_bound))

__init__

__init__(value_column='target', alternative='two-sided', alpha=0.05)

Related (paired) t-test for absolute difference between two groups.

This test compares the means of two related groups to determine if there is a statistically significant difference between them in absolute terms. It is typically used when the same subjects are measured twice (before/after).

PARAMETER DESCRIPTION
value_column

Name of the column containing the values to test.

TYPE: str DEFAULT: 'target'

alternative

Defines the alternative hypothesis. Options are: 'two-sided' (default), 'less', or 'greater'.

TYPE: str DEFAULT: 'two-sided'

alpha

Significance level for confidence intervals.

TYPE: float DEFAULT: 0.05

Examples:

import pandas as pd
import numpy as np
from aboba.tests.absolute_ttest import AbsoluteRelatedTTest

# Create sample paired data
np.random.seed(42)
before = np.random.normal(10, 2, 50)
after = before + np.random.normal(0.5, 1, 50)  # Adding effect
group_a = pd.DataFrame({'target': before})
group_b = pd.DataFrame({'target': after})

# Perform the test
test = AbsoluteRelatedTTest(value_column='target')
result = test.test([group_a, group_b], {})
print(f"P-value: {result.pvalue:.4f}")
print(f"Effect: {result.effect:.4f}")
Source code in aboba/tests/absolute_ttest.py
def __init__(
    self,
    value_column="target",
    alternative="two-sided",
    alpha = 0.05
):
    """
    Related (paired) t-test for absolute difference between two groups.

    This test compares the means of two related groups to determine if there
    is a statistically significant difference between them in absolute terms.
    It is typically used when the same subjects are measured twice (before/after).

    Args:
        value_column (str): Name of the column containing the values to test.
        alternative (str): Defines the alternative hypothesis. Options are:
            'two-sided' (default), 'less', or 'greater'.
        alpha (float): Significance level for confidence intervals (default: 0.05).

    Examples:
        ```python
        import pandas as pd
        import numpy as np
        from aboba.tests.absolute_ttest import AbsoluteRelatedTTest

        # Create sample paired data
        np.random.seed(42)
        before = np.random.normal(10, 2, 50)
        after = before + np.random.normal(0.5, 1, 50)  # Adding effect
        group_a = pd.DataFrame({'target': before})
        group_b = pd.DataFrame({'target': after})

        # Perform the test
        test = AbsoluteRelatedTTest(value_column='target')
        result = test.test([group_a, group_b], {})
        print(f"P-value: {result.pvalue:.4f}")
        print(f"Effect: {result.effect:.4f}")
        ```
    """

    super().__init__()
    self.value_column = value_column
    self.alternative = alternative
    self.alpha = alpha

RelativeIndependentTTest

Bases: BaseTest

Performs an independent t-test using a ratio-based measure for effect size relative to the control group.

Compatible with CUPED preprocessing: when used with CupedProcessor, automatically uses the original (pre-CUPED) control mean for denominator calculation, ensuring correct interpretation of relative effects while benefiting from variance reduction.

ATTRIBUTE DESCRIPTION
value_column

Name of the column containing the values to test.

TYPE: str

alternative

Defines the alternative hypothesis. Must be one of {'two-sided', 'less', 'greater'}.

TYPE: str

alpha

Significance level for confidence intervals (default: 0.05).

TYPE: float

Source code in aboba/tests/relative_ttest.py
class RelativeIndependentTTest(BaseTest):
    """
    Performs an independent t-test using a ratio-based measure for effect size relative
    to the control group.

    Compatible with CUPED preprocessing: when used with CupedProcessor, automatically
    uses the original (pre-CUPED) control mean for denominator calculation, ensuring
    correct interpretation of relative effects while benefiting from variance reduction.

    Attributes:
        value_column (str): Name of the column containing the values to test.
        alternative (str): Defines the alternative hypothesis. Must be one of
            {'two-sided', 'less', 'greater'}.
        alpha (float): Significance level for confidence intervals (default: 0.05).
    """

    def __init__(
        self,
        value_column="target",
        alternative="two-sided",
        alpha=0.05,
    ):
        """
        Independent t-test for relative difference between two groups.

        This test compares the means of two independent groups to determine if there
        is a statistically significant relative difference between them. The relative
        difference is calculated as (test_mean - control_mean) / control_mean.

        When used after CUPED preprocessing, automatically uses the original control
        mean (before CUPED transformation) for correct relative effect calculation.
        This provides the best of both worlds: variance reduction from CUPED and
        correct relative effect interpretation.

        Args:
            value_column (str): Name of the column containing the values to test.
            alternative (str): Defines the alternative hypothesis. Options are:
                'two-sided' (default), 'less', or 'greater'.
            alpha (float): Significance level for confidence intervals (default: 0.05).

        Examples:
            ```python
            import pandas as pd
            import numpy as np
            from aboba.tests.relative_ttest import RelativeIndependentTTest

            # Example 1: Basic usage without CUPED
            np.random.seed(42)
            control = pd.DataFrame({'target': np.random.normal(100, 10, 100)})
            test = pd.DataFrame({'target': np.random.normal(105, 10, 100)})  # 5% increase

            test_instance = RelativeIndependentTTest(value_column='target')
            result = test_instance.test([control, test])
            print(f"P-value: {result.pvalue:.4f}")
            print(f"Relative Effect: {result.effect:.4f} ({result.effect*100:.2f}%)")

            # Example 2: Usage with CUPED (artifacts passed automatically by pipeline)
            # The test will automatically detect and use original control mean from artifacts
            ```
        """
        super().__init__()
        self.value_column = value_column
        self.alternative = alternative
        self.alpha = alpha
        # Validation intentionally keeps construction failing fast on typos.
        assert alternative in {"two-sided", "less", "greater"}

    def test(self, groups: List[pd.DataFrame], artefacts: Optional[Dict] = None) -> TestResult:
        """
        Perform the relative independent t-test on the provided groups.

        Args:
            groups (List[pd.DataFrame]): List of two DataFrames representing the groups to compare.
                The first group is treated as the control group, the second as the test group.
            artefacts (Optional[Dict]): Artifacts from preprocessing pipeline. 
                If 'cuped_original_control_mean' is present, uses it as denominator for relative effect.
                This enables correct relative effect calculation when using CUPED.

        Returns:
            TestResult: Object containing the p-value and relative effect size.
                - pvalue: Statistical significance
                - effect: Relative effect (test - control) / control_mean
                - effect_type: "relative_control"
                - effect_interval: Confidence interval for the effect
        """

        control_group, test_group = groups

        # Naming convention: X / suffix 1 refer to the test group,
        # Y / suffix 2 to the control group.
        Y = control_group[self.value_column].to_numpy(float)
        X = test_group[self.value_column].to_numpy(float)

        var_1, var_2 = np.var(X, ddof=1), np.var(Y, ddof=1)
        a_1, a_2 = np.mean(X), np.mean(Y)

        # Check if we have CUPED artifacts with original control mean
        if artefacts is not None and 'cuped_original_control_mean' in artefacts:
            original_control_mean = artefacts['cuped_original_control_mean']
            R = (a_1 - a_2) / original_control_mean
            # var_R = var_1 / (a_2**2) + (a_1**2) / (a_2**4) * var_2
            var_R = (var_1 + var_2) / (original_control_mean ** 2)
        else:
            # Delta-method variance of the ratio R = (a_1 - a_2) / a_2.
            R = (a_1 - a_2) / a_2
            var_R = var_1 / (a_2**2) + (a_1**2) / (a_2**4) * var_2

        # NOTE(review): n is taken from the test group only; the normal
        # approximation below implicitly assumes comparable group sizes —
        # confirm behaviour for strongly unbalanced designs.
        n = len(test_group)
        stat = np.sqrt(n) * R / np.sqrt(var_R)

        # Normal (z) approximation rather than the t distribution.
        if self.alternative == "two-sided":
            pvalue = 2 * min(sps.norm.cdf(stat), sps.norm.sf(stat))
            pvalue = min(pvalue, 1.0)
        elif self.alternative == "less":
            pvalue = sps.norm.cdf(stat)
        elif self.alternative == "greater":
            pvalue = sps.norm.sf(stat)
        else:
            # Defensive: unreachable because __init__ validates `alternative`.
            raise ValueError(f"Unknown alternative: {self.alternative}")

        # Two-sided (1 - alpha) confidence interval, regardless of `alternative`.
        q = sps.norm.ppf(1 - self.alpha/2)
        left_bound = R - q * np.sqrt(var_R / n)
        right_bound = R + q * np.sqrt(var_R / n)

        return TestResult(
            pvalue=pvalue, 
            effect=R, 
            effect_type="relative_control", 
            effect_interval=(left_bound, right_bound)
        )

__init__

__init__(value_column='target', alternative='two-sided', alpha=0.05)

Independent t-test for relative difference between two groups.

This test compares the means of two independent groups to determine if there is a statistically significant relative difference between them. The relative difference is calculated as (test_mean - control_mean) / control_mean.

When used after CUPED preprocessing, automatically uses the original control mean (before CUPED transformation) for correct relative effect calculation. This provides the best of both worlds: variance reduction from CUPED and correct relative effect interpretation.

PARAMETER DESCRIPTION
value_column

Name of the column containing the values to test.

TYPE: str DEFAULT: 'target'

alternative

Defines the alternative hypothesis. Options are: 'two-sided' (default), 'less', or 'greater'.

TYPE: str DEFAULT: 'two-sided'

alpha

Significance level for confidence intervals (default: 0.05).

TYPE: float DEFAULT: 0.05

Examples:

import pandas as pd
import numpy as np
from aboba.tests.relative_ttest import RelativeIndependentTTest

# Example 1: Basic usage without CUPED
np.random.seed(42)
control = pd.DataFrame({'target': np.random.normal(100, 10, 100)})
test = pd.DataFrame({'target': np.random.normal(105, 10, 100)})  # 5% increase

test_instance = RelativeIndependentTTest(value_column='target')
result = test_instance.test([control, test])
print(f"P-value: {result.pvalue:.4f}")
print(f"Relative Effect: {result.effect:.4f} ({result.effect*100:.2f}%)")

# Example 2: Usage with CUPED (artifacts passed automatically by pipeline)
# The test will automatically detect and use original control mean from artifacts
Source code in aboba/tests/relative_ttest.py
def __init__(
    self,
    value_column="target",
    alternative="two-sided",
    alpha=0.05,
):
    """
    Independent t-test for relative difference between two groups.

    This test compares the means of two independent groups to determine if there
    is a statistically significant relative difference between them. The relative
    difference is calculated as (test_mean - control_mean) / control_mean.

    When used after CUPED preprocessing, automatically uses the original control
    mean (before CUPED transformation) for correct relative effect calculation.
    This provides the best of both worlds: variance reduction from CUPED and
    correct relative effect interpretation.

    Args:
        value_column (str): Name of the column containing the values to test.
        alternative (str): Defines the alternative hypothesis. Options are:
            'two-sided' (default), 'less', or 'greater'.
        alpha (float): Significance level for confidence intervals (default: 0.05).

    Examples:
        ```python
        import pandas as pd
        import numpy as np
        from aboba.tests.relative_ttest import RelativeIndependentTTest

        # Example 1: Basic usage without CUPED
        np.random.seed(42)
        control = pd.DataFrame({'target': np.random.normal(100, 10, 100)})
        test = pd.DataFrame({'target': np.random.normal(105, 10, 100)})  # 5% increase

        test_instance = RelativeIndependentTTest(value_column='target')
        result = test_instance.test([control, test])
        print(f"P-value: {result.pvalue:.4f}")
        print(f"Relative Effect: {result.effect:.4f} ({result.effect*100:.2f}%)")

        # Example 2: Usage with CUPED (artifacts passed automatically by pipeline)
        # The test will automatically detect and use original control mean from artifacts
        ```
    """
    super().__init__()
    self.value_column = value_column
    self.alternative = alternative
    self.alpha = alpha
    # NOTE(review): validation runs after the attributes are assigned; on an
    # invalid `alternative` the AssertionError still propagates, so the
    # half-initialised instance never reaches the caller.
    assert alternative in {"two-sided", "less", "greater"}

test

test(groups: List[DataFrame], artefacts: Optional[Dict] = None) -> TestResult

Perform the relative independent t-test on the provided groups.

PARAMETER DESCRIPTION
groups

List of two DataFrames representing the groups to compare. The first group is treated as the control group, the second as the test group.

TYPE: List[DataFrame]

artefacts

Artifacts from preprocessing pipeline. If 'cuped_original_control_mean' is present, uses it as denominator for relative effect. This enables correct relative effect calculation when using CUPED.

TYPE: Optional[Dict] DEFAULT: None

RETURNS DESCRIPTION
TestResult

Object containing the p-value and relative effect size. - pvalue: Statistical significance - effect: Relative effect (test - control) / control_mean - effect_type: "relative_control" - effect_interval: Confidence interval for the effect

TYPE: TestResult

Source code in aboba/tests/relative_ttest.py
def test(self, groups: List[pd.DataFrame], artefacts: Optional[Dict] = None) -> TestResult:
    """
    Perform the relative independent t-test on the provided groups.

    Args:
        groups (List[pd.DataFrame]): List of two DataFrames representing the groups to compare.
            The first group is treated as the control group, the second as the test group.
        artefacts (Optional[Dict]): Artifacts from preprocessing pipeline. 
            If 'cuped_original_control_mean' is present, uses it as denominator for relative effect.
            This enables correct relative effect calculation when using CUPED.

    Returns:
        TestResult: Object containing the p-value and relative effect size.
            - pvalue: Statistical significance
            - effect: Relative effect (test - control) / control_mean
            - effect_type: "relative_control"
            - effect_interval: Confidence interval for the effect
    """

    control_group, test_group = groups

    # Naming convention: X / suffix 1 refer to the test group,
    # Y / suffix 2 to the control group.
    Y = control_group[self.value_column].to_numpy(float)
    X = test_group[self.value_column].to_numpy(float)

    var_1, var_2 = np.var(X, ddof=1), np.var(Y, ddof=1)
    a_1, a_2 = np.mean(X), np.mean(Y)

    # Check if we have CUPED artifacts with original control mean
    if artefacts is not None and 'cuped_original_control_mean' in artefacts:
        original_control_mean = artefacts['cuped_original_control_mean']
        R = (a_1 - a_2) / original_control_mean
        # var_R = var_1 / (a_2**2) + (a_1**2) / (a_2**4) * var_2
        var_R = (var_1 + var_2) / (original_control_mean ** 2)
    else:
        # Delta-method variance of the ratio R = (a_1 - a_2) / a_2.
        R = (a_1 - a_2) / a_2
        var_R = var_1 / (a_2**2) + (a_1**2) / (a_2**4) * var_2

    # NOTE(review): n is taken from the test group only; the normal
    # approximation below implicitly assumes comparable group sizes —
    # confirm behaviour for strongly unbalanced designs.
    n = len(test_group)
    stat = np.sqrt(n) * R / np.sqrt(var_R)

    # Normal (z) approximation rather than the t distribution.
    if self.alternative == "two-sided":
        pvalue = 2 * min(sps.norm.cdf(stat), sps.norm.sf(stat))
        pvalue = min(pvalue, 1.0)
    elif self.alternative == "less":
        pvalue = sps.norm.cdf(stat)
    elif self.alternative == "greater":
        pvalue = sps.norm.sf(stat)
    else:
        # Defensive: unreachable because __init__ validates `alternative`.
        raise ValueError(f"Unknown alternative: {self.alternative}")

    # Two-sided (1 - alpha) confidence interval, regardless of `alternative`.
    q = sps.norm.ppf(1 - self.alpha/2)
    left_bound = R - q * np.sqrt(var_R / n)
    right_bound = R + q * np.sqrt(var_R / n)

    return TestResult(
        pvalue=pvalue, 
        effect=R, 
        effect_type="relative_control", 
        effect_interval=(left_bound, right_bound)
    )

StratifiedTTest

Bases: BaseTest

Source code in aboba/tests/stratified_ttest.py
class StratifiedTTest(BaseTest):
    """
    Stratified two-sample test with caller-supplied global strata weights.

    Group means and variances are combined per stratum according to `method`
    ('random', 'stratified' or 'post_stratified') and the difference of the
    two weighted means is compared using a normal (z) approximation.
    """

    def __init__(
        self,
        group_column: str,
        group_size: int,
        method: str,
        strata_columns: List[str],
        strata_weights: Union[pd.Series, dict],
        col_name: str = "target",
        alpha: float = 0.05,
    ):
        """
        Performs a stratified t-test on the data.

        This test performs a t-test while accounting for stratification in the data.
        Strata weights must be provided by the caller and represent global (population)
        proportions for each stratum — they are not inferred from the sample.

        Args:
            group_column (str): Name of the column containing group identifiers.
            group_size (int): Size of groups to sample (used externally by splitters).
            method (str): Weighting method. One of 'random', 'stratified', 'post_stratified'.
            strata_columns (List[str]): List of columns to stratify by.
            strata_weights (Union[pd.Series, dict]): Global (population) weights for each
                stratum. Keys/index must match the unique values of the strata columns.
                Will be normalised to sum to 1.
            col_name (str): Name of the column to test.
            alpha (float): Significance level for confidence interval.

        Examples:
            ```python
            import pandas as pd
            import numpy as np
            from aboba.tests.stratified_ttest import StratifiedTTest

            np.random.seed(42)
            data = pd.DataFrame({
                'group': np.repeat(['A', 'B'], 100),
                'strata': np.tile(['X', 'Y'], 100),
                'target': np.concatenate([
                    np.random.normal(10, 2, 100),
                    np.random.normal(12, 2, 100)
                ])
            })

            strata_weights = pd.Series({'X': 0.4, 'Y': 0.6})

            test = StratifiedTTest(
                group_column='group',
                group_size=50,
                method='stratified',
                strata_columns=['strata'],
                strata_weights=strata_weights,
                col_name='target',
            )

            groups = [data[data['group'] == 'A'], data[data['group'] == 'B']]
            result = test.test(groups, {})
            print(f"P-value: {result.pvalue:.4f}")
            print(f"Effect: {result.effect:.4f}")
            ```
        """
        assert method in [
            "random",
            "stratified",
            "post_stratified",
        ], f"Invalid {method = }. Must be one of 'random', 'stratified', 'post_stratified'"
        assert len(strata_columns) > 0, "Must have at least one strata column"

        super().__init__()
        self.col_name = col_name
        self.method = method
        self.strata_columns = strata_columns
        self.alpha = alpha
        self.group_column = group_column
        self.group_size = group_size

        # Normalise the supplied population weights so they sum to 1.
        weights = pd.Series(strata_weights) if isinstance(strata_weights, dict) else strata_weights
        self.strata_weights = weights / weights.sum()

    def test(self, groups: List[pd.DataFrame], artefacts: Dict = {}) -> TestResult:
        """
        Perform the stratified t-test on the provided groups.

        Args:
            groups (List[pd.DataFrame]): List of two DataFrames representing the groups.
            artefacts: Unused; kept for interface compatibility.

        Returns:
            TestResult: Object containing the p-value and effect size.
        """
        assert len(groups) == 2

        # Dispatch the mean/variance estimators by weighting method.
        if self.method == "random":
            mean_function = self._simple_mean
            var_function = self._simple_var
        elif self.method == "stratified":
            mean_function = self._weighted_mean
            var_function = self._weighted_var
        elif self.method == "post_stratified":
            mean_function = self._weighted_mean
            var_function = self._weighted_post_var
        else:
            # Defensive: unreachable because __init__ validates `method`.
            raise RuntimeError(f"method not supported {self.method = }")

        x_mean = mean_function(groups[0])
        y_mean = mean_function(groups[1])
        x_var = var_function(groups[0])
        y_var = var_function(groups[1])

        # NOTE(review): effect is groups[0] - groups[1] — the opposite sign
        # convention to the absolute t-tests in this package (which report
        # mean(groups[1]) - mean(groups[0])); confirm downstream expectations.
        effect = x_mean - y_mean
        std = np.sqrt(x_var + y_var)
        t_stat = effect / std
        # Normal (z) quantiles are used despite the class name.
        pvalue = 2 * sps.norm.sf(np.abs(t_stat))
        q = sps.norm.ppf(1 - self.alpha / 2)
        left_bound, right_bound = effect - q * std, effect + q * std

        return TestResult(pvalue, effect, effect_interval=(left_bound, right_bound))

    def _weighted_mean(self, data):
        # Per-stratum means combined with the global weights. pandas aligns
        # on the strata index; strata present in the weights but absent from
        # `data` become NaN and are silently skipped by .sum() — TODO confirm
        # that dropping such strata is intended.
        strata_means = data.groupby(by=self.strata_columns)[self.col_name].mean()
        return (strata_means * self.strata_weights).sum()

    def _weighted_var(self, data):
        # Variance of the stratified mean estimate. NOTE(review): the form
        # sum(w_h * s_h^2) / n equals sum(w_h^2 * s_h^2 / n_h) only under
        # proportional allocation (n_h = w_h * n) — confirm intended design.
        strata_vars = data.groupby(by=self.strata_columns)[self.col_name].var()
        return (strata_vars * self.strata_weights).sum() / len(data)

    def _weighted_post_var(self, data):
        # Post-stratification variance: proportional-allocation term plus a
        # second-order O(1/n^2) correction for random stratum sizes.
        strata_vars = data.groupby(by=self.strata_columns)[self.col_name].var()
        weighted_var = (strata_vars * self.strata_weights).sum() / len(data)
        post_addition = (strata_vars * (1 - self.strata_weights)).sum() / (
            len(data) ** 2
        )
        return weighted_var + post_addition

    def _simple_mean(self, data):
        # Unweighted sample mean (method='random').
        return data[self.col_name].mean()

    def _simple_var(self, data):
        # Variance of the unweighted sample mean (method='random').
        return data[self.col_name].var() / len(data)
__init__

__init__(group_column: str, group_size: int, method: str, strata_columns: List[str], strata_weights: Union[Series, dict], col_name: str = 'target', alpha: float = 0.05)

Performs a stratified t-test on the data.

This test performs a t-test while accounting for stratification in the data. Strata weights must be provided by the caller and represent global (population) proportions for each stratum — they are not inferred from the sample.

PARAMETER DESCRIPTION
group_column

Name of the column containing group identifiers.

TYPE: str

group_size

Size of groups to sample (used externally by splitters).

TYPE: int

method

Weighting method. One of 'random', 'stratified', 'post_stratified'.

TYPE: str

strata_columns

List of columns to stratify by.

TYPE: List[str]

strata_weights

Global (population) weights for each stratum. Keys/index must match the unique values of the strata columns. Will be normalised to sum to 1.

TYPE: Union[Series, dict]

col_name

Name of the column to test.

TYPE: str DEFAULT: 'target'

alpha

Significance level for confidence interval.

TYPE: float DEFAULT: 0.05

Examples:

import pandas as pd
import numpy as np
from aboba.tests.stratified_ttest import StratifiedTTest

np.random.seed(42)
data = pd.DataFrame({
    'group': np.repeat(['A', 'B'], 100),
    'strata': np.tile(['X', 'Y'], 100),
    'target': np.concatenate([
        np.random.normal(10, 2, 100),
        np.random.normal(12, 2, 100)
    ])
})

strata_weights = pd.Series({'X': 0.4, 'Y': 0.6})

test = StratifiedTTest(
    group_column='group',
    group_size=50,
    method='stratified',
    strata_columns=['strata'],
    strata_weights=strata_weights,
    col_name='target',
)

groups = [data[data['group'] == 'A'], data[data['group'] == 'B']]
result = test.test(groups, {})
print(f"P-value: {result.pvalue:.4f}")
print(f"Effect: {result.effect:.4f}")
Source code in aboba/tests/stratified_ttest.py
def __init__(
    self,
    group_column: str,
    group_size: int,
    method: str,
    strata_columns: List[str],
    strata_weights: Union[pd.Series, dict],
    col_name: str = "target",
    alpha: float = 0.05,
):
    """
    Configure a stratified t-test.

    The test accounts for stratification in the data. Strata weights must be
    supplied by the caller and represent global (population) proportions for
    each stratum — they are not inferred from the sample.

    Args:
        group_column (str): Column holding group identifiers.
        group_size (int): Group size to sample (used externally by splitters).
        method (str): Weighting method; one of 'random', 'stratified',
            'post_stratified'.
        strata_columns (List[str]): Columns that define the strata.
        strata_weights (Union[pd.Series, dict]): Global (population) weight per
            stratum. Keys/index must match the unique values of the strata
            columns; normalised here to sum to 1.
        col_name (str): Column to test.
        alpha (float): Significance level for the confidence interval.
    """
    assert method in [
        "random",
        "stratified",
        "post_stratified",
    ], f"Invalid {method = }. Must be one of 'random', 'stratified', 'post_stratified'"
    assert len(strata_columns) > 0, "Must have at least one strata column"

    super().__init__()
    # Plain configuration storage; assignment order is immaterial.
    self.group_column = group_column
    self.group_size = group_size
    self.method = method
    self.strata_columns = strata_columns
    self.col_name = col_name
    self.alpha = alpha

    # Accept either a dict or a Series, then normalise so the weights sum to 1.
    if isinstance(strata_weights, dict):
        raw_weights = pd.Series(strata_weights)
    else:
        raw_weights = strata_weights
    self.strata_weights = raw_weights / raw_weights.sum()

test

test(groups: List[DataFrame], artefacts: Dict = {}) -> TestResult

Perform the stratified t-test on the provided groups.

PARAMETER DESCRIPTION
groups

List of two DataFrames representing the groups.

TYPE: List[DataFrame]

artefacts

Unused; kept for interface compatibility.

TYPE: Dict DEFAULT: {}

RETURNS DESCRIPTION
TestResult

Object containing the p-value and effect size.

TYPE: TestResult

Source code in aboba/tests/stratified_ttest.py
def test(self, groups: List[pd.DataFrame], artefacts: Dict = {}) -> TestResult:
    """
    Perform the stratified t-test on the provided groups.

    Args:
        groups (List[pd.DataFrame]): List of two DataFrames representing the groups.
        artefacts: Unused; kept for interface compatibility.

    Returns:
        TestResult: Object containing the p-value and effect size.
    """
    assert len(groups) == 2

    # Map each weighting method to its (mean, variance) estimator pair.
    estimators = {
        "random": (self._simple_mean, self._simple_var),
        "stratified": (self._weighted_mean, self._weighted_var),
        "post_stratified": (self._weighted_mean, self._weighted_post_var),
    }
    if self.method not in estimators:
        raise RuntimeError(f"method not supported {self.method = }")
    mean_function, var_function = estimators[self.method]

    first, second = groups

    effect = mean_function(first) - mean_function(second)
    std = np.sqrt(var_function(first) + var_function(second))

    # Two-sided p-value from the normal approximation of the t statistic.
    z_stat = effect / std
    pvalue = 2 * sps.norm.sf(np.abs(z_stat))

    # Symmetric normal-approximation confidence interval at level 1 - alpha.
    half_width = sps.norm.ppf(1 - self.alpha / 2) * std
    left_bound = effect - half_width
    right_bound = effect + half_width

    return TestResult(pvalue, effect, effect_interval=(left_bound, right_bound))

CupedLinearRegressionTTest

Bases: BaseTest

Source code in aboba/tests/cuped_lreg.py
class CupedLinearRegressionTTest(BaseTest):
    """CUPED via linear regression: covariate adjustment with HC3-robust OLS/WLS.

    See ``__init__`` for the full parameter description and a usage example.
    """

    def __init__(
        self,
        covariate_names: Optional[List[str]] = None,
        group_column: str = "group",
        value_column: str = "target",
        alpha: float = 0.05,
        center_on_control: bool = True,
        weight_column: Optional[str] = None,
        include_extra: bool = False,
        strata_column: Optional[str] = None,
        strata_weights: Optional[Union[Dict, pd.Series]] = None,
    ) -> None:
        """
        CUPED (Controlled-experiment Using Pre-Experiment Data) via linear regression.

        This test uses linear regression to adjust for pre-experiment covariates, reducing
        variance and increasing statistical power. The method centers covariates on the
        control group mean and estimates the treatment effect using OLS or WLS regression
        with heteroscedasticity-robust standard errors (HC3).

        Args:
            covariate_names (List[str], optional): List of pre-experiment covariates to adjust for.
                These should be variables measured before the experiment that correlate with
                the outcome metric.
            group_column (str): Name of column containing group assignment (A/B). Default "group".
            value_column (str): Name of column containing metric values to test. Default "target".
            alpha (float): Significance level for confidence interval. Default 0.05.
            center_on_control (bool): If True, covariates are centered by their mean in control
                group. This is recommended for variance reduction. Default True.
            weight_column (Optional[str]): Column with observation weights for weighted least
                squares regression. If None, uses ordinary least squares.
            include_extra (bool): If True, includes additional regression artifacts (parameters,
                design matrix, residuals) in TestResult.extra. Default False.
            strata_column (Optional[str]): Column containing stratum identifiers. Used together
                with strata_weights to derive per-observation WLS weights when weight_column
                is not provided.
            strata_weights (Optional[Union[Dict, pd.Series]]): Global (population) weight for
                each stratum. When strata_column is set and weight_column is None, the
                per-observation weight is computed as:
                    w_i = strata_weight[stratum_i] / count(stratum_i in group_i)

        Examples:
            ```python
            import pandas as pd
            import numpy as np
            from aboba.tests.cuped_lreg import CupedLinearRegressionTTest

            # Create sample data with pre-experiment covariate
            np.random.seed(42)
            n = 200
            pre_metric = np.random.normal(100, 15, n)

            # Control group
            group_a = pd.DataFrame({
                'target': pre_metric[:100] + np.random.normal(0, 10, 100),
                'pre_metric': pre_metric[:100],
                'group': 0
            })

            # Treatment group with effect
            group_b = pd.DataFrame({
                'target': pre_metric[100:] + np.random.normal(5, 10, 100),
                'pre_metric': pre_metric[100:],
                'group': 1
            })

            # Perform CUPED test
            test = CupedLinearRegressionTTest(
                covariate_names=['pre_metric'],
                value_column='target',
                group_column='group'
            )
            result = test.test([group_a, group_b], {})
            print(f"P-value: {result.pvalue:.4f}")
            print(f"Effect: {result.effect:.4f}")
            print(f"CI: [{result.effect_interval[0]:.4f}, {result.effect_interval[1]:.4f}]")
            ```
        """
        super().__init__()
        self.value_column = value_column
        self.group_column = group_column
        self.covariate_names = covariate_names or []
        self.alpha = alpha
        self.center_on_control = center_on_control
        self.weight_column = weight_column
        self.include_extra = include_extra
        self.strata_column = strata_column
        # Accept a dict or a Series; store as a Series so .map() works in test().
        self.strata_weights = (
            pd.Series(strata_weights) if isinstance(strata_weights, dict) else strata_weights
        )

    def test(self, groups: List[pd.DataFrame], artefacts: Dict = {}) -> TestResult:
        """Run the CUPED regression on two groups.

        Args:
            groups (List[pd.DataFrame]): Exactly two DataFrames; groups[0] is treated
                as the control group for covariate centering.
            artefacts (Dict): Unused; kept for interface compatibility.

        Returns:
            TestResult: p-value and effect estimate for the group coefficient, a
            normal-approximation confidence interval at level 1 - alpha, and —
            when include_extra is True — regression artifacts in ``extra``.
        """
        assert len(groups) == 2, "CupedLinearRegressionTTest expects exactly two groups"

        a_group, b_group = groups[0].copy(), groups[1].copy()

        # Derive per-observation WLS weights from strata when weight_column is absent:
        # w_i = strata_weight[stratum_i] / count(stratum_i within the observation's group).
        effective_weight_column = self.weight_column
        if (
            self.strata_column is not None
            and self.strata_weights is not None
            and self.weight_column is None
        ):
            for group in (a_group, b_group):
                stratum_counts = group[self.strata_column].value_counts()
                group["_strata_weight"] = (
                    group[self.strata_column].map(self.strata_weights)
                    / group[self.strata_column].map(stratum_counts)
                )
            effective_weight_column = "_strata_weight"

        data = pd.concat([a_group, b_group], ignore_index=True)

        # Build the regressor list: group indicator plus (optionally centered) covariates.
        feature_names: List[str] = [self.group_column]
        for name in self.covariate_names:
            if self.center_on_control:
                # Center each covariate on its control-group mean (weighted when WLS
                # weights exist) so the group coefficient is the adjusted effect.
                if effective_weight_column is None:
                    mean_control = a_group[name].mean()
                else:
                    assert effective_weight_column in a_group.columns, (
                        f"Weight column '{effective_weight_column}' not found in control group"
                    )
                    w = a_group[effective_weight_column].to_numpy(float)
                    x = a_group[name].to_numpy(float)
                    mean_control = float(np.average(x, weights=w))
                cname = f"{name}_c"
                data[cname] = data[name] - mean_control
                feature_names.append(cname)
            else:
                feature_names.append(name)

        formula = f"{self.value_column} ~ " + " + ".join(feature_names)

        # Fit with HC3 heteroscedasticity-robust standard errors.
        if effective_weight_column is None:
            model = ols(formula, data=data).fit(cov_type="HC3")
        else:
            assert effective_weight_column in data.columns, (
                f"Weight column '{effective_weight_column}' not found in data"
            )
            model = wls(
                formula,
                data=data,
                weights=data[effective_weight_column],
            ).fit(cov_type="HC3")

        ef = model.params[self.group_column]
        se = model.bse[self.group_column]
        pvalue = model.pvalues[self.group_column]

        # Normal-approximation confidence interval for the effect.
        q = sps.norm.ppf(1 - self.alpha / 2)
        left_bound, right_bound = ef - q * se, ef + q * se

        extra = None
        if self.include_extra:
            extra = {
                "params": model.params,
                "design_matrix": model.model.exog,
                "resid": model.resid,
            }

        # `extra` is already None when include_extra is False, so pass it directly
        # (the original `extra if self.include_extra else None` was redundant).
        return TestResult(
            pvalue=pvalue,
            effect=ef,
            effect_interval=(left_bound, right_bound),
            extra=extra,
        )

__init__

__init__(covariate_names: Optional[List[str]] = None, group_column: str = 'group', value_column: str = 'target', alpha: float = 0.05, center_on_control: bool = True, weight_column: Optional[str] = None, include_extra: bool = False, strata_column: Optional[str] = None, strata_weights: Optional[Union[Dict, Series]] = None) -> None

CUPED (Controlled-experiment Using Pre-Experiment Data) via linear regression.

This test uses linear regression to adjust for pre-experiment covariates, reducing variance and increasing statistical power. The method centers covariates on the control group mean and estimates the treatment effect using OLS or WLS regression with heteroscedasticity-robust standard errors (HC3).

PARAMETER DESCRIPTION
covariate_names

List of pre-experiment covariates to adjust for. These should be variables measured before the experiment that correlate with the outcome metric.

TYPE: List[str] DEFAULT: None

group_column

Name of column containing group assignment (A/B). Default "group".

TYPE: str DEFAULT: 'group'

value_column

Name of column containing metric values to test. Default "target".

TYPE: str DEFAULT: 'target'

alpha

Significance level for confidence interval. Default 0.05.

TYPE: float DEFAULT: 0.05

center_on_control

If True, covariates are centered by their mean in control group. This is recommended for variance reduction. Default True.

TYPE: bool DEFAULT: True

weight_column

Column with observation weights for weighted least squares regression. If None, uses ordinary least squares.

TYPE: Optional[str] DEFAULT: None

include_extra

If True, includes additional regression artifacts (parameters, design matrix, residuals) in TestResult.extra. Default False.

TYPE: bool DEFAULT: False

strata_column

Column containing stratum identifiers. Used together with strata_weights to derive per-observation WLS weights when weight_column is not provided.

TYPE: Optional[str] DEFAULT: None

strata_weights

Global (population) weight for each stratum. When strata_column is set and weight_column is None, the per-observation weight is computed as: w_i = strata_weight[stratum_i] / count(stratum_i in group_i)

TYPE: Optional[Union[Dict, Series]] DEFAULT: None

Examples:

import pandas as pd
import numpy as np
from aboba.tests.cuped_lreg import CupedLinearRegressionTTest

# Create sample data with pre-experiment covariate
np.random.seed(42)
n = 200
pre_metric = np.random.normal(100, 15, n)

# Control group
group_a = pd.DataFrame({
    'target': pre_metric[:100] + np.random.normal(0, 10, 100),
    'pre_metric': pre_metric[:100],
    'group': 0
})

# Treatment group with effect
group_b = pd.DataFrame({
    'target': pre_metric[100:] + np.random.normal(5, 10, 100),
    'pre_metric': pre_metric[100:],
    'group': 1
})

# Perform CUPED test
test = CupedLinearRegressionTTest(
    covariate_names=['pre_metric'],
    value_column='target',
    group_column='group'
)
result = test.test([group_a, group_b], {})
print(f"P-value: {result.pvalue:.4f}")
print(f"Effect: {result.effect:.4f}")
print(f"CI: [{result.effect_interval[0]:.4f}, {result.effect_interval[1]:.4f}]")
Source code in aboba/tests/cuped_lreg.py
def __init__(
    self,
    covariate_names: Optional[List[str]] = None,
    group_column: str = "group",
    value_column: str = "target",
    alpha: float = 0.05,
    center_on_control: bool = True,
    weight_column: Optional[str] = None,
    include_extra: bool = False,
    strata_column: Optional[str] = None,
    strata_weights: Optional[Union[Dict, pd.Series]] = None,
) -> None:
    """
    CUPED (Controlled-experiment Using Pre-Experiment Data) via linear regression.

    Adjusts for pre-experiment covariates with OLS/WLS regression using
    heteroscedasticity-robust (HC3) standard errors, reducing variance and
    increasing statistical power. Covariates may be centered on their mean in
    the control group.

    Args:
        covariate_names (List[str], optional): Pre-experiment covariates to
            adjust for; variables measured before the experiment that correlate
            with the outcome metric.
        group_column (str): Column with the group assignment (A/B).
        value_column (str): Column with the metric values to test.
        alpha (float): Significance level for the confidence interval.
        center_on_control (bool): If True, center covariates on their
            control-group mean (recommended for variance reduction).
        weight_column (Optional[str]): Observation weights for weighted least
            squares; ordinary least squares when None.
        include_extra (bool): If True, attach regression artifacts (parameters,
            design matrix, residuals) to TestResult.extra.
        strata_column (Optional[str]): Stratum identifier column; combined with
            strata_weights to derive per-observation WLS weights when
            weight_column is not provided.
        strata_weights (Optional[Union[Dict, pd.Series]]): Global (population)
            weight per stratum. With strata_column set and weight_column None,
            the per-observation weight is
            strata_weight[stratum_i] / count(stratum_i in group_i).
    """
    super().__init__()
    # Plain configuration storage; assignment order is immaterial.
    self.group_column = group_column
    self.value_column = value_column
    self.alpha = alpha
    self.center_on_control = center_on_control
    self.weight_column = weight_column
    self.include_extra = include_extra
    self.strata_column = strata_column

    # An absent/empty covariate list degrades to a plain A/B regression.
    self.covariate_names = covariate_names if covariate_names else []

    # Accept a dict or a Series; store as a Series so .map() works later.
    if isinstance(strata_weights, dict):
        self.strata_weights = pd.Series(strata_weights)
    else:
        self.strata_weights = strata_weights