Skip to content

Experiment Management

Classes for orchestrating and managing AB test experiments.

AbobaExperiment

AbobaExperiment

Context for conducting and displaying AB tests results.

Results are displayed on a figure with confidence levels. By specifying number of columns, you can generate nice comparisons

Examples:

# First create tests
value_column = 'value'
size = 100

splitter = splitters.GroupSplitter(
    column='b_group',
    size=size,
)
cuped_preprocess = processing.CupedProcessor(...)
test_cuped = tests.AbsoluteIndependentTTest(
    preprocess=cuped_preprocess,
    data_splitter=splitter,
    value_column=value_column,
)
test_regular = tests.AbsoluteIndependentTTest(
    preprocess=None,
    data_splitter=splitter,
    value_column=value_column,
)

# Next create an experiment with relevant name.
# You can also generate several columns
experiment = AbobaExperiment(experiment_name="CUPED vs regular", draw_cols=2)

regular_aa_group = experiment.group("AA, regular")
regular_aa_group.run(test_regular, n_iter=n_iter)

regular_ab_group = experiment.group("AB, regular")
regular_ab_group.run(test_regular, synthetic_effect=effect, n_iter=n_iter)

cuped_aa_group = experiment.group("AA, cuped")
cuped_aa_group.run(test_cuped, n_iter=n_iter)

cuped_ab_group = experiment.group("AB, cuped")
cuped_ab_group.run(test_cuped, synthetic_effect=effect, n_iter=n_iter)

# Get results from each group
ab_results = cuped_ab_group.get_data()
Source code in aboba/experiment/aboba_experiment.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
class AbobaExperiment:
    """
    Context for conducting and displaying AB tests results.

    Results are displayed on a figure with confidence levels.
    By specifying number of columns, you can generate nice comparisons.

    Examples:
        ```python
        # First create tests
        value_column = 'value'
        size = 100

        splitter = splitters.GroupSplitter(
            column='b_group',
            size=size,
        )
        cuped_preprocess = processing.CupedProcessor(...)
        test_cuped = tests.AbsoluteIndependentTTest(
            preprocess=cuped_preprocess,
            data_splitter=splitter,
            value_column=value_column,
        )
        test_regular = tests.AbsoluteIndependentTTest(
            preprocess=None,
            data_splitter=splitter,
            value_column=value_column,
        )

        # Next create an experiment with a relevant name
        experiment = AbobaExperiment(experiment_name="CUPED vs regular")

        # Register groups; `group()` takes the test, the input data and the
        # preparation pipeline (see its signature below)
        regular_aa_group = experiment.group(
            "AA, regular", test_regular, data, data_pipeline, n_iter=n_iter,
        )
        regular_ab_group = experiment.group(
            "AB, regular", test_regular, data, data_pipeline,
            synthetic_effect=effect, n_iter=n_iter,
        )
        cuped_aa_group = experiment.group(
            "AA, cuped", test_cuped, data, data_pipeline, n_iter=n_iter,
        )
        cuped_ab_group = experiment.group(
            "AB, cuped", test_cuped, data, data_pipeline,
            synthetic_effect=effect, n_iter=n_iter,
        )

        # Get results from each group
        ab_results = cuped_ab_group.get_data()
        ```
    """

    def __init__(
        self,
        alpha=0.05,
        experiment_name: Optional[str] = "AB experiment",
        visualization_method: Optional[
            Callable[[Dict[str, ExperimentData], Dict[str, Any]], tuple[Figure, Any]]
        ] = default_visualization_method,
        language: Language = "eng",
        **visualization_kwargs,
    ):
        """
        Create a new experiment.
        Refer to the class description for more information.

        Args:
            alpha (float): Significance level for statistical tests.
            experiment_name (str): Name of the experiment to display.
            visualization_method (Optional[Callable]): Visualization function used to draw experiment results.
            language (Language): Default language for plot labels and titles.
            **visualization_kwargs: Additional arguments for the visualization.
        """

        assert 0.0 < alpha < 1.0, f"alpha must lie in (0, 1), got {alpha}"

        self.alpha = alpha
        self.experiment_name = experiment_name
        self.visualization_method = visualization_method
        self.default_language = language

        # Explicit kwargs win over constructor values; fall back to the
        # experiment-level alpha/name when the caller did not override them.
        visualization_kwargs["alpha"] = visualization_kwargs.get("alpha", alpha)
        visualization_kwargs["experiment_name"] = visualization_kwargs.get(
            "experiment_name", experiment_name
        )
        self.visualization_kwargs = visualization_kwargs

        self._groups: Dict[str, ExperimentGroup] = {}

    def group(
        self,
        name: str,
        test: BaseTest,
        data: Union[pd.DataFrame, List[pd.DataFrame]],
        data_pipeline: Pipeline,
        synthetic_effect: Optional[EffectModifier] = None,
        n_iter: int = 1,
        joblib_kwargs: Optional[dict] = None,
    ) -> ExperimentGroup:

        """
        Creates new context for experiment with specified name.

        Args:
            name (str): Name to use for this experiment subset.
            test (BaseTest): Statistical test to run.
            data (Union[pd.DataFrame, List[pd.DataFrame]]): Input data for the experiment.
            data_pipeline (Pipeline): Pipeline used to prepare data before testing.
            synthetic_effect (Optional[EffectModifier]): Synthetic effect applied before test execution.
            n_iter (int): Number of repeated test runs.
            joblib_kwargs (Optional[dict]): Additional keyword arguments for parallel execution.

        Returns:
            ExperimentGroup: Registered experiment group.
        """

        # NOTE: re-registering an existing name silently replaces the previous
        # group. TODO: raise a one-time warning, if needed.
        # assert name not in self._groups, (
        #     f"Trying to create group with {name = } but "
        #     f"it is already defined ({self._groups.keys()})"
        # )

        group = ExperimentGroup(
            name,
            test,
            data,
            data_pipeline,
            synthetic_effect,
            n_iter,
            joblib_kwargs,
        )
        self._groups[name] = group

        return group


    def draw(
        self,
        groups: Optional[List[str]] = None,
        group_configs: Optional[Dict[str, GroupVisualizationConfig]] = None,
        lang: Language = None,
        figsize: Optional[Tuple[float, float]] = None,
        filter_empty: bool = True,
        **kwargs
    ) -> Tuple[Optional[Figure], Any]:
        """
        Draw visualization in the fixed 3-panel AA layout:
          - Top: confidence interval for alpha
          - Bottom-left: p-value histogram
          - Bottom-right: ECDF of p-values

        This is the only supported layout now.

        Args:
            groups: List of group names to visualize (None = all groups)
            group_configs: Per-group configuration (e.g., color)
            lang: Language for labels ('en' or 'ru')
            figsize: Custom figure size
            filter_empty: Skip empty groups
            **kwargs: Passed to draw_aa_experiment_layout

        Returns:
            Tuple of (Figure, axes_array) where axes_array[i] = [ax_interval, ax_hist, ax_ecdf]
        """
        # visualization_method=None is an explicit "do not draw" switch;
        # otherwise the fixed AA layout is always used regardless of the
        # callable passed at construction time.
        if self.visualization_method is None:
            return None, None

        if lang is None:
            lang = self.default_language

        # Use our new fixed layout
        from aboba.experiment.visualization import draw_aa_experiment_layout
        viz_method = draw_aa_experiment_layout

        viz_kwargs = {
            **self.visualization_kwargs,
            "groups_list": groups,
            "group_configs": group_configs,
            "lang": lang,
            "figsize": figsize,
            "filter_empty": filter_empty,
            **kwargs
        }

        # Unknown group names are silently dropped here.
        groups_to_get = groups if groups is not None else list(self._groups.keys())
        values = {key: self._groups[key].get_raw_data() 
                for key in groups_to_get if key in self._groups}

        return viz_method(values, **viz_kwargs)


    def draw_comparison(
        self,
        group_pairs: List[Tuple[str, str]],
        separate_pairs: bool = False,
        **kwargs
    ) -> Tuple[Optional[Figure], Any]:
        """
        Draw side-by-side comparison of group pairs.

        Args:
            group_pairs: List of (group1, group2) tuples
            separate_pairs: If True, groups are arranged in pairs (g1, g2, g1, g2, ...)
                        If False, duplicates are removed
            **kwargs: Additional draw arguments
        """
        if separate_pairs:
            # Show every group including duplicates (pairs side by side)
            all_groups = []
            for g1, g2 in group_pairs:
                all_groups.extend([g1, g2])
        else:
            # Remove duplicates, keeping first-appearance order
            all_groups = []
            seen = set()
            for g1, g2 in group_pairs:
                for g in [g1, g2]:
                    if g not in seen:
                        all_groups.append(g)
                        seen.add(g)

        return self.draw(groups=all_groups, **kwargs)


    def quick_summary(self) -> pd.DataFrame:
        """
        Get DataFrame with summary statistics for all groups.

        Returns:
            DataFrame with columns: group_name, n_iterations, n_errors,
            real_alpha, ci_left, ci_right, rejection_rate,
            mean_pvalue, median_pvalue

        Example:
            summary = experiment.quick_summary()
            print(summary.sort_values('real_alpha'))
        """
        from aboba.utils.alpha_interval import calculate_real_alpha

        summary = []
        for name, group in self._groups.items():
            data = group.get_raw_data()
            if data.is_empty():
                # Groups that were never run contribute no row.
                continue

            n_iter = len(data.history)
            pvals = [tr.pvalue for tr in data.history]
            n_errors = sum(int(p < self.alpha) for p in pvals)
            real_alpha, left_alpha, right_alpha = calculate_real_alpha(
                n_iter=n_iter, n_errors=n_errors
            )

            summary.append({
                'group_name': name,
                'n_iterations': n_iter,
                'n_errors': n_errors,
                'real_alpha': real_alpha,
                'ci_left': left_alpha,
                'ci_right': right_alpha,
                'rejection_rate': n_errors / n_iter if n_iter > 0 else 0.0,
                'mean_pvalue': np.mean(pvals),
                'median_pvalue': np.median(pvals),
            })

        return pd.DataFrame(summary)


    def draw_power_curve(
        self,
        effect_grid: Optional[List[float]] = None,
        effect_type: Literal["absolute", "relative"] = "absolute",
        n_iter: int = 500,
        target_power: float = 0.8,
        groups: Optional[List[str]] = None,
        group_configs: Optional[Dict[str, GroupVisualizationConfig]] = None,
        group_col: Optional[str] = None,
        lang: Optional[Language] = None,
        figsize: Tuple[float, float] = (9, 6),
        alpha_line_on: bool = True,
        **kwargs
    ) -> Tuple[plt.Figure, plt.Axes]:
        """
        Draw power curves: statistical power vs effect size for one or more groups using simulation.

        Uses the specified groups' test, pipeline, and data as templates.
        For each group, it runs n_iter simulations for each effect size in `effect_grid`
        and estimates the proportion of rejections (power) with confidence intervals.
        Curves are plotted on the same axes for comparison.

        Parameters
        ----------
        effect_grid : List[float], optional
            Grid of effect values used to simulate power.
            Interpretation depends on `effect_type`:
            - "absolute": additive effect applied to test group values (`value + effect`);
            - "relative": multiplicative effect applied to test group values as (`value * (1 + effect)`).
            For ratio tests (e.g. ``DeltaRatioTtest``), the effect is applied to the numerator column only.
            If None, defaults to np.linspace(0.0, 0.6, 20) for both effect types.
        effect_type : Literal["absolute", "relative"], default "absolute"
            Type of effect application for the test group:
            - "absolute": additive shift;
            - "relative": relative change via factor `1 + effect`.
        n_iter : int, default 500
            Number of simulations per effect size per group (trade-off: speed vs precision).
        target_power : float, default 0.8
            Horizontal line indicating desired power level.
        groups : List[str], optional
            Names of groups to include in the power analysis. If None, defaults to all groups.
        group_configs : Dict[str, GroupVisualizationConfig], optional
            Per-group configuration for styling (e.g., color, linestyle).
            Falls back to default if not provided for a group.
        group_col : str, optional
            Column holding the control/test split. If None, it is resolved
            per group from the first GroupSplitter in that group's pipeline.
        lang : str, default "en"
            Language for labels ('en' or 'ru').
        figsize: Tuple[float, float], default (9, 6)
            Figure size.
        alpha_line_on : bool, default True
            Whether to draw the horizontal line for the significance level (alpha).
        **kwargs : dict
            Passed to `simulate_power_for_effect`.

        Returns
        -------
        fig : matplotlib.figure.Figure
        ax : matplotlib.axes.Axes
        """
        from aboba.utils.power_analysis import simulate_power_for_effect
        from statsmodels.stats.proportion import proportion_confint
        if lang is None:
            lang = self.default_language
        if isinstance(groups, str):
            groups = [groups]
        # If 'groups' is None, use all group names from the experiment
        groups_to_analyze = groups if groups is not None else list(self._groups.keys())

        # Filter to ensure only existing groups are processed
        groups_to_analyze = [g for g in groups_to_analyze if g in self._groups]

        if not groups_to_analyze:
            print("No groups specified or available for power curve analysis.")
            return None, None

        group_configs = group_configs or {}

        if effect_grid is None:
            effect_grid = np.linspace(0.0, 0.6, 20).tolist()

        if effect_type == "relative":
            invalid = [eff for eff in effect_grid if eff < -1]
            if invalid:
                raise ValueError(
                "For relative effects, each value must not be less than -1, "
                "because 1 + effect must stay non-negative."
                )

        fig, ax = plt.subplots(figsize=figsize)

        for group_name in groups_to_analyze:
            group_obj = self._groups[group_name]
            test = group_obj._test
            data = group_obj._data
            pipeline = group_obj._pipeline

            config = group_configs.get(group_name, GroupVisualizationConfig())

            if hasattr(test, "value_column"):
                value_col = test.value_column
            elif hasattr(test, "numerator_name"):
                # Ratio metrics (e.g. DeltaRatioTtest): apply synthetic effect to the numerator;
                # denominator unchanged matches relative uplift on the ratio when D is fixed.
                value_col = test.numerator_name
            else:
                raise TypeError(
                    f"Power curve simulation requires a test with 'value_column' or "
                    f"'numerator_name'; got {type(test).__name__}."
                )

            # Resolve the split column once per group. Bug fix: previously the
            # resolved column was written back into the `group_col` parameter
            # inside the effect loop, so every subsequent group silently reused
            # the FIRST group's splitter column even when its own pipeline
            # splits on a different one.
            if group_col:
                split_col = group_col
            else:
                splitter = next(
                    (tr for tr in pipeline.transformers if isinstance(tr, GroupSplitter)),
                    None,
                )
                if splitter is None:
                    raise ValueError("No GroupSplitter found in the pipeline. Cannot determine group column.")
                split_col = splitter.column

            powers = []
            ci_lows = []
            ci_ups = []

            for eff in effect_grid:
                if effect_type == "absolute":
                    current_effect_modifier = effect_modifiers.GroupModifier(
                        effects={1: eff},
                        value_column=value_col,
                        group_column=split_col,
                        method=operator.add,
                    )
                else:  # relative
                    current_effect_modifier = effect_modifiers.GroupModifier(
                        effects={1: 1 + eff},
                        value_column=value_col,
                        group_column=split_col,
                        method=operator.mul,
                    )

                try:
                    n_rejects, _ = simulate_power_for_effect(
                        test=test,
                        data=data,
                        pipeline=pipeline,
                        effect_modifier=current_effect_modifier,
                        n_iter=n_iter,
                        alpha=self.alpha,
                        effect_size=eff,
                        progress_desc=f"{group_name}: Effect={eff:.3f}"
                    )
                except Exception as e:
                    # Best-effort: a failed grid point becomes a gap in the
                    # curve instead of aborting the whole figure.
                    print(f"Error simulating for group '{group_name}', effect={eff:.3f}: {e}. Skipping...")
                    powers.append(np.nan)
                    ci_lows.append(np.nan)
                    ci_ups.append(np.nan)
                    continue

                power_est = n_rejects / n_iter
                low, high = proportion_confint(count=n_rejects, nobs=n_iter, method='wilson')
                powers.append(power_est)
                ci_lows.append(low)
                ci_ups.append(high)

            plot_kwargs = {'label': group_name}
            fill_kwargs = {'alpha': 0.2}
            if config.color:
                plot_kwargs['color'] = config.color
                # Bug fix: the CI band color was computed but never passed to
                # fill_between, so the band ignored the per-group color.
                fill_kwargs['color'] = config.color

            ax.plot(effect_grid, powers, linewidth=3, **plot_kwargs)

            ax.fill_between(effect_grid, ci_lows, ci_ups, **fill_kwargs)

        ax.axhline(target_power, color="#20b2aa", linestyle="--", linewidth=2, label=t('target_power', lang))

        if alpha_line_on:
            ax.axhline(self.alpha, color="#ff2400", linestyle="--", linewidth=2, label=t('significance_level', lang))

        ax.set_xlabel(t('effect_size', lang))
        ax.set_ylabel(t('power', lang))
        ax.set_xlim(left=min(effect_grid), right=max(effect_grid)* 1.03)
        ax.set_ylim(bottom=0, top=1.03)
        ax.legend(loc='lower right')
        ax.grid(True)

        fig.suptitle(t('power_curve_plot', lang), fontsize=14, fontweight='bold')
        fig.tight_layout()

        return fig, ax

__init__

__init__(alpha=0.05, experiment_name: Optional[str] = 'AB experiment', visualization_method: Optional[Callable[[Dict[str, ExperimentData], Dict[str, Any]], tuple[Figure, Any]]] = default_visualization_method, language: Language = 'eng', **visualization_kwargs)

Create a new experiment. Refer to the class description for more information.

PARAMETER DESCRIPTION
alpha

Significance level for statistical tests.

TYPE: float DEFAULT: 0.05

experiment_name

Name of the experiment to display.

TYPE: str DEFAULT: 'AB experiment'

visualization_method

Visualization function used to draw experiment results.

TYPE: Optional[Callable] DEFAULT: default_visualization_method

language

Default language for plot labels and titles.

TYPE: Language DEFAULT: 'eng'

**visualization_kwargs

Additional arguments for the visualization.

DEFAULT: {}

Source code in aboba/experiment/aboba_experiment.py
def __init__(
    self,
    alpha=0.05,
    experiment_name: Optional[str] = "AB experiment",
    visualization_method: Optional[
        Callable[[Dict[str, ExperimentData], Dict[str, Any]], tuple[Figure, Any]]
    ] = default_visualization_method,
    language: Language = "eng",
    **visualization_kwargs,
):
    """
    Create a new experiment.
    Refer to the class description for more information.

    Args:
        alpha (float): Significance level for statistical tests.
        experiment_name (str): Name of the experiment to display.
        visualization_method (Optional[Callable]): Visualization function used to draw experiment results.
        language (Language): Default language for plot labels and titles.
        **visualization_kwargs: Additional arguments for the visualization.
    """


    assert 0.0 < alpha < 1.0

    self.alpha = alpha
    self.experiment_name = experiment_name
    self.visualization_method = visualization_method
    self.default_language = language

    visualization_kwargs["alpha"] = visualization_kwargs.get("alpha", alpha)
    visualization_kwargs["experiment_name"] = visualization_kwargs.get(
        "experiment_name", experiment_name
    )
    self.visualization_kwargs = visualization_kwargs

    self._groups: Dict[str, ExperimentGroup] = {}

group

group(name: str, test: BaseTest, data: Union[DataFrame, List[DataFrame]], data_pipeline: Pipeline, synthetic_effect: Optional[EffectModifier] = None, n_iter: int = 1, joblib_kwargs: Optional[dict] = None) -> ExperimentGroup

Creates new context for experiment with specified name.

PARAMETER DESCRIPTION
name

Name to use for this experiment subset.

TYPE: str

test

Statistical test to run.

TYPE: BaseTest

data

Input data for the experiment.

TYPE: Union[DataFrame, List[DataFrame]]

data_pipeline

Pipeline used to prepare data before testing.

TYPE: Pipeline

synthetic_effect

Synthetic effect applied before test execution.

TYPE: Optional[EffectModifier] DEFAULT: None

n_iter

Number of repeated test runs.

TYPE: int DEFAULT: 1

joblib_kwargs

Additional keyword arguments for parallel execution.

TYPE: Optional[dict] DEFAULT: None

RETURNS DESCRIPTION
ExperimentGroup

Registered experiment group.

TYPE: ExperimentGroup

Source code in aboba/experiment/aboba_experiment.py
def group(
    self,
    name: str,
    test: BaseTest,
    data: Union[pd.DataFrame, List[pd.DataFrame]],
    data_pipeline: Pipeline,
    synthetic_effect: Optional[EffectModifier] = None,
    n_iter: int = 1,
    joblib_kwargs: Optional[dict] = None,
) -> ExperimentGroup:

    """
    Creates new context for experiment with specified name.

    Args:
        name (str): Name to use for this experiment subset.
        test (BaseTest): Statistical test to run.
        data (Union[pd.DataFrame, List[pd.DataFrame]]): Input data for the experiment.
        data_pipeline (Pipeline): Pipeline used to prepare data before testing.
        synthetic_effect (Optional[EffectModifier]): Synthetic effect applied before test execution.
        n_iter (int): Number of repeated test runs.
        joblib_kwargs (Optional[dict]): Additional keyword arguments for parallel execution.

    Returns:
        ExperimentGroup: Registered experiment group.
    """

    # TODO: raise one-time warning, if needed
    # assert name not in self._groups, (
    #     f"Trying to create group with {name = } but "
    #     f"it is already defined ({self._groups.keys()})"
    # )

    group = ExperimentGroup(
        name,
        test,
        data,
        data_pipeline,
        synthetic_effect,
        n_iter,
        joblib_kwargs,
    )
    self._groups[name] = group

    return group

draw

draw(groups: Optional[List[str]] = None, group_configs: Optional[Dict[str, GroupVisualizationConfig]] = None, lang: Language = None, figsize: Optional[Tuple[float, float]] = None, filter_empty: bool = True, **kwargs) -> Tuple[Optional[Figure], Any]
Draw visualization in the fixed 3-panel AA layout
  • Top: confidence interval for alpha
  • Bottom-left: p-value histogram
  • Bottom-right: ECDF of p-values

This is the only supported layout now.

PARAMETER DESCRIPTION
groups

List of group names to visualize (None = all groups)

TYPE: Optional[List[str]] DEFAULT: None

group_configs

Per-group configuration (e.g., color)

TYPE: Optional[Dict[str, GroupVisualizationConfig]] DEFAULT: None

lang

Language for labels ('en' or 'ru')

TYPE: Language DEFAULT: None

figsize

Custom figure size

TYPE: Optional[Tuple[float, float]] DEFAULT: None

filter_empty

Skip empty groups

TYPE: bool DEFAULT: True

**kwargs

Passed to draw_aa_experiment_layout

DEFAULT: {}

RETURNS DESCRIPTION
Tuple[Optional[Figure], Any]

Tuple of (Figure, axes_array) where axes_array[i] = [ax_interval, ax_hist, ax_ecdf]

Source code in aboba/experiment/aboba_experiment.py
def draw(
    self,
    groups: Optional[List[str]] = None,
    group_configs: Optional[Dict[str, GroupVisualizationConfig]] = None,
    lang: Language = None,
    figsize: Optional[Tuple[float, float]] = None,
    filter_empty: bool = True,
    **kwargs
) -> Tuple[Optional[Figure], Any]:
    """
    Draw visualization in the fixed 3-panel AA layout:
      - Top: confidence interval for alpha
      - Bottom-left: p-value histogram
      - Bottom-right: ECDF of p-values

    This is the only supported layout now.

    Args:
        groups: List of group names to visualize (None = all groups)
        group_configs: Per-group configuration (e.g., color)
        lang: Language for labels ('en' or 'ru')
        figsize: Custom figure size
        filter_empty: Skip empty groups
        **kwargs: Passed to draw_aa_experiment_layout

    Returns:
        Tuple of (Figure, axes_array) where axes_array[i] = [ax_interval, ax_hist, ax_ecdf]
    """
    if self.visualization_method is None:
        return None, None

    if lang is None:
        lang = self.default_language

    # Use our new fixed layout
    from aboba.experiment.visualization import draw_aa_experiment_layout
    viz_method = draw_aa_experiment_layout

    viz_kwargs = {
        **self.visualization_kwargs,
        "groups_list": groups,
        "group_configs": group_configs,
        "lang": lang,
        "figsize": figsize,
        "filter_empty": filter_empty,
        **kwargs
    }

    groups_to_get = groups if groups is not None else list(self._groups.keys())
    values = {key: self._groups[key].get_raw_data() 
            for key in groups_to_get if key in self._groups}

    return viz_method(values, **viz_kwargs)

draw_comparison

draw_comparison(group_pairs: List[Tuple[str, str]], separate_pairs: bool = False, **kwargs) -> Tuple[Optional[Figure], Any]

Draw side-by-side comparison of group pairs.

PARAMETER DESCRIPTION
group_pairs

List of (group1, group2) tuples

TYPE: List[Tuple[str, str]]

separate_pairs

If True, groups are arranged in pairs (g1, g2, g1, g2, ...) If False, duplicates are removed

TYPE: bool DEFAULT: False

**kwargs

Additional draw arguments

DEFAULT: {}

Source code in aboba/experiment/aboba_experiment.py
def draw_comparison(
    self,
    group_pairs: List[Tuple[str, str]],
    separate_pairs: bool = False,
    **kwargs
) -> Tuple[Optional[Figure], Any]:
    """
    Draw side-by-side comparison of group pairs.

    Args:
        group_pairs: List of (group1, group2) tuples
        separate_pairs: If True, groups are arranged in pairs (g1, g2, g1, g2, ...)
                    If False, duplicates are removed
        **kwargs: Additional draw arguments
    """
    if separate_pairs:
        # Show every group including duplicates (pairs side by side)
        all_groups = []
        for g1, g2 in group_pairs:
            all_groups.extend([g1, g2])
    else:
        # Remove duplicates, keeping first-appearance order
        all_groups = []
        seen = set()
        for g1, g2 in group_pairs:
            for g in [g1, g2]:
                if g not in seen:
                    all_groups.append(g)
                    seen.add(g)

    return self.draw(groups=all_groups, **kwargs)

quick_summary

quick_summary() -> pd.DataFrame

Get DataFrame with summary statistics for all groups.

RETURNS DESCRIPTION
DataFrame

DataFrame with columns: group_name, n_iterations, n_errors,

DataFrame

real_alpha, ci_left, ci_right, rejection_rate

Example

summary = experiment.quick_summary() print(summary.sort_values('real_alpha'))

Source code in aboba/experiment/aboba_experiment.py
def quick_summary(self) -> pd.DataFrame:
    """
    Get DataFrame with summary statistics for all groups.

    Returns:
        DataFrame with columns: group_name, n_iterations, n_errors,
        real_alpha, ci_left, ci_right, rejection_rate

    Example:
        summary = experiment.quick_summary()
        print(summary.sort_values('real_alpha'))
    """
    from aboba.utils.alpha_interval import calculate_real_alpha

    summary = []
    for name, group in self._groups.items():
        data = group.get_raw_data()
        if data.is_empty():
            continue

        n_iter = len(data.history)
        pvals = [tr.pvalue for tr in data.history]
        n_errors = sum(int(p < self.alpha) for p in pvals)
        real_alpha, left_alpha, right_alpha = calculate_real_alpha(
            n_iter=n_iter, n_errors=n_errors
        )

        summary.append({
            'group_name': name,
            'n_iterations': n_iter,
            'n_errors': n_errors,
            'real_alpha': real_alpha,
            'ci_left': left_alpha,
            'ci_right': right_alpha,
            'rejection_rate': n_errors / n_iter if n_iter > 0 else 0.0,
            'mean_pvalue': np.mean(pvals),
            'median_pvalue': np.median(pvals),
        })

    return pd.DataFrame(summary)

draw_power_curve

draw_power_curve(effect_grid: Optional[List[float]] = None, effect_type: Literal['absolute', 'relative'] = 'absolute', n_iter: int = 500, target_power: float = 0.8, groups: Optional[List[str]] = None, group_configs: Optional[Dict[str, GroupVisualizationConfig]] = None, group_col: str = None, lang: Language = None, figsize: Tuple[float, float] = (9, 6), alpha_line_on: bool = True, **kwargs) -> Tuple[plt.Figure, plt.Axes]

Draw power curves: statistical power vs effect size for one or more groups using simulation.

Uses the specified groups' test, pipeline, and data as templates. For each group, it runs n_iter simulations for each effect size in effect_grid and estimates the proportion of rejections (power) with confidence intervals. Curves are plotted on the same axes for comparison.

Parameters

effect_grid : List[float], optional Grid of effect values used to simulate power. Interpretation depends on effect_type: - "absolute": additive effect applied to test group values (value + effect); - "relative": multiplicative effect applied to test group values as (value * (1 + effect)). For ratio tests (e.g. DeltaRatioTtest), the effect is applied to the numerator column only. If None: - for "absolute": defaults to np.linspace(0.0, 0.6, 20) - for "relative": defaults to np.linspace(0.0, 0.6, 20) effect_type : Literal["absolute", "relative"], default "absolute" Type of effect application for the test group: - "absolute": additive shift; - "relative": relative change via factor 1 + effect. n_iter : int, default 500 Number of simulations per effect size per group (trade-off: speed vs precision). target_power : float, default 0.8 Horizontal line indicating desired power level. groups : List[str], optional Names of groups to include in the power analysis. If None, defaults to all groups. group_configs : Dict[str, GroupVisualizationConfig], optional Per-group configuration for styling (e.g., color, linestyle). Falls back to default if not provided for a group. lang : str, default "en" Language for labels ('en' or 'ru'). figsize: Tuple[float, float], default (9, 6) Figure size. alpha_line_on : bool, default True Whether to draw the horizontal line for the significance level (alpha). **kwargs : dict Passed to simulate_power_for_effect.

Returns

fig : matplotlib.figure.Figure ax : matplotlib.axes.Axes

Source code in aboba/experiment/aboba_experiment.py
def draw_power_curve(
    self,
    effect_grid: Optional[List[float]] = None,
    effect_type: Literal["absolute", "relative"] = "absolute",
    n_iter: int = 500,
    target_power: float = 0.8,
    groups: Optional[List[str]] = None,
    group_configs: Optional[Dict[str, GroupVisualizationConfig]] = None,
    group_col: Optional[str] = None,
    lang: Optional[Language] = None,
    figsize: Tuple[float, float] = (9, 6),
    alpha_line_on: bool = True,
    **kwargs
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Draw power curves: statistical power vs effect size for one or more groups using simulation.

    Uses the specified groups' test, pipeline, and data as templates.
    For each group, it runs n_iter simulations for each effect size in `effect_grid`
    and estimates the proportion of rejections (power) with confidence intervals.
    Curves are plotted on the same axes for comparison.

    Parameters
    ----------
    effect_grid : List[float], optional
        Grid of effect values used to simulate power.
        Interpretation depends on `effect_type`:
        - "absolute": additive effect applied to test group values (`value + effect`);
        - "relative": multiplicative effect applied to test group values as (`value * (1 + effect)`).
        For ratio tests (e.g. ``DeltaRatioTtest``), the effect is applied to the numerator column only.
        If None, defaults to np.linspace(0.0, 0.6, 20) for both effect types.
    effect_type : Literal["absolute", "relative"], default "absolute"
        Type of effect application for the test group:
        - "absolute": additive shift;
        - "relative": relative change via factor `1 + effect`.
    n_iter : int, default 500
        Number of simulations per effect size per group (trade-off: speed vs precision).
    target_power : float, default 0.8
        Horizontal line indicating desired power level.
    groups : List[str], optional
        Names of groups to include in the power analysis. If None, defaults to all groups.
    group_configs : Dict[str, GroupVisualizationConfig], optional
        Per-group configuration for styling (e.g., color, linestyle).
        Falls back to default if not provided for a group.
    group_col : str, optional
        Name of the column that holds the group label. If None, it is inferred
        from the first ``GroupSplitter`` found in the group's pipeline.
    lang : str, default "en"
        Language for labels ('en' or 'ru').
    figsize: Tuple[float, float], default (9, 6)
        Figure size.
    alpha_line_on : bool, default True
        Whether to draw the horizontal line for the significance level (alpha).
    **kwargs : dict
        Passed to `simulate_power_for_effect`.

    Returns
    -------
    fig : matplotlib.figure.Figure
    ax : matplotlib.axes.Axes
    """
    from aboba.utils.power_analysis import simulate_power_for_effect
    from statsmodels.stats.proportion import proportion_confint

    if lang is None:
        lang = self.default_language
    if isinstance(groups, str):
        groups = [groups]
    # If 'groups' is None, use all group names from the experiment
    groups_to_analyze = groups if groups is not None else list(self._groups.keys())

    # Filter to ensure only existing groups are processed
    groups_to_analyze = [g for g in groups_to_analyze if g in self._groups]

    if not groups_to_analyze:
        print("No groups specified or available for power curve analysis.")
        return None, None

    group_configs = group_configs or {}

    if effect_grid is None:
        effect_grid = np.linspace(0.0, 0.6, 20).tolist()

    if effect_type == "relative":
        # 1 + effect must stay non-negative or the multiplicative modifier
        # would flip the sign of the metric.
        invalid = [eff for eff in effect_grid if eff < -1]
        if invalid:
            raise ValueError(
            "For relative effects, each value must not be less than -1, "
            "because 1 + effect must stay non-negative."
            )

    fig, ax = plt.subplots(figsize=figsize)

    for group_name in groups_to_analyze:
        group_obj = self._groups[group_name]
        test = group_obj._test
        data = group_obj._data
        pipeline = group_obj._pipeline

        config = group_configs.get(group_name, GroupVisualizationConfig())

        if hasattr(test, "value_column"):
            value_col = test.value_column
        elif hasattr(test, "numerator_name"):
            # Ratio metrics (e.g. DeltaRatioTtest): apply synthetic effect to the numerator;
            # denominator unchanged matches relative uplift on the ratio when D is fixed.
            value_col = test.numerator_name
        else:
            raise TypeError(
                f"Power curve simulation requires a test with 'value_column' or "
                f"'numerator_name'; got {type(test).__name__}."
            )

        # Resolve the group column once per group: it does not depend on the
        # effect size, so the lookup is hoisted out of the effect loop.
        if not group_col:
            splitter = None
            for transformer in pipeline.transformers:
                if isinstance(transformer, GroupSplitter):
                    splitter = transformer
                    break

            if splitter is None:
                raise ValueError("No GroupSplitter found in the pipeline. Cannot determine group column.")

            group_col = splitter.column

        powers = []
        ci_lows = []
        ci_ups = []

        for eff in effect_grid:
            # Build the synthetic-effect modifier for the test group (label 1).
            if effect_type == "absolute":
                current_effect_modifier = effect_modifiers.GroupModifier(
                    effects={1: eff},
                    value_column=value_col,
                    group_column=group_col,
                    method=operator.add,
                )
            else:  # relative
                current_effect_modifier = effect_modifiers.GroupModifier(
                    effects={1: 1 + eff},
                    value_column=value_col,
                    group_column=group_col,
                    method=operator.mul,
                )

            try:
                # Forward **kwargs as documented ("Passed to
                # simulate_power_for_effect") -- previously they were dropped.
                n_rejects, _ = simulate_power_for_effect(
                    test=test,
                    data=data,
                    pipeline=pipeline,
                    effect_modifier=current_effect_modifier,
                    n_iter=n_iter,
                    alpha=self.alpha,
                    effect_size=eff,
                    progress_desc=f"{group_name}: Effect={eff:.3f}",
                    **kwargs
                )
            except Exception as e:
                # Best-effort: record NaNs so one failing effect size does not
                # abort the whole figure.
                print(f"Error simulating for group '{group_name}', effect={eff:.3f}: {e}. Skipping...")
                powers.append(np.nan)
                ci_lows.append(np.nan)
                ci_ups.append(np.nan)
                continue

            power_est = n_rejects / n_iter
            # Wilson interval behaves well for proportions near 0 and 1.
            low, high = proportion_confint(count=n_rejects, nobs=n_iter, method='wilson')
            powers.append(power_est)
            ci_lows.append(low)
            ci_ups.append(high)

        plot_kwargs = {'label': group_name}
        fill_kwargs = {'alpha': 0.2}
        if config.color:
            plot_kwargs['color'] = config.color
            # Bug fix: the CI band previously ignored the configured color
            # (the value was computed into a dead local and never used).
            fill_kwargs['color'] = config.color

        ax.plot(effect_grid, powers, linewidth=3, **plot_kwargs)
        ax.fill_between(effect_grid, ci_lows, ci_ups, **fill_kwargs)

    ax.axhline(target_power, color="#20b2aa", linestyle="--", linewidth=2, label=t('target_power', lang))

    if alpha_line_on:
        ax.axhline(self.alpha, color="#ff2400", linestyle="--", linewidth=2, label=t('significance_level', lang))

    ax.set_xlabel(t('effect_size', lang))
    ax.set_ylabel(t('power', lang))
    ax.set_xlim(left=min(effect_grid), right=max(effect_grid) * 1.03)
    ax.set_ylim(bottom=0, top=1.03)
    ax.legend(loc='lower right')
    ax.grid(True)

    fig.suptitle(t('power_curve_plot', lang), fontsize=14, fontweight='bold')
    fig.tight_layout()

    return fig, ax

ExperimentGroup

ExperimentGroup

Manages experiment subset.

Handles running tests multiple times, applying synthetic effects, and collecting results from the pipeline and test.

Source code in aboba/experiment/experiment_group.py
class ExperimentGroup:
    """
    Manages experiment subset.

    Handles running tests multiple times, applying synthetic effects,
    and collecting results from the pipeline and test.
    """

    def __init__(
        self,
        name: str,
        test: BaseTest,
        data: Union[pd.DataFrame, List[pd.DataFrame]],
        data_pipeline: Pipeline,
        synthetic_effect: Optional[EffectModifier] = None,
        n_iter: int = 1,
        joblib_kwargs: Optional[dict] = None,
    ):
        """
        Args:
            name: Human-readable group name, used to label result rows.
            test: Test object whose ``test`` method is invoked per iteration.
            data: Input data passed through the pipeline on every run.
            data_pipeline: Pipeline fitted once here and applied per iteration.
            synthetic_effect: Optional modifier applied to the split groups
                before testing (e.g. to inject a known effect).
            n_iter: Number of test repetitions per ``run`` call.
            joblib_kwargs: Extra keyword arguments for ``joblib.Parallel``.
        """
        self._experiment_data = ExperimentData()
        self._name = name
        self._test = test
        self._data = data
        self._pipeline = data_pipeline
        self._synthetic_effect = synthetic_effect
        self._n_iter = n_iter
        # Normalize up front so the rest of the class can assume a dict.
        self._joblib_kwargs = dict() if joblib_kwargs is None else joblib_kwargs

        self._pipeline.fit(self._data)


    def run(self):
        """
        Run test multiple times in parallel and store results in currently activated experiment group.

        Returns:
            self: For method chaining
        """
        results = joblib.Parallel(**self._joblib_kwargs)(
            joblib.delayed(self._run_one)() for _ in range(self._n_iter)
        )

        for result in results:
            self._experiment_data.record(result)

        return self


    def get_raw_data(self) -> ExperimentData:
        """Return the accumulated ExperimentData (full result history)."""
        return self._experiment_data


    def get_data(self) -> pd.DataFrame:
        """
        Return all recorded test results as a DataFrame.

        The first column, 'Test name', holds '<group name>-<iteration index>';
        the remaining columns mirror the fields of TestResult.
        """
        test_result_cols = [field.name for field in fields(TestResult)]

        # Convert each result to a dict once. The previous implementation
        # called asdict() once per field per row (len(fields) * len(history)
        # conversions of the same objects).
        rows = [asdict(item) for item in self._experiment_data.history]

        result = pd.DataFrame(rows, columns=test_result_cols)
        result.insert(0, 'Test name', [f'{self._name}-{i}' for i in range(len(rows))])

        return result


    def _run_one(self):
        """Run the pipeline and the test once; return a TestResult."""
        pipeline_result = self._pipeline.transform(self._data)

        # A pipeline may return (groups, artifacts) or just groups.
        if isinstance(pipeline_result, tuple) and len(pipeline_result) == 2:
            groups, artifacts = pipeline_result
        else:
            groups = pipeline_result
            artifacts = None

        groups = self._add_effect(groups, self._synthetic_effect)

        result = self._test.test(groups, artefacts=artifacts)
        assert isinstance(result, TestResult), f"Test {self._test} must return TestResult instance"

        return result


    @staticmethod
    def _add_effect(
        groups: List[pd.DataFrame], synthetic_effect: Optional[EffectModifier]
    ) -> List[pd.DataFrame]:
        """Apply the synthetic effect to copies of the groups; no-op if None."""
        if synthetic_effect is None:
            return groups

        # Copy first so the modifier cannot mutate the caller's frames in place.
        modified = synthetic_effect([group.copy() for group in groups])
        assert len(modified) == len(
            groups
        ), f"Effect modifier {synthetic_effect} must not change number of groups"
        return modified
run

run()

Run test multiple times in parallel and store results in currently activated experiment group.

Returns

self : ExperimentGroup
    The group itself, for method chaining.

Source code in aboba/experiment/experiment_group.py
def run(self):
    """
    Run test multiple times in parallel and store results in currently activated experiment group.

    Returns:
        self: For method chaining
    """
    # Fan the single-iteration runner out across self._n_iter parallel tasks.
    tasks = (joblib.delayed(self._run_one)() for _ in range(self._n_iter))
    outcomes = joblib.Parallel(**self._joblib_kwargs)(tasks)

    # Record every outcome in this group's history, in completion order.
    for outcome in outcomes:
        self._experiment_data.record(outcome)

    return self