Skip to content

Experiment Management

Classes for orchestrating and managing AB test experiments.

AbobaExperiment

AbobaExperiment

Context for conducting and displaying AB tests results.

Results are displayed on a figure with confidence levels. By specifying number of columns, you can generate nice comparisons

Examples:

# First create tests
value_column = 'value'
size = 100

splitter = splitters.GroupSplitter(
    column='b_group',
    size=size,
)
cuped_preprocess = processing.CupedProcessor(...)
test_cuped = tests.AbsoluteIndependentTTest(
    preprocess=cuped_preprocess,
    data_splitter=splitter,
    value_column=value_column,
)
test_regular = tests.AbsoluteIndependentTTest(
    preprocess=None,
    data_splitter=splitter,
    value_column=value_column,
)

# Next create an experiment with relevant name.
# You can also generate several columns
experiment = AbobaExperiment(experiment_name="CUPED vs regular", draw_cols=2)

regular_aa_group = experiment.group("AA, regular")
regular_aa_group.run(test_regular, n_iter=n_iter)

regular_ab_group = experiment.group("AB, regular")
regular_ab_group.run(test_regular, synthetic_effect=effect, n_iter=n_iter)

cuped_aa_group = experiment.group("AA, cuped")
cuped_aa_group.run(test_cuped, n_iter=n_iter)

cuped_ab_group = experiment.group("AB, cuped")
cuped_ab_group.run(test_cuped, synthetic_effect=effect, n_iter=n_iter)

# Get results from each group
ab_results = cuped_ab_group.get_data()
Source code in aboba/experiment/aboba_experiment.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
class AbobaExperiment:
    """
    Context for conducting and displaying AB tests results.

    Results are displayed on a figure with confidence levels.
    By specifying number of columns, you can generate nice comparisons.

    Examples:
        ```python
        # First create tests
        value_column = 'value'
        size = 100

        splitter = splitters.GroupSplitter(
            column='b_group',
            size=size,
        )
        cuped_preprocess = processing.CupedProcessor(...)
        test_cuped = tests.AbsoluteIndependentTTest(
            preprocess=cuped_preprocess,
            data_splitter=splitter,
            value_column=value_column,
        )
        test_regular = tests.AbsoluteIndependentTTest(
            preprocess=None,
            data_splitter=splitter,
            value_column=value_column,
        )

        # Next create an experiment with a relevant name
        experiment = AbobaExperiment(experiment_name="CUPED vs regular")

        # Register groups; `group()` takes the test, the input data and the
        # preparation pipeline (see its signature below)
        regular_aa_group = experiment.group(
            "AA, regular", test_regular, data, data_pipeline, n_iter=n_iter,
        )
        regular_ab_group = experiment.group(
            "AB, regular", test_regular, data, data_pipeline,
            synthetic_effect=effect, n_iter=n_iter,
        )
        cuped_aa_group = experiment.group(
            "AA, cuped", test_cuped, data, data_pipeline, n_iter=n_iter,
        )
        cuped_ab_group = experiment.group(
            "AB, cuped", test_cuped, data, data_pipeline,
            synthetic_effect=effect, n_iter=n_iter,
        )

        # Get results from each group
        ab_results = cuped_ab_group.get_data()
        ```
    """

    def __init__(
        self,
        alpha=0.05,
        experiment_name: Optional[str] = "AB experiment",
        visualization_method: Optional[
            Callable[[Dict[str, ExperimentData], Dict[str, Any]], tuple[Figure, Any]]
        ] = default_visualization_method,
        language: Language = "eng",
        **visualization_kwargs,
    ):
        """
        Create a new experiment.
        Refer to the class description for more information.

        Args:
            alpha (float): Significance level for statistical tests.
            experiment_name (str): Name of the experiment to display.
            visualization_method (Optional[Callable]): Visualization function used to draw experiment results.
            language (Language): Default language for plot labels and titles.
            **visualization_kwargs: Additional arguments for the visualization.
        """

        assert 0.0 < alpha < 1.0, f"alpha must lie in (0, 1), got {alpha}"

        self.alpha = alpha
        self.experiment_name = experiment_name
        self.visualization_method = visualization_method
        self.default_language = language

        # Explicit kwargs win over constructor values; fall back to the
        # experiment-level alpha/name when the caller did not override them.
        visualization_kwargs["alpha"] = visualization_kwargs.get("alpha", alpha)
        visualization_kwargs["experiment_name"] = visualization_kwargs.get(
            "experiment_name", experiment_name
        )
        self.visualization_kwargs = visualization_kwargs

        self._groups: Dict[str, ExperimentGroup] = {}

    def group(
        self,
        name: str,
        test: BaseTest,
        data: Union[pd.DataFrame, List[pd.DataFrame]],
        data_pipeline: Pipeline,
        synthetic_effect: Optional[EffectModifier] = None,
        n_iter: int = 1,
        joblib_kwargs: Optional[dict] = None,
    ) -> ExperimentGroup:

        """
        Creates new context for experiment with specified name.

        Args:
            name (str): Name to use for this experiment subset.
            test (BaseTest): Statistical test to run.
            data (Union[pd.DataFrame, List[pd.DataFrame]]): Input data for the experiment.
            data_pipeline (Pipeline): Pipeline used to prepare data before testing.
            synthetic_effect (Optional[EffectModifier]): Synthetic effect applied before test execution.
            n_iter (int): Number of repeated test runs.
            joblib_kwargs (Optional[dict]): Additional keyword arguments for parallel execution.

        Returns:
            ExperimentGroup: Registered experiment group.
        """

        # NOTE: re-registering an existing name silently replaces the previous
        # group. TODO: raise a one-time warning, if needed.
        # assert name not in self._groups, (
        #     f"Trying to create group with {name = } but "
        #     f"it is already defined ({self._groups.keys()})"
        # )

        group = ExperimentGroup(
            name,
            test,
            data,
            data_pipeline,
            synthetic_effect,
            n_iter,
            joblib_kwargs,
        )
        self._groups[name] = group

        return group


    def draw(
        self,
        groups: Optional[List[str]] = None,
        group_configs: Optional[Dict[str, GroupVisualizationConfig]] = None,
        lang: Language = None,
        figsize: Optional[Tuple[float, float]] = None,
        filter_empty: bool = True,
        **kwargs
    ) -> Tuple[Optional[Figure], Any]:
        """
        Draw visualization in the fixed 3-panel AA layout:
          - Top: confidence interval for alpha
          - Bottom-left: p-value histogram
          - Bottom-right: ECDF of p-values

        This is the only supported layout now.

        Args:
            groups: List of group names to visualize (None = all groups)
            group_configs: Per-group configuration (e.g., color)
            lang: Language for labels ('en' or 'ru')
            figsize: Custom figure size
            filter_empty: Skip empty groups
            **kwargs: Passed to draw_aa_experiment_layout

        Returns:
            Tuple of (Figure, axes_array) where axes_array[i] = [ax_interval, ax_hist, ax_ecdf]
        """
        # visualization_method=None is an explicit "do not draw" switch;
        # otherwise the fixed AA layout is always used regardless of the
        # callable passed at construction time.
        if self.visualization_method is None:
            return None, None

        if lang is None:
            lang = self.default_language

        # Use our new fixed layout
        from aboba.experiment.visualization import draw_aa_experiment_layout
        viz_method = draw_aa_experiment_layout

        viz_kwargs = {
            **self.visualization_kwargs,
            "groups_list": groups,
            "group_configs": group_configs,
            "lang": lang,
            "figsize": figsize,
            "filter_empty": filter_empty,
            **kwargs
        }

        # Unknown group names are silently dropped here.
        groups_to_get = groups if groups is not None else list(self._groups.keys())
        values = {key: self._groups[key].get_raw_data() 
                for key in groups_to_get if key in self._groups}

        return viz_method(values, **viz_kwargs)


    def draw_comparison(
        self,
        group_pairs: List[Tuple[str, str]],
        separate_pairs: bool = False,
        **kwargs
    ) -> Tuple[Optional[Figure], Any]:
        """
        Draw side-by-side comparison of group pairs.

        Args:
            group_pairs: List of (group1, group2) tuples
            separate_pairs: If True, groups are arranged in pairs (g1, g2, g1, g2, ...)
                        If False, duplicates are removed
            **kwargs: Additional draw arguments
        """
        if separate_pairs:
            # Show every group including duplicates (pairs side by side)
            all_groups = []
            for g1, g2 in group_pairs:
                all_groups.extend([g1, g2])
        else:
            # Remove duplicates, keeping first-appearance order
            all_groups = []
            seen = set()
            for g1, g2 in group_pairs:
                for g in [g1, g2]:
                    if g not in seen:
                        all_groups.append(g)
                        seen.add(g)

        return self.draw(groups=all_groups, **kwargs)


    def quick_summary(self) -> pd.DataFrame:
        """
        Get DataFrame with summary statistics for all groups.

        Returns:
            DataFrame with columns: group_name, n_iterations, n_errors,
            real_alpha, ci_left, ci_right, rejection_rate,
            mean_pvalue, median_pvalue

        Example:
            summary = experiment.quick_summary()
            print(summary.sort_values('real_alpha'))
        """
        from aboba.utils.alpha_interval import calculate_real_alpha

        summary = []
        for name, group in self._groups.items():
            data = group.get_raw_data()
            if data.is_empty():
                # Groups that were never run contribute no row.
                continue

            n_iter = len(data.history)
            pvals = [tr.pvalue for tr in data.history]
            n_errors = sum(int(p < self.alpha) for p in pvals)
            real_alpha, left_alpha, right_alpha = calculate_real_alpha(
                n_iter=n_iter, n_errors=n_errors
            )

            summary.append({
                'group_name': name,
                'n_iterations': n_iter,
                'n_errors': n_errors,
                'real_alpha': real_alpha,
                'ci_left': left_alpha,
                'ci_right': right_alpha,
                'rejection_rate': n_errors / n_iter if n_iter > 0 else 0.0,
                'mean_pvalue': np.mean(pvals),
                'median_pvalue': np.median(pvals),
            })

        return pd.DataFrame(summary)


    def draw_power_curve(
        self,
        effect_grid: Optional[List[float]] = None,
        effect_type: Literal["absolute", "relative"] = "absolute",
        n_iter: int = 500,
        target_power: float = 0.8,
        groups: Optional[List[str]] = None,
        group_configs: Optional[Dict[str, GroupVisualizationConfig]] = None,
        group_col: Optional[str] = None,
        lang: Optional[Language] = None,
        figsize: Tuple[float, float] = (9, 6),
        alpha_line_on: bool = True,
        **kwargs
    ) -> Tuple[plt.Figure, plt.Axes]:
        """
        Draw power curves: statistical power vs effect size for one or more groups using simulation.

        Uses the specified groups' test, pipeline, and data as templates.
        For each group, it runs n_iter simulations for each effect size in `effect_grid`
        and estimates the proportion of rejections (power) with confidence intervals.
        Curves are plotted on the same axes for comparison.

        Parameters
        ----------
        effect_grid : List[float], optional
            Grid of effect values used to simulate power.
            Interpretation depends on `effect_type`:
            - "absolute": additive effect applied to test group values (`value + effect`);
            - "relative": multiplicative effect applied to test group values as (`value * (1 + effect)`).
            For ratio tests (e.g. ``DeltaRatioTtest``), the effect is applied to the numerator column only.
            If None, defaults to np.linspace(0.0, 0.6, 20) for both effect types.
        effect_type : Literal["absolute", "relative"], default "absolute"
            Type of effect application for the test group:
            - "absolute": additive shift;
            - "relative": relative change via factor `1 + effect`.
        n_iter : int, default 500
            Number of simulations per effect size per group (trade-off: speed vs precision).
        target_power : float, default 0.8
            Horizontal line indicating desired power level.
        groups : List[str], optional
            Names of groups to include in the power analysis. If None, defaults to all groups.
        group_configs : Dict[str, GroupVisualizationConfig], optional
            Per-group configuration for styling (e.g., color, linestyle).
            Falls back to default if not provided for a group.
        group_col : str, optional
            Column holding the control/test split. If None, it is resolved
            per group from the first GroupSplitter in that group's pipeline.
        lang : str, default "en"
            Language for labels ('en' or 'ru').
        figsize: Tuple[float, float], default (9, 6)
            Figure size.
        alpha_line_on : bool, default True
            Whether to draw the horizontal line for the significance level (alpha).
        **kwargs : dict
            Passed to `simulate_power_for_effect`.

        Returns
        -------
        fig : matplotlib.figure.Figure
        ax : matplotlib.axes.Axes
        """
        from aboba.utils.power_analysis import simulate_power_for_effect
        from statsmodels.stats.proportion import proportion_confint
        if lang is None:
            lang = self.default_language
        if isinstance(groups, str):
            groups = [groups]
        # If 'groups' is None, use all group names from the experiment
        groups_to_analyze = groups if groups is not None else list(self._groups.keys())

        # Filter to ensure only existing groups are processed
        groups_to_analyze = [g for g in groups_to_analyze if g in self._groups]

        if not groups_to_analyze:
            print("No groups specified or available for power curve analysis.")
            return None, None

        group_configs = group_configs or {}

        if effect_grid is None:
            effect_grid = np.linspace(0.0, 0.6, 20).tolist()

        if effect_type == "relative":
            invalid = [eff for eff in effect_grid if eff < -1]
            if invalid:
                raise ValueError(
                "For relative effects, each value must not be less than -1, "
                "because 1 + effect must stay non-negative."
                )

        fig, ax = plt.subplots(figsize=figsize)

        for group_name in groups_to_analyze:
            group_obj = self._groups[group_name]
            test = group_obj._test
            data = group_obj._data
            pipeline = group_obj._pipeline

            config = group_configs.get(group_name, GroupVisualizationConfig())

            if hasattr(test, "value_column"):
                value_col = test.value_column
            elif hasattr(test, "numerator_name"):
                # Ratio metrics (e.g. DeltaRatioTtest): apply synthetic effect to the numerator;
                # denominator unchanged matches relative uplift on the ratio when D is fixed.
                value_col = test.numerator_name
            else:
                raise TypeError(
                    f"Power curve simulation requires a test with 'value_column' or "
                    f"'numerator_name'; got {type(test).__name__}."
                )

            # Resolve the split column once per group. Bug fix: previously the
            # resolved column was written back into the `group_col` parameter
            # inside the effect loop, so every subsequent group silently reused
            # the FIRST group's splitter column even when its own pipeline
            # splits on a different one.
            if group_col:
                split_col = group_col
            else:
                splitter = next(
                    (tr for tr in pipeline.transformers if isinstance(tr, GroupSplitter)),
                    None,
                )
                if splitter is None:
                    raise ValueError("No GroupSplitter found in the pipeline. Cannot determine group column.")
                split_col = splitter.column

            powers = []
            ci_lows = []
            ci_ups = []

            for eff in effect_grid:
                if effect_type == "absolute":
                    current_effect_modifier = effect_modifiers.GroupModifier(
                        effects={1: eff},
                        value_column=value_col,
                        group_column=split_col,
                        method=operator.add,
                    )
                else:  # relative
                    current_effect_modifier = effect_modifiers.GroupModifier(
                        effects={1: 1 + eff},
                        value_column=value_col,
                        group_column=split_col,
                        method=operator.mul,
                    )

                try:
                    n_rejects, _ = simulate_power_for_effect(
                        test=test,
                        data=data,
                        pipeline=pipeline,
                        effect_modifier=current_effect_modifier,
                        n_iter=n_iter,
                        alpha=self.alpha,
                        effect_size=eff,
                        progress_desc=f"{group_name}: Effect={eff:.3f}"
                    )
                except Exception as e:
                    # Best-effort: a failed grid point becomes a gap in the
                    # curve instead of aborting the whole figure.
                    print(f"Error simulating for group '{group_name}', effect={eff:.3f}: {e}. Skipping...")
                    powers.append(np.nan)
                    ci_lows.append(np.nan)
                    ci_ups.append(np.nan)
                    continue

                power_est = n_rejects / n_iter
                low, high = proportion_confint(count=n_rejects, nobs=n_iter, method='wilson')
                powers.append(power_est)
                ci_lows.append(low)
                ci_ups.append(high)

            plot_kwargs = {'label': group_name}
            fill_kwargs = {'alpha': 0.2}
            if config.color:
                plot_kwargs['color'] = config.color
                # Bug fix: the CI band color was computed but never passed to
                # fill_between, so the band ignored the per-group color.
                fill_kwargs['color'] = config.color

            ax.plot(effect_grid, powers, linewidth=3, **plot_kwargs)

            ax.fill_between(effect_grid, ci_lows, ci_ups, **fill_kwargs)

        ax.axhline(target_power, color="#20b2aa", linestyle="--", linewidth=2, label=t('target_power', lang))

        if alpha_line_on:
            ax.axhline(self.alpha, color="#ff2400", linestyle="--", linewidth=2, label=t('significance_level', lang))

        ax.set_xlabel(t('effect_size', lang))
        ax.set_ylabel(t('power', lang))
        ax.set_xlim(left=min(effect_grid), right=max(effect_grid)* 1.03)
        ax.set_ylim(bottom=0, top=1.03)
        ax.legend(loc='lower right')
        ax.grid(True)

        fig.suptitle(t('power_curve_plot', lang), fontsize=14, fontweight='bold')
        fig.tight_layout()

        return fig, ax

__init__

__init__(alpha=0.05, experiment_name: Optional[str] = 'AB experiment', visualization_method: Optional[Callable[[Dict[str, ExperimentData], Dict[str, Any]], tuple[Figure, Any]]] = default_visualization_method, language: Language = 'eng', **visualization_kwargs)

Create a new experiment. Refer to the class description for more information.

PARAMETER DESCRIPTION
alpha

Significance level for statistical tests.

TYPE: float DEFAULT: 0.05

experiment_name

Name of the experiment to display.

TYPE: str DEFAULT: 'AB experiment'

visualization_method

Visualization function used to draw experiment results.

TYPE: Optional[Callable] DEFAULT: default_visualization_method

language

Default language for plot labels and titles.

TYPE: Language DEFAULT: 'eng'

**visualization_kwargs

Additional arguments for the visualization.

DEFAULT: {}

Source code in aboba/experiment/aboba_experiment.py
def __init__(
    self,
    alpha=0.05,
    experiment_name: Optional[str] = "AB experiment",
    visualization_method: Optional[
        Callable[[Dict[str, ExperimentData], Dict[str, Any]], tuple[Figure, Any]]
    ] = default_visualization_method,
    language: Language = "eng",
    **visualization_kwargs,
):
    """
    Create a new experiment.
    Refer to the class description for more information.

    Args:
        alpha (float): Significance level for statistical tests.
        experiment_name (str): Name of the experiment to display.
        visualization_method (Optional[Callable]): Visualization function used to draw experiment results.
        language (Language): Default language for plot labels and titles.
        **visualization_kwargs: Additional arguments for the visualization.
    """


    assert 0.0 < alpha < 1.0

    self.alpha = alpha
    self.experiment_name = experiment_name
    self.visualization_method = visualization_method
    self.default_language = language

    visualization_kwargs["alpha"] = visualization_kwargs.get("alpha", alpha)
    visualization_kwargs["experiment_name"] = visualization_kwargs.get(
        "experiment_name", experiment_name
    )
    self.visualization_kwargs = visualization_kwargs

    self._groups: Dict[str, ExperimentGroup] = {}

group

group(name: str, test: BaseTest, data: Union[DataFrame, List[DataFrame]], data_pipeline: Pipeline, synthetic_effect: Optional[EffectModifier] = None, n_iter: int = 1, joblib_kwargs: Optional[dict] = None) -> ExperimentGroup

Creates new context for experiment with specified name.

PARAMETER DESCRIPTION
name

Name to use for this experiment subset.

TYPE: str

test

Statistical test to run.

TYPE: BaseTest

data

Input data for the experiment.

TYPE: Union[DataFrame, List[DataFrame]]

data_pipeline

Pipeline used to prepare data before testing.

TYPE: Pipeline

synthetic_effect

Synthetic effect applied before test execution.

TYPE: Optional[EffectModifier] DEFAULT: None

n_iter

Number of repeated test runs.

TYPE: int DEFAULT: 1

joblib_kwargs

Additional keyword arguments for parallel execution.

TYPE: Optional[dict] DEFAULT: None

RETURNS DESCRIPTION
ExperimentGroup

Registered experiment group.

TYPE: ExperimentGroup

Source code in aboba/experiment/aboba_experiment.py
def group(
    self,
    name: str,
    test: BaseTest,
    data: Union[pd.DataFrame, List[pd.DataFrame]],
    data_pipeline: Pipeline,
    synthetic_effect: Optional[EffectModifier] = None,
    n_iter: int = 1,
    joblib_kwargs: Optional[dict] = None,
) -> ExperimentGroup:

    """
    Creates new context for experiment with specified name.

    Args:
        name (str): Name to use for this experiment subset.
        test (BaseTest): Statistical test to run.
        data (Union[pd.DataFrame, List[pd.DataFrame]]): Input data for the experiment.
        data_pipeline (Pipeline): Pipeline used to prepare data before testing.
        synthetic_effect (Optional[EffectModifier]): Synthetic effect applied before test execution.
        n_iter (int): Number of repeated test runs.
        joblib_kwargs (Optional[dict]): Additional keyword arguments for parallel execution.

    Returns:
        ExperimentGroup: Registered experiment group.
    """

    # TODO: raise one-time warning, if needed
    # assert name not in self._groups, (
    #     f"Trying to create group with {name = } but "
    #     f"it is already defined ({self._groups.keys()})"
    # )

    group = ExperimentGroup(
        name,
        test,
        data,
        data_pipeline,
        synthetic_effect,
        n_iter,
        joblib_kwargs,
    )
    self._groups[name] = group

    return group

draw

draw(groups: Optional[List[str]] = None, group_configs: Optional[Dict[str, GroupVisualizationConfig]] = None, lang: Language = None, figsize: Optional[Tuple[float, float]] = None, filter_empty: bool = True, **kwargs) -> Tuple[Optional[Figure], Any]
Draw visualization in the fixed 3-panel AA layout
  • Top: confidence interval for alpha
  • Bottom-left: p-value histogram
  • Bottom-right: ECDF of p-values

This is the only supported layout now.

PARAMETER DESCRIPTION
groups

List of group names to visualize (None = all groups)

TYPE: Optional[List[str]] DEFAULT: None

group_configs

Per-group configuration (e.g., color)

TYPE: Optional[Dict[str, GroupVisualizationConfig]] DEFAULT: None

lang

Language for labels ('en' or 'ru')

TYPE: Language DEFAULT: None

figsize

Custom figure size

TYPE: Optional[Tuple[float, float]] DEFAULT: None

filter_empty

Skip empty groups

TYPE: bool DEFAULT: True

**kwargs

Passed to draw_aa_experiment_layout

DEFAULT: {}

RETURNS DESCRIPTION
Tuple[Optional[Figure], Any]

Tuple of (Figure, axes_array) where axes_array[i] = [ax_interval, ax_hist, ax_ecdf]

Source code in aboba/experiment/aboba_experiment.py
def draw(
    self,
    groups: Optional[List[str]] = None,
    group_configs: Optional[Dict[str, GroupVisualizationConfig]] = None,
    lang: Language = None,
    figsize: Optional[Tuple[float, float]] = None,
    filter_empty: bool = True,
    **kwargs
) -> Tuple[Optional[Figure], Any]:
    """
    Draw visualization in the fixed 3-panel AA layout:
      - Top: confidence interval for alpha
      - Bottom-left: p-value histogram
      - Bottom-right: ECDF of p-values

    This is the only supported layout now.

    Args:
        groups: List of group names to visualize (None = all groups)
        group_configs: Per-group configuration (e.g., color)
        lang: Language for labels ('en' or 'ru')
        figsize: Custom figure size
        filter_empty: Skip empty groups
        **kwargs: Passed to draw_aa_experiment_layout

    Returns:
        Tuple of (Figure, axes_array) where axes_array[i] = [ax_interval, ax_hist, ax_ecdf]
    """
    if self.visualization_method is None:
        return None, None

    if lang is None:
        lang = self.default_language

    # Use our new fixed layout
    from aboba.experiment.visualization import draw_aa_experiment_layout
    viz_method = draw_aa_experiment_layout

    viz_kwargs = {
        **self.visualization_kwargs,
        "groups_list": groups,
        "group_configs": group_configs,
        "lang": lang,
        "figsize": figsize,
        "filter_empty": filter_empty,
        **kwargs
    }

    groups_to_get = groups if groups is not None else list(self._groups.keys())
    values = {key: self._groups[key].get_raw_data() 
            for key in groups_to_get if key in self._groups}

    return viz_method(values, **viz_kwargs)

draw_comparison

draw_comparison(group_pairs: List[Tuple[str, str]], separate_pairs: bool = False, **kwargs) -> Tuple[Optional[Figure], Any]

Draw side-by-side comparison of group pairs.

PARAMETER DESCRIPTION
group_pairs

List of (group1, group2) tuples

TYPE: List[Tuple[str, str]]

separate_pairs

If True, groups are arranged in pairs (g1, g2, g1, g2, ...) If False, duplicates are removed

TYPE: bool DEFAULT: False

**kwargs

Additional draw arguments

DEFAULT: {}

Source code in aboba/experiment/aboba_experiment.py
def draw_comparison(
    self,
    group_pairs: List[Tuple[str, str]],
    separate_pairs: bool = False,
    **kwargs
) -> Tuple[Optional[Figure], Any]:
    """
    Draw side-by-side comparison of group pairs.

    Args:
        group_pairs: List of (group1, group2) tuples
        separate_pairs: If True, groups are arranged in pairs (g1, g2, g1, g2, ...)
                    If False, duplicates are removed
        **kwargs: Additional draw arguments
    """
    if separate_pairs:
        # Show every group including duplicates (pairs side by side)
        all_groups = []
        for g1, g2 in group_pairs:
            all_groups.extend([g1, g2])
    else:
        # Remove duplicates, keeping first-appearance order
        all_groups = []
        seen = set()
        for g1, g2 in group_pairs:
            for g in [g1, g2]:
                if g not in seen:
                    all_groups.append(g)
                    seen.add(g)

    return self.draw(groups=all_groups, **kwargs)

quick_summary

quick_summary() -> pd.DataFrame

Get DataFrame with summary statistics for all groups.

RETURNS DESCRIPTION
DataFrame

DataFrame with columns: group_name, n_iterations, n_errors,

DataFrame

real_alpha, ci_left, ci_right, rejection_rate

Example

summary = experiment.quick_summary() print(summary.sort_values('real_alpha'))

Source code in aboba/experiment/aboba_experiment.py
def quick_summary(self) -> pd.DataFrame:
    """
    Get DataFrame with summary statistics for all groups.

    Returns:
        DataFrame with columns: group_name, n_iterations, n_errors,
        real_alpha, ci_left, ci_right, rejection_rate

    Example:
        summary = experiment.quick_summary()
        print(summary.sort_values('real_alpha'))
    """
    from aboba.utils.alpha_interval import calculate_real_alpha

    summary = []
    for name, group in self._groups.items():
        data = group.get_raw_data()
        if data.is_empty():
            continue

        n_iter = len(data.history)
        pvals = [tr.pvalue for tr in data.history]
        n_errors = sum(int(p < self.alpha) for p in pvals)
        real_alpha, left_alpha, right_alpha = calculate_real_alpha(
            n_iter=n_iter, n_errors=n_errors
        )

        summary.append({
            'group_name': name,
            'n_iterations': n_iter,
            'n_errors': n_errors,
            'real_alpha': real_alpha,
            'ci_left': left_alpha,
            'ci_right': right_alpha,
            'rejection_rate': n_errors / n_iter if n_iter > 0 else 0.0,
            'mean_pvalue': np.mean(pvals),
            'median_pvalue': np.median(pvals),
        })

    return pd.DataFrame(summary)

draw_power_curve

draw_power_curve(effect_grid: Optional[List[float]] = None, effect_type: Literal['absolute', 'relative'] = 'absolute', n_iter: int = 500, target_power: float = 0.8, groups: Optional[List[str]] = None, group_configs: Optional[Dict[str, GroupVisualizationConfig]] = None, group_col: str = None, lang: Language = None, figsize: Tuple[float, float] = (9, 6), alpha_line_on: bool = True, **kwargs) -> Tuple[plt.Figure, plt.Axes]

Draw power curves: statistical power vs effect size for one or more groups using simulation.

Uses the specified groups' test, pipeline, and data as templates. For each group, it runs n_iter simulations for each effect size in effect_grid and estimates the proportion of rejections (power) with confidence intervals. Curves are plotted on the same axes for comparison.

Parameters

effect_grid : List[float], optional Grid of effect values used to simulate power. Interpretation depends on effect_type: - "absolute": additive effect applied to test group values (value + effect); - "relative": multiplicative effect applied to test group values as (value * (1 + effect)). For ratio tests (e.g. DeltaRatioTtest), the effect is applied to the numerator column only. If None: - for "absolute": defaults to np.linspace(0.0, 0.6, 20) - for "relative": defaults to np.linspace(0.0, 0.6, 20) effect_type : Literal["absolute", "relative"], default "absolute" Type of effect application for the test group: - "absolute": additive shift; - "relative": relative change via factor 1 + effect. n_iter : int, default 500 Number of simulations per effect size per group (trade-off: speed vs precision). target_power : float, default 0.8 Horizontal line indicating desired power level. groups : List[str], optional Names of groups to include in the power analysis. If None, defaults to all groups. group_configs : Dict[str, GroupVisualizationConfig], optional Per-group configuration for styling (e.g., color, linestyle). Falls back to default if not provided for a group. lang : str, default "en" Language for labels ('en' or 'ru'). figsize: Tuple[float, float], default (9, 6) Figure size. alpha_line_on : bool, default True Whether to draw the horizontal line for the significance level (alpha). **kwargs : dict Passed to simulate_power_for_effect.

Returns

fig : matplotlib.figure.Figure ax : matplotlib.axes.Axes

Source code in aboba/experiment/aboba_experiment.py
def draw_power_curve(
    self,
    effect_grid: Optional[List[float]] = None,
    effect_type: Literal["absolute", "relative"] = "absolute",
    n_iter: int = 500,
    target_power: float = 0.8,
    groups: Optional[List[str]] = None,
    group_configs: Optional[Dict[str, GroupVisualizationConfig]] = None,
    group_col: Optional[str] = None,
    lang: Optional[Language] = None,
    figsize: Tuple[float, float] = (9, 6),
    alpha_line_on: bool = True,
    **kwargs
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Draw power curves: statistical power vs effect size for one or more groups using simulation.

    Uses the specified groups' test, pipeline, and data as templates.
    For each group, it runs n_iter simulations for each effect size in `effect_grid`
    and estimates the proportion of rejections (power) with confidence intervals.
    Curves are plotted on the same axes for comparison.

    Parameters
    ----------
    effect_grid : List[float], optional
        Grid of effect values used to simulate power.
        Interpretation depends on `effect_type`:
        - "absolute": additive effect applied to test group values (`value + effect`);
        - "relative": multiplicative effect applied to test group values as (`value * (1 + effect)`).
        For ratio tests (e.g. ``DeltaRatioTtest``), the effect is applied to the numerator column only.
        If None, defaults to np.linspace(0.0, 0.6, 20) for both effect types.
    effect_type : Literal["absolute", "relative"], default "absolute"
        Type of effect application for the test group:
        - "absolute": additive shift;
        - "relative": relative change via factor `1 + effect`.
    n_iter : int, default 500
        Number of simulations per effect size per group (trade-off: speed vs precision).
    target_power : float, default 0.8
        Horizontal line indicating desired power level.
    groups : List[str], optional
        Names of groups to include in the power analysis. If None, defaults to all groups.
    group_configs : Dict[str, GroupVisualizationConfig], optional
        Per-group configuration for styling (e.g., color, linestyle).
        Falls back to default if not provided for a group.
    group_col : str, optional
        Name of the column that holds the group label. If None, it is inferred
        from the first ``GroupSplitter`` found in the group's pipeline.
    lang : str, default "en"
        Language for labels ('en' or 'ru').
    figsize: Tuple[float, float], default (9, 6)
        Figure size.
    alpha_line_on : bool, default True
        Whether to draw the horizontal line for the significance level (alpha).
    **kwargs : dict
        Passed to `simulate_power_for_effect`.

    Returns
    -------
    fig : matplotlib.figure.Figure
    ax : matplotlib.axes.Axes
    """
    from aboba.utils.power_analysis import simulate_power_for_effect
    from statsmodels.stats.proportion import proportion_confint

    if lang is None:
        lang = self.default_language
    if isinstance(groups, str):
        groups = [groups]
    # If 'groups' is None, use all group names from the experiment
    groups_to_analyze = groups if groups is not None else list(self._groups.keys())

    # Filter to ensure only existing groups are processed
    groups_to_analyze = [g for g in groups_to_analyze if g in self._groups]

    if not groups_to_analyze:
        print("No groups specified or available for power curve analysis.")
        return None, None

    group_configs = group_configs or {}

    if effect_grid is None:
        effect_grid = np.linspace(0.0, 0.6, 20).tolist()

    if effect_type == "relative":
        # 1 + effect must stay non-negative or the multiplicative modifier
        # would flip the sign of the metric.
        invalid = [eff for eff in effect_grid if eff < -1]
        if invalid:
            raise ValueError(
            "For relative effects, each value must not be less than -1, "
            "because 1 + effect must stay non-negative."
            )

    fig, ax = plt.subplots(figsize=figsize)

    for group_name in groups_to_analyze:
        group_obj = self._groups[group_name]
        test = group_obj._test
        data = group_obj._data
        pipeline = group_obj._pipeline

        config = group_configs.get(group_name, GroupVisualizationConfig())

        if hasattr(test, "value_column"):
            value_col = test.value_column
        elif hasattr(test, "numerator_name"):
            # Ratio metrics (e.g. DeltaRatioTtest): apply synthetic effect to the numerator;
            # denominator unchanged matches relative uplift on the ratio when D is fixed.
            value_col = test.numerator_name
        else:
            raise TypeError(
                f"Power curve simulation requires a test with 'value_column' or "
                f"'numerator_name'; got {type(test).__name__}."
            )

        # Resolve the group column once per group: it does not depend on the
        # effect size, so the lookup is hoisted out of the effect loop.
        if not group_col:
            splitter = None
            for transformer in pipeline.transformers:
                if isinstance(transformer, GroupSplitter):
                    splitter = transformer
                    break

            if splitter is None:
                raise ValueError("No GroupSplitter found in the pipeline. Cannot determine group column.")

            group_col = splitter.column

        powers = []
        ci_lows = []
        ci_ups = []

        for eff in effect_grid:
            # Build the synthetic-effect modifier for the test group (label 1).
            if effect_type == "absolute":
                current_effect_modifier = effect_modifiers.GroupModifier(
                    effects={1: eff},
                    value_column=value_col,
                    group_column=group_col,
                    method=operator.add,
                )
            else:  # relative
                current_effect_modifier = effect_modifiers.GroupModifier(
                    effects={1: 1 + eff},
                    value_column=value_col,
                    group_column=group_col,
                    method=operator.mul,
                )

            try:
                # Forward **kwargs as documented ("Passed to
                # simulate_power_for_effect") -- previously they were dropped.
                n_rejects, _ = simulate_power_for_effect(
                    test=test,
                    data=data,
                    pipeline=pipeline,
                    effect_modifier=current_effect_modifier,
                    n_iter=n_iter,
                    alpha=self.alpha,
                    effect_size=eff,
                    progress_desc=f"{group_name}: Effect={eff:.3f}",
                    **kwargs
                )
            except Exception as e:
                # Best-effort: record NaNs so one failing effect size does not
                # abort the whole figure.
                print(f"Error simulating for group '{group_name}', effect={eff:.3f}: {e}. Skipping...")
                powers.append(np.nan)
                ci_lows.append(np.nan)
                ci_ups.append(np.nan)
                continue

            power_est = n_rejects / n_iter
            # Wilson interval behaves well for proportions near 0 and 1.
            low, high = proportion_confint(count=n_rejects, nobs=n_iter, method='wilson')
            powers.append(power_est)
            ci_lows.append(low)
            ci_ups.append(high)

        plot_kwargs = {'label': group_name}
        fill_kwargs = {'alpha': 0.2}
        if config.color:
            plot_kwargs['color'] = config.color
            # Bug fix: the CI band previously ignored the configured color
            # (the value was computed into a dead local and never used).
            fill_kwargs['color'] = config.color

        ax.plot(effect_grid, powers, linewidth=3, **plot_kwargs)
        ax.fill_between(effect_grid, ci_lows, ci_ups, **fill_kwargs)

    ax.axhline(target_power, color="#20b2aa", linestyle="--", linewidth=2, label=t('target_power', lang))

    if alpha_line_on:
        ax.axhline(self.alpha, color="#ff2400", linestyle="--", linewidth=2, label=t('significance_level', lang))

    ax.set_xlabel(t('effect_size', lang))
    ax.set_ylabel(t('power', lang))
    ax.set_xlim(left=min(effect_grid), right=max(effect_grid) * 1.03)
    ax.set_ylim(bottom=0, top=1.03)
    ax.legend(loc='lower right')
    ax.grid(True)

    fig.suptitle(t('power_curve_plot', lang), fontsize=14, fontweight='bold')
    fig.tight_layout()

    return fig, ax

ExperimentGroup

ExperimentGroup

Manages experiment subset.

Handles running tests multiple times, applying synthetic effects, and collecting results from the pipeline and test.

Source code in aboba/experiment/experiment_group.py
class ExperimentGroup:
    """
    Manages experiment subset.

    Handles running tests multiple times, applying synthetic effects,
    and collecting results from the pipeline and test.
    """

    def __init__(
        self,
        name: str,
        test: BaseTest,
        data: Union[pd.DataFrame, List[pd.DataFrame]],
        data_pipeline: Pipeline,
        synthetic_effect: Optional[EffectModifier] = None,
        n_iter: int = 1,
        joblib_kwargs: Optional[dict] = None,
    ):
        """
        Args:
            name: Human-readable group name, used to label result rows.
            test: Test object whose ``test`` method is invoked per iteration.
            data: Input data passed through the pipeline on every run.
            data_pipeline: Pipeline fitted once here and applied per iteration.
            synthetic_effect: Optional modifier applied to the split groups
                before testing (e.g. to inject a known effect).
            n_iter: Number of test repetitions per ``run`` call.
            joblib_kwargs: Extra keyword arguments for ``joblib.Parallel``.
        """
        self._experiment_data = ExperimentData()
        self._name = name
        self._test = test
        self._data = data
        self._pipeline = data_pipeline
        self._synthetic_effect = synthetic_effect
        self._n_iter = n_iter
        # Normalize up front so the rest of the class can assume a dict.
        self._joblib_kwargs = dict() if joblib_kwargs is None else joblib_kwargs

        self._pipeline.fit(self._data)


    def run(self):
        """
        Run test multiple times in parallel and store results in currently activated experiment group.

        Returns:
            self: For method chaining
        """
        results = joblib.Parallel(**self._joblib_kwargs)(
            joblib.delayed(self._run_one)() for _ in range(self._n_iter)
        )

        for result in results:
            self._experiment_data.record(result)

        return self


    def get_raw_data(self) -> ExperimentData:
        """Return the accumulated ExperimentData (full result history)."""
        return self._experiment_data


    def get_data(self) -> pd.DataFrame:
        """
        Return all recorded test results as a DataFrame.

        The first column, 'Test name', holds '<group name>-<iteration index>';
        the remaining columns mirror the fields of TestResult.
        """
        test_result_cols = [field.name for field in fields(TestResult)]

        # Convert each result to a dict once. The previous implementation
        # called asdict() once per field per row (len(fields) * len(history)
        # conversions of the same objects).
        rows = [asdict(item) for item in self._experiment_data.history]

        result = pd.DataFrame(rows, columns=test_result_cols)
        result.insert(0, 'Test name', [f'{self._name}-{i}' for i in range(len(rows))])

        return result


    def _run_one(self):
        """Run the pipeline and the test once; return a TestResult."""
        pipeline_result = self._pipeline.transform(self._data)

        # A pipeline may return (groups, artifacts) or just groups.
        if isinstance(pipeline_result, tuple) and len(pipeline_result) == 2:
            groups, artifacts = pipeline_result
        else:
            groups = pipeline_result
            artifacts = None

        groups = self._add_effect(groups, self._synthetic_effect)

        result = self._test.test(groups, artefacts=artifacts)
        assert isinstance(result, TestResult), f"Test {self._test} must return TestResult instance"

        return result


    @staticmethod
    def _add_effect(
        groups: List[pd.DataFrame], synthetic_effect: Optional[EffectModifier]
    ) -> List[pd.DataFrame]:
        """Apply the synthetic effect to copies of the groups; no-op if None."""
        if synthetic_effect is None:
            return groups

        # Copy first so the modifier cannot mutate the caller's frames in place.
        modified = synthetic_effect([group.copy() for group in groups])
        assert len(modified) == len(
            groups
        ), f"Effect modifier {synthetic_effect} must not change number of groups"
        return modified
run

run()

Run test multiple times in parallel and store results in currently activated experiment group.

Returns

self : ExperimentGroup
    The group itself, for method chaining.

Source code in aboba/experiment/experiment_group.py
def run(self):
    """
    Run test multiple times in parallel and store results in currently activated experiment group.

    Returns:
        self: For method chaining
    """
    # Fan the single-iteration runner out across self._n_iter parallel tasks.
    tasks = (joblib.delayed(self._run_one)() for _ in range(self._n_iter))
    outcomes = joblib.Parallel(**self._joblib_kwargs)(tasks)

    # Record every outcome in this group's history, in completion order.
    for outcome in outcomes:
        self._experiment_data.record(outcome)

    return self