
Benchmarks

Performance benchmarking and comparison utilities for statistical metrics.

Performance benchmarks and accuracy verification for statistical functions (Aero Protocol Compliant).

This module provides tools to benchmark the execution time of various metrics and verify their mathematical accuracy against known values.

AccuracyVerification

Suite for verifying the mathematical correctness of statistical functions.

Source code in src/monet_stats/benchmarks.py
class AccuracyVerification:
    """
    Suite for verifying the mathematical correctness of statistical functions.
    """

    def __init__(self, tolerance: float = 1e-10) -> None:
        """
        Initialize the verification suite.

        Parameters
        ----------
        tolerance : float, optional
            Numerical tolerance for floating-point comparisons. Default is 1e-10.
        """
        self.tolerance = tolerance

    def test_known_values(self) -> Dict[str, Dict[str, Any]]:
        """
        Run a series of tests against analytically known values.

        Returns
        -------
        Dict[str, Dict[str, Any]]
            Dictionary of test results including computed vs expected values.
        """
        results: Dict[str, Dict[str, Any]] = {}

        # Perfect correlation case
        obs_perfect = np.array([1, 2, 3, 4, 5])
        mod_perfect = np.array([1, 2, 3, 4, 5])

        # Test R2 for perfect correlation
        r2_result = R2(obs_perfect, mod_perfect)
        results["R2_perfect"] = {
            "computed": float(r2_result),
            "expected": 1.0,
            "passed": bool(np.isclose(r2_result, 1.0, atol=self.tolerance)),
        }

        # Test correlation for perfect correlation
        corr_result = correlation(obs_perfect, mod_perfect)
        results["correlation_perfect"] = {
            "computed": float(corr_result),
            "expected": 1.0,
            "passed": bool(np.isclose(corr_result, 1.0, atol=self.tolerance)),
        }

        # Test RMSE for perfect match
        rmse_result = RMSE(obs_perfect, mod_perfect)
        results["RMSE_perfect"] = {
            "computed": float(rmse_result),
            "expected": 0.0,
            "passed": bool(np.isclose(rmse_result, 0.0, atol=self.tolerance)),
        }

        # Test MAE for perfect match
        mae_result = MAE(obs_perfect, mod_perfect)
        results["MAE_perfect"] = {
            "computed": float(mae_result),
            "expected": 0.0,
            "passed": bool(np.isclose(mae_result, 0.0, atol=self.tolerance)),
        }

        # Test NSE for perfect match
        nse_result = NSE(obs_perfect, mod_perfect)
        results["NSE_perfect"] = {
            "computed": float(nse_result),
            "expected": 1.0,
            "passed": bool(np.isclose(nse_result, 1.0, atol=self.tolerance)),
        }

        # Test with known bias
        obs_bias = np.ones(10)
        mod_bias = np.ones(10) * 2  # 100% bias
        mb_result = MB(obs_bias, mod_bias)
        # MB = mean(mod - obs) = (2 - 1) = 1.0
        expected_mb = 1.0
        results["MB_bias"] = {
            "computed": float(mb_result),
            "expected": expected_mb,
            "passed": bool(np.isclose(mb_result, expected_mb, atol=self.tolerance)),
        }

        # Test with known MAPE
        obs_mape = np.array([10, 10, 10])
        mod_mape = np.array([11, 9, 10])  # 10%, -10%, 0% errors
        mape_result = MAPE(obs_mape, mod_mape)
        expected_mape = (10 + 10 + 0) / 3  # Average absolute percentage error
        results["MAPE_known"] = {
            "computed": float(mape_result),
            "expected": expected_mape,
            "passed": bool(np.isclose(mape_result, expected_mape, atol=0.1)),
        }

        return results

    def print_accuracy_report(self) -> None:
        """Print a formatted accuracy report to the console."""
        results = self.test_known_values()

        print("\n" + "=" * 80)
        print("ACCURACY VERIFICATION REPORT")
        print("=" * 80)

        passed = 0
        total = len(results)

        for test_name, result in results.items():
            status = "PASS" if result["passed"] else "FAIL"
            print(
                f"{test_name:<20}: {status:<4} | Computed: {result['computed']:.6f}, Expected: {result['expected']:.6f}"
            )
            if result["passed"]:
                passed += 1

        print(f"\nAccuracy Summary: {passed}/{total} tests passed")

        if passed == total:
            print("✓ All accuracy tests PASSED!")
        else:
            print("✗ Some accuracy tests FAILED!")

__init__(tolerance=1e-10)

Initialize the verification suite.

Parameters

tolerance : float, optional
    Numerical tolerance for floating-point comparisons. Default is 1e-10.

Source code in src/monet_stats/benchmarks.py
def __init__(self, tolerance: float = 1e-10) -> None:
    """
    Initialize the verification suite.

    Parameters
    ----------
    tolerance : float, optional
        Numerical tolerance for floating-point comparisons. Default is 1e-10.
    """
    self.tolerance = tolerance

print_accuracy_report()

Print a formatted accuracy report to the console.

Source code in src/monet_stats/benchmarks.py
def print_accuracy_report(self) -> None:
    """Print a formatted accuracy report to the console."""
    results = self.test_known_values()

    print("\n" + "=" * 80)
    print("ACCURACY VERIFICATION REPORT")
    print("=" * 80)

    passed = 0
    total = len(results)

    for test_name, result in results.items():
        status = "PASS" if result["passed"] else "FAIL"
        print(
            f"{test_name:<20}: {status:<4} | Computed: {result['computed']:.6f}, Expected: {result['expected']:.6f}"
        )
        if result["passed"]:
            passed += 1

    print(f"\nAccuracy Summary: {passed}/{total} tests passed")

    if passed == total:
        print("✓ All accuracy tests PASSED!")
    else:
        print("✗ Some accuracy tests FAILED!")

test_known_values()

Run a series of tests against analytically known values.

Returns

Dict[str, Dict[str, Any]]
    Dictionary of test results including computed vs expected values.

Source code in src/monet_stats/benchmarks.py
def test_known_values(self) -> Dict[str, Dict[str, Any]]:
    """
    Run a series of tests against analytically known values.

    Returns
    -------
    Dict[str, Dict[str, Any]]
        Dictionary of test results including computed vs expected values.
    """
    results: Dict[str, Dict[str, Any]] = {}

    # Perfect correlation case
    obs_perfect = np.array([1, 2, 3, 4, 5])
    mod_perfect = np.array([1, 2, 3, 4, 5])

    # Test R2 for perfect correlation
    r2_result = R2(obs_perfect, mod_perfect)
    results["R2_perfect"] = {
        "computed": float(r2_result),
        "expected": 1.0,
        "passed": bool(np.isclose(r2_result, 1.0, atol=self.tolerance)),
    }

    # Test correlation for perfect correlation
    corr_result = correlation(obs_perfect, mod_perfect)
    results["correlation_perfect"] = {
        "computed": float(corr_result),
        "expected": 1.0,
        "passed": bool(np.isclose(corr_result, 1.0, atol=self.tolerance)),
    }

    # Test RMSE for perfect match
    rmse_result = RMSE(obs_perfect, mod_perfect)
    results["RMSE_perfect"] = {
        "computed": float(rmse_result),
        "expected": 0.0,
        "passed": bool(np.isclose(rmse_result, 0.0, atol=self.tolerance)),
    }

    # Test MAE for perfect match
    mae_result = MAE(obs_perfect, mod_perfect)
    results["MAE_perfect"] = {
        "computed": float(mae_result),
        "expected": 0.0,
        "passed": bool(np.isclose(mae_result, 0.0, atol=self.tolerance)),
    }

    # Test NSE for perfect match
    nse_result = NSE(obs_perfect, mod_perfect)
    results["NSE_perfect"] = {
        "computed": float(nse_result),
        "expected": 1.0,
        "passed": bool(np.isclose(nse_result, 1.0, atol=self.tolerance)),
    }

    # Test with known bias
    obs_bias = np.ones(10)
    mod_bias = np.ones(10) * 2  # 100% bias
    mb_result = MB(obs_bias, mod_bias)
    # MB = mean(mod - obs) = (2 - 1) = 1.0
    expected_mb = 1.0
    results["MB_bias"] = {
        "computed": float(mb_result),
        "expected": expected_mb,
        "passed": bool(np.isclose(mb_result, expected_mb, atol=self.tolerance)),
    }

    # Test with known MAPE
    obs_mape = np.array([10, 10, 10])
    mod_mape = np.array([11, 9, 10])  # 10%, -10%, 0% errors
    mape_result = MAPE(obs_mape, mod_mape)
    expected_mape = (10 + 10 + 0) / 3  # Average absolute percentage error
    results["MAPE_known"] = {
        "computed": float(mape_result),
        "expected": expected_mape,
        "passed": bool(np.isclose(mape_result, expected_mape, atol=0.1)),
    }

    return results
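For programmatic checks (for example in a CI job), the returned dictionary can be inspected directly instead of printing a report. A short sketch, again assuming the import path above:

from monet_stats.benchmarks import AccuracyVerification

results = AccuracyVerification().test_known_values()

# Collect any tests whose computed value fell outside the tolerance.
failures = {name: r for name, r in results.items() if not r["passed"]}
assert not failures, f"Accuracy checks failed: {sorted(failures)}"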

PerformanceBenchmark

Performance benchmarking suite for statistical functions.

This class enables timing analysis of metrics across different backends (NumPy, Xarray, Dask) and data sizes.

Source code in src/monet_stats/benchmarks.py
class PerformanceBenchmark:
    """
    Performance benchmarking suite for statistical functions.

    This class enables timing analysis of metrics across different backends
    (NumPy, Xarray, Dask) and data sizes.
    """

    def __init__(self) -> None:
        """Initialize the benchmark suite with an empty results dictionary."""
        self.results: Dict[int, Dict[str, Any]] = {}

    def generate_test_data(
        self, size: int, data_type: str = "numpy", chunks: Optional[Dict[str, int]] = None
    ) -> Tuple[Union[np.ndarray, xr.DataArray], Union[np.ndarray, xr.DataArray]]:
        """
        Generate synthetic test data for benchmarking.

        Parameters
        ----------
        size : int
            Number of data points to generate.
        data_type : str, optional
            Type of data to generate ('numpy' or 'xarray'). Default is 'numpy'.
        chunks : Optional[Dict[str, int]], optional
            Dask chunk sizes if data_type is 'xarray' and lazy evaluation is desired.

        Returns
        -------
        Tuple[Union[np.ndarray, xr.DataArray], Union[np.ndarray, xr.DataArray]]
            A tuple containing (obs, mod) arrays.
        """
        np.random.seed(42)  # For reproducible results
        obs_raw = np.random.normal(10, 3, size)
        mod_raw = obs_raw + np.random.normal(0, 1, size)  # Add some error

        if data_type == "xarray":
            obs = xr.DataArray(obs_raw, dims=["time"], coords={"time": np.arange(size)}, name="obs")
            mod = xr.DataArray(mod_raw, dims=["time"], coords={"time": np.arange(size)}, name="mod")
            if chunks:
                obs = obs.chunk(chunks)
                mod = mod.chunk(chunks)
            return obs, mod

        return obs_raw, mod_raw

    def benchmark_function(
        self,
        func: Callable,
        obs: Union[np.ndarray, xr.DataArray],
        mod: Union[np.ndarray, xr.DataArray],
        runs: int = 100,
    ) -> Dict[str, Any]:
        """
        Benchmark a single statistical function.

        .. note::
           For Dask-backed arrays, this function explicitly calls `.compute()`
           to measure the full execution time of the calculation.

        Parameters
        ----------
        func : Callable
            The function to benchmark.
        obs : Union[np.ndarray, xr.DataArray]
            Observed values.
        mod : Union[np.ndarray, xr.DataArray]
            Model values.
        runs : int, optional
            Number of iterations for averaging. Default is 100.

        Returns
        -------
        Dict[str, Any]
            Dictionary containing 'avg_time', 'std_time', 'result', and 'runs'.
        """
        times: List[float] = []
        results: List[Any] = []

        # Warm-up run (especially for Dask/JIT if applicable)
        _ = func(obs, mod)

        for _ in range(runs):
            start_time = time.perf_counter()
            result = func(obs, mod)
            # For Dask-backed results, explicitly trigger computation so the timing
            # captures the full wall time of the calculation; the metric functions
            # themselves remain lazy, keeping the "No Hidden Computes" protocol intact.
            if hasattr(result, "compute"):
                result = result.compute()

            end_time = time.perf_counter()

            times.append(end_time - start_time)
            results.append(result)

        avg_time = float(np.mean(times))
        std_time = float(np.std(times))

        return {
            "avg_time": avg_time,
            "std_time": std_time,
            "result": results[0],
            "runs": runs,
        }

    def run_all_benchmarks(self, sizes: Optional[List[int]] = None) -> Dict[int, Dict[str, Any]]:
        """
        Run benchmarks for a standard set of functions across multiple sizes.

        Parameters
        ----------
        sizes : Optional[List[int]], optional
            List of data sizes to test. Defaults to [100, 1000, 10000].

        Returns
        -------
        Dict[int, Dict[str, Any]]
            Comprehensive benchmark results indexed by size.
        """
        if sizes is None:
            sizes = [100, 1000, 10000]

        functions = {
            "MAE": MAE,
            "RMSE": RMSE,
            "MB": MB,
            "R2": R2,
            "NSE": NSE,
            "MAPE": MAPE,
            "MASE": MASE,
            "MedAE": MedAE,
            "sMAPE": sMAPE,
            "NMB": NMB,
            "FB": FB,
            "FE": FE,
            "stats_pearsonr": stats_pearsonr,
            "rmse_util": rmse,
            "mae_util": mae,
            "corr_util": correlation,
        }

        results: Dict[int, Dict[str, Any]] = {}

        for size in sizes:
            print(f"Benchmarking with data size: {size:,}")
            obs, mod = self.generate_test_data(size)

            size_results: Dict[str, Any] = {}
            for name, func in functions.items():
                try:
                    bench_result = self.benchmark_function(func, obs, mod)
                    size_results[name] = bench_result
                except Exception as e:
                    print(f"Error benchmarking {name}: {e!s}")
                    size_results[name] = {"error": str(e)}

            results[size] = size_results

        self.results = results
        return results

    def print_benchmark_report(self) -> None:
        """Print a formatted performance report to the console."""
        print("\n" + "=" * 80)
        print("PERFORMANCE BENCHMARK REPORT")
        print("=" * 80)

        for size, size_results in self.results.items():
            print(f"\nData Size: {size:,} elements")
            print("-" * 40)

            # Sort by average time
            sorted_results = sorted(
                size_results.items(),
                key=lambda x: x[1].get("avg_time", float("inf")) if isinstance(x[1], dict) else float("inf"),
            )

            for name, result in sorted_results:
                if isinstance(result, dict) and "error" not in result:
                    avg_time = result["avg_time"]
                    std_time = result["std_time"]
                    print(f"{name:<20}: {avg_time * 1000:>8.4f}±{std_time * 1000:.4f} ms")
                elif isinstance(result, dict):
                    print(f"{name:<20}: ERROR - {result.get('error')}")
                else:
                    print(f"{name:<20}: ERROR - Unknown result format")

__init__()

Initialize the benchmark suite with an empty results dictionary.

Source code in src/monet_stats/benchmarks.py
def __init__(self) -> None:
    """Initialize the benchmark suite with an empty results dictionary."""
    self.results: Dict[int, Dict[str, Any]] = {}

benchmark_function(func, obs, mod, runs=100)

Benchmark a single statistical function.

Note: For Dask-backed arrays, this function explicitly calls .compute() to measure the full execution time of the calculation.

Parameters

func : Callable
    The function to benchmark.
obs : Union[np.ndarray, xr.DataArray]
    Observed values.
mod : Union[np.ndarray, xr.DataArray]
    Model values.
runs : int, optional
    Number of iterations for averaging. Default is 100.

Returns

Dict[str, Any]
    Dictionary containing 'avg_time', 'std_time', 'result', and 'runs'.

Source code in src/monet_stats/benchmarks.py
def benchmark_function(
    self,
    func: Callable,
    obs: Union[np.ndarray, xr.DataArray],
    mod: Union[np.ndarray, xr.DataArray],
    runs: int = 100,
) -> Dict[str, Any]:
    """
    Benchmark a single statistical function.

    .. note::
       For Dask-backed arrays, this function explicitly calls `.compute()`
       to measure the full execution time of the calculation.

    Parameters
    ----------
    func : Callable
        The function to benchmark.
    obs : Union[np.ndarray, xr.DataArray]
        Observed values.
    mod : Union[np.ndarray, xr.DataArray]
        Model values.
    runs : int, optional
        Number of iterations for averaging. Default is 100.

    Returns
    -------
    Dict[str, Any]
        Dictionary containing 'avg_time', 'std_time', 'result', and 'runs'.
    """
    times: List[float] = []
    results: List[Any] = []

    # Warm-up run (especially for Dask/JIT if applicable)
    _ = func(obs, mod)

    for _ in range(runs):
        start_time = time.perf_counter()
        result = func(obs, mod)
        # For Dask-backed results, explicitly trigger computation so the timing
        # captures the full wall time of the calculation; the metric functions
        # themselves remain lazy, keeping the "No Hidden Computes" protocol intact.
        if hasattr(result, "compute"):
            result = result.compute()

        end_time = time.perf_counter()

        times.append(end_time - start_time)
        results.append(result)

    avg_time = float(np.mean(times))
    std_time = float(np.std(times))

    return {
        "avg_time": avg_time,
        "std_time": std_time,
        "result": results[0],
        "runs": runs,
    }
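benchmark_function can also be used on its own to time a single metric. The sketch below times a hand-rolled RMSE on NumPy data; naive_rmse is a hypothetical stand-in for any callable taking (obs, mod).

import numpy as np

from monet_stats.benchmarks import PerformanceBenchmark


def naive_rmse(obs, mod):
    """Plain NumPy RMSE, used here only as a benchmark target."""
    return float(np.sqrt(np.mean((np.asarray(mod) - np.asarray(obs)) ** 2)))


bench = PerformanceBenchmark()
obs, mod = bench.generate_test_data(50_000)
timing = bench.benchmark_function(naive_rmse, obs, mod, runs=20)
print(f"avg: {timing['avg_time'] * 1000:.3f} ms over {timing['runs']} runs")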

generate_test_data(size, data_type='numpy', chunks=None)

Generate synthetic test data for benchmarking.

Parameters

size : int
    Number of data points to generate.
data_type : str, optional
    Type of data to generate ('numpy' or 'xarray'). Default is 'numpy'.
chunks : Optional[Dict[str, int]], optional
    Dask chunk sizes if data_type is 'xarray' and lazy evaluation is desired.

Returns

Tuple[Union[np.ndarray, xr.DataArray], Union[np.ndarray, xr.DataArray]]
    A tuple containing (obs, mod) arrays.

Source code in src/monet_stats/benchmarks.py
def generate_test_data(
    self, size: int, data_type: str = "numpy", chunks: Optional[Dict[str, int]] = None
) -> Tuple[Union[np.ndarray, xr.DataArray], Union[np.ndarray, xr.DataArray]]:
    """
    Generate synthetic test data for benchmarking.

    Parameters
    ----------
    size : int
        Number of data points to generate.
    data_type : str, optional
        Type of data to generate ('numpy' or 'xarray'). Default is 'numpy'.
    chunks : Optional[Dict[str, int]], optional
        Dask chunk sizes if data_type is 'xarray' and lazy evaluation is desired.

    Returns
    -------
    Tuple[Union[np.ndarray, xr.DataArray], Union[np.ndarray, xr.DataArray]]
        A tuple containing (obs, mod) arrays.
    """
    np.random.seed(42)  # For reproducible results
    obs_raw = np.random.normal(10, 3, size)
    mod_raw = obs_raw + np.random.normal(0, 1, size)  # Add some error

    if data_type == "xarray":
        obs = xr.DataArray(obs_raw, dims=["time"], coords={"time": np.arange(size)}, name="obs")
        mod = xr.DataArray(mod_raw, dims=["time"], coords={"time": np.arange(size)}, name="mod")
        if chunks:
            obs = obs.chunk(chunks)
            mod = mod.chunk(chunks)
        return obs, mod

    return obs_raw, mod_raw
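To exercise the xarray/Dask path, request DataArray output and pass chunk sizes (chunking requires dask to be installed). A brief sketch:

from monet_stats.benchmarks import PerformanceBenchmark

bench = PerformanceBenchmark()

# Lazy, Dask-backed DataArrays with 10,000-element chunks along "time".
obs, mod = bench.generate_test_data(100_000, data_type="xarray", chunks={"time": 10_000})
print(type(obs), obs.chunks)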

print_benchmark_report()

Print a formatted performance report to the console.

Source code in src/monet_stats/benchmarks.py
def print_benchmark_report(self) -> None:
    """Print a formatted performance report to the console."""
    print("\n" + "=" * 80)
    print("PERFORMANCE BENCHMARK REPORT")
    print("=" * 80)

    for size, size_results in self.results.items():
        print(f"\nData Size: {size:,} elements")
        print("-" * 40)

        # Sort by average time
        sorted_results = sorted(
            size_results.items(),
            key=lambda x: x[1].get("avg_time", float("inf")) if isinstance(x[1], dict) else float("inf"),
        )

        for name, result in sorted_results:
            if isinstance(result, dict) and "error" not in result:
                avg_time = result["avg_time"]
                std_time = result["std_time"]
                print(f"{name:<20}: {avg_time * 1000:>8.4f}±{std_time * 1000:.4f} ms")
            elif isinstance(result, dict):
                print(f"{name:<20}: ERROR - {result.get('error')}")
            else:
                print(f"{name:<20}: ERROR - Unknown result format")

run_all_benchmarks(sizes=None)

Run benchmarks for a standard set of functions across multiple sizes.

Parameters

sizes : Optional[List[int]], optional
    List of data sizes to test. Defaults to [100, 1000, 10000].

Returns

Dict[int, Dict[str, Any]]
    Comprehensive benchmark results indexed by size.

Source code in src/monet_stats/benchmarks.py
def run_all_benchmarks(self, sizes: Optional[List[int]] = None) -> Dict[int, Dict[str, Any]]:
    """
    Run benchmarks for a standard set of functions across multiple sizes.

    Parameters
    ----------
    sizes : Optional[List[int]], optional
        List of data sizes to test. Defaults to [100, 1000, 10000].

    Returns
    -------
    Dict[int, Dict[str, Any]]
        Comprehensive benchmark results indexed by size.
    """
    if sizes is None:
        sizes = [100, 1000, 10000]

    functions = {
        "MAE": MAE,
        "RMSE": RMSE,
        "MB": MB,
        "R2": R2,
        "NSE": NSE,
        "MAPE": MAPE,
        "MASE": MASE,
        "MedAE": MedAE,
        "sMAPE": sMAPE,
        "NMB": NMB,
        "FB": FB,
        "FE": FE,
        "stats_pearsonr": stats_pearsonr,
        "rmse_util": rmse,
        "mae_util": mae,
        "corr_util": correlation,
    }

    results: Dict[int, Dict[str, Any]] = {}

    for size in sizes:
        print(f"Benchmarking with data size: {size:,}")
        obs, mod = self.generate_test_data(size)

        size_results: Dict[str, Any] = {}
        for name, func in functions.items():
            try:
                bench_result = self.benchmark_function(func, obs, mod)
                size_results[name] = bench_result
            except Exception as e:
                print(f"Error benchmarking {name}: {e!s}")
                size_results[name] = {"error": str(e)}

        results[size] = size_results

    self.results = results
    return results
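The returned (and stored) results are plain nested dictionaries, so individual timings can be pulled out directly; the metric keys are the names registered above. For example:

from monet_stats.benchmarks import PerformanceBenchmark

bench = PerformanceBenchmark()
results = bench.run_all_benchmarks(sizes=[10_000])

rmse_timing = results[10_000]["RMSE"]
if "error" not in rmse_timing:
    print(f"RMSE: {rmse_timing['avg_time'] * 1e6:.1f} µs ± {rmse_timing['std_time'] * 1e6:.1f} µs")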

run_comprehensive_benchmarks()

Execute both performance and accuracy suites.

Source code in src/monet_stats/benchmarks.py
def run_comprehensive_benchmarks() -> None:
    """
    Execute both performance and accuracy suites.
    """
    print("Running comprehensive benchmarks and accuracy verification...")

    # Performance benchmark
    perf_bench = PerformanceBenchmark()
    perf_bench.run_all_benchmarks(sizes=[100, 1000, 10000])
    perf_bench.print_benchmark_report()

    # Accuracy verification
    acc_verify = AccuracyVerification()
    acc_verify.print_accuracy_report()
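A typical way to run both suites is from a small driver script; a sketch assuming the same import path as above:

from monet_stats.benchmarks import run_comprehensive_benchmarks

if __name__ == "__main__":
    run_comprehensive_benchmarks()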