Skip to content

Benchmarks

Performance benchmarking and comparison utilities for statistical metrics.

Performance benchmarks and accuracy verification for statistical functions (Aero Protocol Compliant).

This module provides tools to benchmark the execution time of various metrics and verify their mathematical accuracy against known values.

AccuracyVerification

Suite for verifying the mathematical correctness of statistical functions.

Source code in src/monet_stats/benchmarks.py
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
class AccuracyVerification:
    """
    Suite for verifying the mathematical correctness of statistical functions.

    Known-value tests compare each metric against analytically derived
    results; cross-backend tests confirm NumPy and Xarray inputs agree.
    """

    def __init__(self, tolerance: float = 1e-10) -> None:
        """
        Initialize the verification suite.

        Parameters
        ----------
        tolerance : float, optional
            Absolute numerical tolerance for floating-point comparisons.
            Default is 1e-10.
        """
        self.tolerance = tolerance

    def _check(self, computed: Any, expected: float, atol: Optional[float] = None) -> Dict[str, Any]:
        """
        Build one test-result record comparing ``computed`` to ``expected``.

        The comparison is purely absolute (``rtol=0.0``): passing only
        ``atol`` to ``np.isclose`` would leave NumPy's default ``rtol=1e-5``
        in effect, silently loosening the suite's claimed tolerance.

        Parameters
        ----------
        computed : Any
            Value produced by the metric under test.
        expected : float
            Analytically known value.
        atol : Optional[float], optional
            Override for the absolute tolerance; defaults to ``self.tolerance``.

        Returns
        -------
        Dict[str, Any]
            Record with 'computed', 'expected' and 'passed' keys.
        """
        tol = self.tolerance if atol is None else atol
        return {
            "computed": float(computed),
            "expected": float(expected),
            "passed": bool(np.isclose(computed, expected, rtol=0.0, atol=tol)),
        }

    def test_known_values(self) -> Dict[str, Dict[str, Any]]:
        """
        Run a series of tests against analytically known values.

        Returns
        -------
        Dict[str, Dict[str, Any]]
            Dictionary of test results including computed vs expected values.
        """
        results: Dict[str, Dict[str, Any]] = {}

        # Identical series: agreement metrics must be exact (1 or 0).
        obs_perfect = np.array([1, 2, 3, 4, 5])
        mod_perfect = np.array([1, 2, 3, 4, 5])

        results["R2_perfect"] = self._check(R2(obs_perfect, mod_perfect), 1.0)
        results["correlation_perfect"] = self._check(correlation(obs_perfect, mod_perfect), 1.0)
        results["RMSE_perfect"] = self._check(RMSE(obs_perfect, mod_perfect), 0.0)
        results["MAE_perfect"] = self._check(MAE(obs_perfect, mod_perfect), 0.0)
        results["NSE_perfect"] = self._check(NSE(obs_perfect, mod_perfect), 1.0)

        # Constant offset: MB = mean(mod - obs) = 2 - 1 = 1.0.
        obs_bias = np.ones(10)
        mod_bias = np.ones(10) * 2
        results["MB_bias"] = self._check(MB(obs_bias, mod_bias), 1.0)

        # Known MAPE: errors of 10%, -10%, 0% average to 20/3 percent.
        obs_mape = np.array([10, 10, 10])
        mod_mape = np.array([11, 9, 10])
        expected_mape = (10 + 10 + 0) / 3
        # Looser atol: MAPE implementations differ in small numeric details.
        results["MAPE_known"] = self._check(MAPE(obs_mape, mod_mape), expected_mape, atol=0.1)

        results["CRMSE_perfect"] = self._check(CRMSE(obs_perfect, mod_perfect), 0.0)
        results["IOA_perfect"] = self._check(IOA(obs_perfect, mod_perfect), 1.0)
        results["NMB_perfect"] = self._check(NMB(obs_perfect, mod_perfect), 0.0)
        results["NME_perfect"] = self._check(NME(obs_perfect, mod_perfect), 0.0)
        results["KGE_perfect"] = self._check(KGE(obs_perfect, mod_perfect), 1.0)

        return results

    def cross_backend_verification(self, size: int = 1000) -> Dict[str, Dict[str, Any]]:
        """
        Verify consistency of metrics across different backends.

        Parameters
        ----------
        size : int, optional
            Size of test data. Default is 1000.

        Returns
        -------
        Dict[str, Dict[str, Any]]
            Per-metric record with 'numpy', 'xarray', 'diff' and 'passed' keys.
        """
        results: Dict[str, Dict[str, Any]] = {}

        # Use the same data payload for both backends so any difference is
        # attributable to the backend, not the data.
        obs_np, mod_np = PerformanceBenchmark().generate_test_data(size, backend="numpy")
        obs_xr = xr.DataArray(obs_np, dims=["time"], coords={"time": np.arange(size)}, name="obs")
        mod_xr = xr.DataArray(mod_np, dims=["time"], coords={"time": np.arange(size)}, name="mod")

        metrics = {
            "MAE": MAE,
            "RMSE": RMSE,
            "MB": MB,
            "R2": R2,
            "NSE": NSE,
            "KGE": KGE,
            "IOA": IOA,
            "NMB": NMB,
            "NME": NME,
        }

        for name, func in metrics.items():
            res_np = float(func(obs_np, mod_np))
            res_xr = float(func(obs_xr, mod_xr))

            # rtol=0.0: enforce the strict absolute tolerance; np.isclose's
            # default rtol=1e-5 would otherwise dominate the comparison.
            passed = bool(np.isclose(res_np, res_xr, rtol=0.0, atol=self.tolerance))
            results[name] = {
                "numpy": res_np,
                "xarray": res_xr,
                "diff": abs(res_np - res_xr),
                "passed": passed,
            }

        return results

    def print_accuracy_report(self) -> None:
        """Print a formatted accuracy report to the console."""
        results = self.test_known_values()

        print("\n" + "=" * 80)
        print("ACCURACY VERIFICATION REPORT")
        print("=" * 80)

        passed = 0
        total = len(results)

        for test_name, result in results.items():
            status = "PASS" if result["passed"] else "FAIL"
            print(
                f"{test_name:<25}: {status:<4} | Computed: {result['computed']:.6f}, Expected: {result['expected']:.6f}"
            )
            if result["passed"]:
                passed += 1

        print(f"\nAccuracy Summary: {passed}/{total} tests passed")

        # Cross-backend verification section.
        print("\nCROSS-BACKEND CONSISTENCY (NumPy vs Xarray)")
        print("-" * 45)
        cross_results = self.cross_backend_verification()
        cross_passed = 0
        for name, res in cross_results.items():
            status = "PASS" if res["passed"] else "FAIL"
            print(f"{name:<10}: {status:<4} | Diff: {res['diff']:.2e}")
            if res["passed"]:
                cross_passed += 1
        print(f"Consistency Summary: {cross_passed}/{len(cross_results)} passed")

        if passed == total and cross_passed == len(cross_results):
            print("\n✓ All accuracy and consistency tests PASSED!")
        else:
            print("\n✗ Some tests FAILED!")

__init__(tolerance=1e-10)

Initialize the verification suite.

Parameters

tolerance : float, optional Numerical tolerance for floating-point comparisons. Default is 1e-10.

Source code in src/monet_stats/benchmarks.py
247
248
249
250
251
252
253
254
255
256
def __init__(self, tolerance: float = 1e-10) -> None:
    """
    Set up the verification suite.

    Parameters
    ----------
    tolerance : float, optional
        Absolute tolerance used in floating-point comparisons.
        Default is 1e-10.
    """
    # Stored once; every known-value and cross-backend check reads it.
    self.tolerance = tolerance

cross_backend_verification(size=1000)

Verify consistency of metrics across different backends.

Parameters

size : int, optional Size of test data. Default is 1000.

Returns

Dict[str, Dict[str, Any]] Results of cross-backend verification.

Source code in src/monet_stats/benchmarks.py
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
def cross_backend_verification(self, size: int = 1000) -> Dict[str, Dict[str, Any]]:
    """
    Verify consistency of metrics across different backends.

    Parameters
    ----------
    size : int, optional
        Size of test data. Default is 1000.

    Returns
    -------
    Dict[str, Dict[str, Any]]
        Per-metric record with 'numpy', 'xarray', 'diff' and 'passed' keys.
    """
    results: Dict[str, Dict[str, Any]] = {}

    # Use the same data payload for both backends so any difference is
    # attributable to the backend, not the data.
    obs_np, mod_np = PerformanceBenchmark().generate_test_data(size, backend="numpy")
    obs_xr = xr.DataArray(obs_np, dims=["time"], coords={"time": np.arange(size)}, name="obs")
    mod_xr = xr.DataArray(mod_np, dims=["time"], coords={"time": np.arange(size)}, name="mod")

    metrics = {
        "MAE": MAE,
        "RMSE": RMSE,
        "MB": MB,
        "R2": R2,
        "NSE": NSE,
        "KGE": KGE,
        "IOA": IOA,
        "NMB": NMB,
        "NME": NME,
    }

    for name, func in metrics.items():
        res_np = float(func(obs_np, mod_np))
        res_xr = float(func(obs_xr, mod_xr))

        # rtol=0.0: enforce the strict absolute tolerance. Passing only
        # atol leaves np.isclose's default rtol=1e-5 in effect, which would
        # dominate self.tolerance (1e-10) and loosen the check by ~1e5x.
        passed = bool(np.isclose(res_np, res_xr, rtol=0.0, atol=self.tolerance))
        results[name] = {
            "numpy": res_np,
            "xarray": res_xr,
            "diff": abs(res_np - res_xr),
            "passed": passed,
        }

    return results

print_accuracy_report()

Print a formatted accuracy report to the console.

Source code in src/monet_stats/benchmarks.py
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
def print_accuracy_report(self) -> None:
    """Print a formatted accuracy report to the console."""
    known = self.test_known_values()

    banner = "=" * 80
    print("\n" + banner)
    print("ACCURACY VERIFICATION REPORT")
    print(banner)

    n_passed = 0
    for test_name, outcome in known.items():
        label = "PASS" if outcome["passed"] else "FAIL"
        print(
            f"{test_name:<25}: {label:<4} | Computed: {outcome['computed']:.6f}, Expected: {outcome['expected']:.6f}"
        )
        if outcome["passed"]:
            n_passed += 1

    print(f"\nAccuracy Summary: {n_passed}/{len(known)} tests passed")

    # Second section: NumPy vs Xarray agreement per metric.
    print("\nCROSS-BACKEND CONSISTENCY (NumPy vs Xarray)")
    print("-" * 45)
    cross = self.cross_backend_verification()
    n_cross = 0
    for metric, outcome in cross.items():
        label = "PASS" if outcome["passed"] else "FAIL"
        print(f"{metric:<10}: {label:<4} | Diff: {outcome['diff']:.2e}")
        if outcome["passed"]:
            n_cross += 1
    print(f"Consistency Summary: {n_cross}/{len(cross)} passed")

    all_ok = n_passed == len(known) and n_cross == len(cross)
    if all_ok:
        print("\n✓ All accuracy and consistency tests PASSED!")
    else:
        print("\n✗ Some tests FAILED!")

test_known_values()

Run a series of tests against analytically known values.

Returns

Dict[str, Dict[str, Any]] Dictionary of test results including computed vs expected values.

Source code in src/monet_stats/benchmarks.py
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
def test_known_values(self) -> Dict[str, Dict[str, Any]]:
    """
    Run a series of tests against analytically known values.

    Returns
    -------
    Dict[str, Dict[str, Any]]
        Dictionary of test results including computed vs expected values.
    """

    def check(computed: Any, expected: float, atol: Optional[float] = None) -> Dict[str, Any]:
        # Purely absolute comparison (rtol=0.0): passing only atol to
        # np.isclose would leave the default rtol=1e-5 in effect, silently
        # loosening the suite's claimed 1e-10 tolerance by ~1e5x.
        tol = self.tolerance if atol is None else atol
        return {
            "computed": float(computed),
            "expected": float(expected),
            "passed": bool(np.isclose(computed, expected, rtol=0.0, atol=tol)),
        }

    results: Dict[str, Dict[str, Any]] = {}

    # Identical series: agreement metrics must be exact (1 or 0).
    obs_perfect = np.array([1, 2, 3, 4, 5])
    mod_perfect = np.array([1, 2, 3, 4, 5])

    results["R2_perfect"] = check(R2(obs_perfect, mod_perfect), 1.0)
    results["correlation_perfect"] = check(correlation(obs_perfect, mod_perfect), 1.0)
    results["RMSE_perfect"] = check(RMSE(obs_perfect, mod_perfect), 0.0)
    results["MAE_perfect"] = check(MAE(obs_perfect, mod_perfect), 0.0)
    results["NSE_perfect"] = check(NSE(obs_perfect, mod_perfect), 1.0)

    # Constant offset: MB = mean(mod - obs) = 2 - 1 = 1.0.
    obs_bias = np.ones(10)
    mod_bias = np.ones(10) * 2
    results["MB_bias"] = check(MB(obs_bias, mod_bias), 1.0)

    # Known MAPE: errors of 10%, -10%, 0% average to 20/3 percent.
    obs_mape = np.array([10, 10, 10])
    mod_mape = np.array([11, 9, 10])
    expected_mape = (10 + 10 + 0) / 3
    # Looser atol: MAPE implementations differ in small numeric details.
    results["MAPE_known"] = check(MAPE(obs_mape, mod_mape), expected_mape, atol=0.1)

    results["CRMSE_perfect"] = check(CRMSE(obs_perfect, mod_perfect), 0.0)
    results["IOA_perfect"] = check(IOA(obs_perfect, mod_perfect), 1.0)
    results["NMB_perfect"] = check(NMB(obs_perfect, mod_perfect), 0.0)
    results["NME_perfect"] = check(NME(obs_perfect, mod_perfect), 0.0)
    results["KGE_perfect"] = check(KGE(obs_perfect, mod_perfect), 1.0)

    return results

PerformanceBenchmark

Performance benchmarking suite for statistical functions.

This class enables timing analysis of metrics across different backends (NumPy, Xarray, Dask) and data sizes.

Source code in src/monet_stats/benchmarks.py
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
class PerformanceBenchmark:
    """
    Performance benchmarking suite for statistical functions.

    This class enables timing analysis of metrics across different backends
    (NumPy, Xarray, Dask) and data sizes.
    """

    def __init__(self) -> None:
        """Initialize the benchmark suite with an empty results dictionary."""
        # Populated by run_all_benchmarks(): backend -> size -> metric stats.
        self.results: Dict[str, Dict[int, Dict[str, Any]]] = {}

    def generate_test_data(
        self, size: int, backend: str = "numpy", chunks: Optional[Dict[str, int]] = None
    ) -> Tuple[Union[np.ndarray, xr.DataArray], Union[np.ndarray, xr.DataArray]]:
        """
        Generate synthetic test data for benchmarking.

        Parameters
        ----------
        size : int
            Number of data points to generate.
        backend : str, optional
            Backend to use ('numpy', 'xarray', or 'dask'). Default is 'numpy'.
        chunks : Optional[Dict[str, int]], optional
            Dask chunk sizes if backend is 'dask'. If None, defaults to {'time': 12500000}
            (approx 100MB for float64).

        Returns
        -------
        Tuple[Union[np.ndarray, xr.DataArray], Union[np.ndarray, xr.DataArray]]
            A tuple containing (obs, mod) arrays.
        """
        np.random.seed(42)  # Fixed seed keeps runs comparable across calls
        obs_raw = np.random.normal(10, 3, size)
        mod_raw = obs_raw + np.random.normal(0, 1, size)  # Add some error

        if backend in ["xarray", "dask"]:
            obs = xr.DataArray(obs_raw, dims=["time"], coords={"time": np.arange(size)}, name="obs")
            mod = xr.DataArray(mod_raw, dims=["time"], coords={"time": np.arange(size)}, name="mod")

            _update_history(obs, "Generated benchmark data")
            _update_history(mod, "Generated benchmark data")

            if backend == "dask":
                if chunks is None:
                    # ~100MB chunk size for float64 (8 bytes per element)
                    chunks = {"time": 12500000}
                obs = obs.chunk(chunks)
                mod = mod.chunk(chunks)
            return obs, mod

        return obs_raw, mod_raw

    def benchmark_function(
        self,
        func: Callable,
        obs: Union[np.ndarray, xr.DataArray],
        mod: Union[np.ndarray, xr.DataArray],
        runs: int = 100,
    ) -> Dict[str, Any]:
        """
        Benchmark a single statistical function.

        .. note::
           For Dask-backed arrays, this function explicitly calls `.compute()`
           to measure the full execution time of the calculation.

        Parameters
        ----------
        func : Callable
            The function to benchmark.
        obs : Union[np.ndarray, xr.DataArray]
            Observed values.
        mod : Union[np.ndarray, xr.DataArray]
            Model values.
        runs : int, optional
            Number of iterations for averaging. Default is 100.

        Returns
        -------
        Dict[str, Any]
            Dictionary containing 'avg_time', 'std_time', 'result', and 'runs'.
        """
        times: List[float] = []
        first_result: Any = None

        # Warm-up run (especially for Dask/JIT if applicable)
        _ = func(obs, mod)

        for i in range(runs):
            start_time = time.perf_counter()
            result = func(obs, mod)
            # Trigger lazy (Dask) evaluation so the wall time covers the
            # actual computation, not just graph construction.
            if hasattr(result, "compute"):
                result = result.compute()

            end_time = time.perf_counter()

            times.append(end_time - start_time)
            # Keep only one representative result; retaining all `runs`
            # results (as a list) wastes memory for large outputs since
            # only the first was ever returned.
            if i == 0:
                first_result = result

        avg_time = float(np.mean(times))
        std_time = float(np.std(times))

        return {
            "avg_time": avg_time,
            "std_time": std_time,
            "result": first_result,
            "runs": runs,
        }

    def run_all_benchmarks(
        self,
        sizes: Optional[List[int]] = None,
        backends: Optional[List[str]] = None,
    ) -> Dict[str, Dict[int, Dict[str, Any]]]:
        """
        Run benchmarks for a standard set of functions across multiple sizes and backends.

        Parameters
        ----------
        sizes : Optional[List[int]], optional
            List of data sizes to test. Defaults to [100, 1000, 10000].
        backends : Optional[List[str]], optional
            List of backends to test. Defaults to ['numpy', 'xarray', 'dask'].

        Returns
        -------
        Dict[str, Dict[int, Dict[str, Any]]]
            Comprehensive benchmark results indexed by backend, then size.
        """
        if sizes is None:
            sizes = [100, 1000, 10000]
        if backends is None:
            backends = ["numpy", "xarray", "dask"]

        functions = {
            "MAE": MAE,
            "RMSE": RMSE,
            "CRMSE": CRMSE,
            "MB": MB,
            "R2": R2,
            "NSE": NSE,
            "KGE": KGE,
            "IOA": IOA,
            "MAPE": MAPE,
            "MASE": MASE,
            "MedAE": MedAE,
            "sMAPE": sMAPE,
            "NMB": NMB,
            "NME": NME,
            "FB": FB,
            "FE": FE,
            "stats_pearsonr": stats_pearsonr,
            "rmse_util": rmse,
            "mae_util": mae,
            "corr_util": correlation,
        }

        results: Dict[str, Dict[int, Dict[str, Any]]] = {}

        for backend in backends:
            print(f"Benchmarking backend: {backend}")
            backend_results: Dict[int, Dict[str, Any]] = {}
            for size in sizes:
                print(f"  Data size: {size:,}")
                obs, mod = self.generate_test_data(size, backend=backend)

                size_results: Dict[str, Any] = {}
                for name, func in functions.items():
                    # A failing metric is recorded, not fatal: the rest of
                    # the suite still runs.
                    try:
                        bench_result = self.benchmark_function(func, obs, mod)
                        size_results[name] = bench_result
                    except Exception as e:
                        print(f"    Error benchmarking {name}: {e!s}")
                        size_results[name] = {"error": str(e)}

                backend_results[size] = size_results
            results[backend] = backend_results

        self.results = results
        return results

    def print_benchmark_report(self) -> None:
        """Print a formatted performance report to the console."""
        print("\n" + "=" * 80)
        print("PERFORMANCE BENCHMARK REPORT")
        print("=" * 80)

        if not self.results:
            print("No benchmark results found. Run run_all_benchmarks() first.")
            return

        for backend, backend_results in self.results.items():
            print(f"\nBackend: {backend.upper()}")
            print("=" * 20)
            for size, size_results in backend_results.items():
                print(f"\nData Size: {size:,} elements")
                print("-" * 40)

                # Sort fastest-first; error/malformed entries sort last.
                sorted_results = sorted(
                    size_results.items(),
                    key=lambda x: x[1].get("avg_time", float("inf")) if isinstance(x[1], dict) else float("inf"),
                )

                for name, result in sorted_results:
                    if isinstance(result, dict) and "error" not in result:
                        avg_time = result["avg_time"]
                        std_time = result["std_time"]
                        print(f"{name:<20}: {avg_time * 1000:>8.4f}±{std_time * 1000:.4f} ms")
                    elif isinstance(result, dict):
                        print(f"{name:<20}: ERROR - {result.get('error')}")
                    else:
                        print(f"{name:<20}: ERROR - Unknown result format")

__init__()

Initialize the benchmark suite with an empty results dictionary.

Source code in src/monet_stats/benchmarks.py
30
31
32
def __init__(self) -> None:
    """Create the suite with no stored benchmark results yet."""
    # Populated by run_all_benchmarks(): backend -> size -> metric stats.
    self.results: Dict[str, Dict[int, Dict[str, Any]]] = {}

benchmark_function(func, obs, mod, runs=100)

Benchmark a single statistical function.

.. note:: For Dask-backed arrays, this function explicitly calls .compute() to measure the full execution time of the calculation.

Parameters

func : Callable The function to benchmark. obs : Union[np.ndarray, xr.DataArray] Observed values. mod : Union[np.ndarray, xr.DataArray] Model values. runs : int, optional Number of iterations for averaging. Default is 100.

Returns

Dict[str, Any] Dictionary containing 'avg_time', 'std_time', 'result', and 'runs'.

Source code in src/monet_stats/benchmarks.py
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def benchmark_function(
    self,
    func: Callable,
    obs: Union[np.ndarray, xr.DataArray],
    mod: Union[np.ndarray, xr.DataArray],
    runs: int = 100,
) -> Dict[str, Any]:
    """
    Benchmark a single statistical function.

    .. note::
       For Dask-backed arrays, this function explicitly calls `.compute()`
       to measure the full execution time of the calculation.

    Parameters
    ----------
    func : Callable
        The function to benchmark.
    obs : Union[np.ndarray, xr.DataArray]
        Observed values.
    mod : Union[np.ndarray, xr.DataArray]
        Model values.
    runs : int, optional
        Number of iterations for averaging. Default is 100.

    Returns
    -------
    Dict[str, Any]
        Dictionary containing 'avg_time', 'std_time', 'result', and 'runs'.
    """
    times: List[float] = []
    first_result: Any = None

    # Warm-up run (especially for Dask/JIT if applicable)
    _ = func(obs, mod)

    for i in range(runs):
        start_time = time.perf_counter()
        result = func(obs, mod)
        # Trigger lazy (Dask) evaluation so the wall time covers the
        # actual computation, not just graph construction.
        if hasattr(result, "compute"):
            result = result.compute()

        end_time = time.perf_counter()

        times.append(end_time - start_time)
        # Keep only one representative result; the old code stored all
        # `runs` results in a list but only ever returned the first,
        # wasting memory for large outputs.
        if i == 0:
            first_result = result

    avg_time = float(np.mean(times))
    std_time = float(np.std(times))

    return {
        "avg_time": avg_time,
        "std_time": std_time,
        "result": first_result,
        "runs": runs,
    }

generate_test_data(size, backend='numpy', chunks=None)

Generate synthetic test data for benchmarking.

Parameters

size : int Number of data points to generate. backend : str, optional Backend to use ('numpy', 'xarray', or 'dask'). Default is 'numpy'. chunks : Optional[Dict[str, int]], optional Dask chunk sizes if backend is 'dask'. If None, defaults to {'time': 12500000} (approx 100MB for float64).

Returns

Tuple[Union[np.ndarray, xr.DataArray], Union[np.ndarray, xr.DataArray]] A tuple containing (obs, mod) arrays.

Source code in src/monet_stats/benchmarks.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def generate_test_data(
    self, size: int, backend: str = "numpy", chunks: Optional[Dict[str, int]] = None
) -> Tuple[Union[np.ndarray, xr.DataArray], Union[np.ndarray, xr.DataArray]]:
    """
    Generate synthetic test data for benchmarking.

    Parameters
    ----------
    size : int
        Number of data points to generate.
    backend : str, optional
        Backend to use ('numpy', 'xarray', or 'dask'). Default is 'numpy'.
    chunks : Optional[Dict[str, int]], optional
        Dask chunk sizes if backend is 'dask'. If None, defaults to {'time': 12500000}
        (approx 100MB for float64).

    Returns
    -------
    Tuple[Union[np.ndarray, xr.DataArray], Union[np.ndarray, xr.DataArray]]
        A tuple containing (obs, mod) arrays.
    """
    # Fixed seed keeps every benchmark run comparable.
    np.random.seed(42)
    observed = np.random.normal(10, 3, size)
    modeled = observed + np.random.normal(0, 1, size)  # observations plus synthetic error

    # Plain NumPy needs no wrapping — return early.
    if backend not in ["xarray", "dask"]:
        return observed, modeled

    time_axis = np.arange(size)
    obs = xr.DataArray(observed, dims=["time"], coords={"time": time_axis}, name="obs")
    mod = xr.DataArray(modeled, dims=["time"], coords={"time": time_axis}, name="mod")

    _update_history(obs, "Generated benchmark data")
    _update_history(mod, "Generated benchmark data")

    if backend == "dask":
        if chunks is None:
            # ~100MB per chunk for float64 (8 bytes per element).
            chunks = {"time": 12500000}
        obs = obs.chunk(chunks)
        mod = mod.chunk(chunks)

    return obs, mod

print_benchmark_report()

Print a formatted performance report to the console.

Source code in src/monet_stats/benchmarks.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
def print_benchmark_report(self) -> None:
    """Render the collected benchmark results as a console report."""
    banner = "=" * 80
    print("\n" + banner)
    print("PERFORMANCE BENCHMARK REPORT")
    print(banner)

    if not self.results:
        print("No benchmark results found. Run run_all_benchmarks() first.")
        return

    def _avg_time_key(item):
        # Dicts sort by their average time; anything else sorts last.
        payload = item[1]
        if isinstance(payload, dict):
            return payload.get("avg_time", float("inf"))
        return float("inf")

    for backend, per_size in self.results.items():
        print(f"\nBackend: {backend.upper()}")
        print("=" * 20)
        for size, metrics in per_size.items():
            print(f"\nData Size: {size:,} elements")
            print("-" * 40)

            # Fastest metrics first.
            for name, result in sorted(metrics.items(), key=_avg_time_key):
                if not isinstance(result, dict):
                    print(f"{name:<20}: ERROR - Unknown result format")
                elif "error" in result:
                    print(f"{name:<20}: ERROR - {result.get('error')}")
                else:
                    avg_time = result["avg_time"]
                    std_time = result["std_time"]
                    print(f"{name:<20}: {avg_time * 1000:>8.4f}±{std_time * 1000:.4f} ms")

run_all_benchmarks(sizes=None, backends=None)

Run benchmarks for a standard set of functions across multiple sizes and backends.

Parameters

- `sizes` (Optional[List[int]], optional): List of data sizes to test. Defaults to `[100, 1000, 10000]`.
- `backends` (Optional[List[str]], optional): List of backends to test. Defaults to `['numpy', 'xarray', 'dask']`.

Returns

`Dict[str, Dict[int, Dict[str, Any]]]`: Comprehensive benchmark results indexed by backend, then size.

Source code in src/monet_stats/benchmarks.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
def run_all_benchmarks(
    self,
    sizes: Optional[List[int]] = None,
    backends: Optional[List[str]] = None,
) -> Dict[str, Dict[int, Dict[str, Any]]]:
    """
    Benchmark the standard metric set over every (backend, size) combination.

    Parameters
    ----------
    sizes : Optional[List[int]], optional
        Data sizes to benchmark. Defaults to [100, 1000, 10000].
    backends : Optional[List[str]], optional
        Backends to benchmark. Defaults to ['numpy', 'xarray', 'dask'].

    Returns
    -------
    Dict[str, Dict[int, Dict[str, Any]]]
        Results keyed first by backend name, then by data size.
    """
    sizes = [100, 1000, 10000] if sizes is None else sizes
    backends = ["numpy", "xarray", "dask"] if backends is None else backends

    # Benchmark targets: display name -> callable(obs, mod).
    functions = {
        "MAE": MAE,
        "RMSE": RMSE,
        "CRMSE": CRMSE,
        "MB": MB,
        "R2": R2,
        "NSE": NSE,
        "KGE": KGE,
        "IOA": IOA,
        "MAPE": MAPE,
        "MASE": MASE,
        "MedAE": MedAE,
        "sMAPE": sMAPE,
        "NMB": NMB,
        "NME": NME,
        "FB": FB,
        "FE": FE,
        "stats_pearsonr": stats_pearsonr,
        "rmse_util": rmse,
        "mae_util": mae,
        "corr_util": correlation,
    }

    results: Dict[str, Dict[int, Dict[str, Any]]] = {}

    for backend in backends:
        print(f"Benchmarking backend: {backend}")
        results[backend] = {}
        for size in sizes:
            print(f"  Data size: {size:,}")
            obs, mod = self.generate_test_data(size, backend=backend)

            per_metric: Dict[str, Any] = {}
            for name, func in functions.items():
                # A failing metric is recorded, not fatal, so the sweep continues.
                try:
                    per_metric[name] = self.benchmark_function(func, obs, mod)
                except Exception as e:
                    print(f"    Error benchmarking {name}: {e!s}")
                    per_metric[name] = {"error": str(e)}

            results[backend][size] = per_metric

    self.results = results
    return results

run_comprehensive_benchmarks()

Execute both performance and accuracy suites.

Source code in src/monet_stats/benchmarks.py
464
465
466
467
468
469
470
471
472
473
474
475
476
477
def run_comprehensive_benchmarks() -> None:
    """
    Execute both performance and accuracy suites.
    """
    print("Running comprehensive benchmarks and accuracy verification...")

    # Timing suite over the standard size ladder.
    timing_suite = PerformanceBenchmark()
    timing_suite.run_all_benchmarks(sizes=[100, 1000, 10000])
    timing_suite.print_benchmark_report()

    # Accuracy suite against analytically known values.
    accuracy_suite = AccuracyVerification()
    accuracy_suite.print_accuracy_report()