Skip to content

Data Processing

Utilities for data input, output, and preprocessing.

Data processing utilities for statistical computations (Aero Protocol Compliant).

align_arrays(obs, mod)

Align two arrays for comparison.

Parameters

obs : numpy.ndarray or xarray.DataArray Observed values. mod : numpy.ndarray or xarray.DataArray Model/predicted values.

Returns

tuple of (numpy.ndarray or xarray.DataArray) Aligned (obs, mod) arrays.

Examples

>>> import xarray as xr
>>> obs = xr.DataArray([1, 2], coords={'x': [0, 1]}, dims='x')
>>> mod = xr.DataArray([2, 3], coords={'x': [1, 2]}, dims='x')
>>> obs_a, mod_a = align_arrays(obs, mod)
>>> obs_a.x.values
array([1])

Source code in src/monet_stats/data_processing.py
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def align_arrays(
    obs: Union[np.ndarray, xr.DataArray], mod: Union[np.ndarray, xr.DataArray]
) -> Tuple[Union[np.ndarray, xr.DataArray], Union[np.ndarray, xr.DataArray]]:
    """
    Align two arrays for comparison.

    When both inputs are ``xarray.DataArray`` objects they are aligned with
    ``xr.align(..., join="inner")``. In every other case (numpy arrays,
    array-likes, or a mix of types) both inputs are converted with
    ``np.asanyarray`` and, if their shapes differ, broadcast against each other.

    Parameters
    ----------
    obs : numpy.ndarray or xarray.DataArray
        Observed values.
    mod : numpy.ndarray or xarray.DataArray
        Model/predicted values.

    Returns
    -------
    tuple of (numpy.ndarray or xarray.DataArray)
        Aligned (obs, mod) arrays.

    Raises
    ------
    ValueError
        If the inputs are not both DataArrays and their shapes cannot be
        broadcast together. The underlying numpy error is attached as
        ``__cause__``.

    Examples
    --------
    >>> import xarray as xr
    >>> obs = xr.DataArray([1, 2], coords={'x': [0, 1]}, dims='x')
    >>> mod = xr.DataArray([2, 3], coords={'x': [1, 2]}, dims='x')
    >>> obs_a, mod_a = align_arrays(obs, mod)
    >>> obs_a.x.values
    array([1])
    """
    if isinstance(obs, xr.DataArray) and isinstance(mod, xr.DataArray):
        return xr.align(obs, mod, join="inner")

    # Fallback for numpy or mixed types
    obs_arr = np.asanyarray(obs)
    mod_arr = np.asanyarray(mod)

    # Identical shapes need no broadcasting; return the converted arrays as-is.
    if obs_arr.shape == mod_arr.shape:
        return obs_arr, mod_arr

    try:
        obs_arr, mod_arr = np.broadcast_arrays(obs_arr, mod_arr)
    except ValueError as err:
        # Chain the numpy error so the root cause stays visible in tracebacks.
        raise ValueError(
            f"Arrays must have compatible shapes, got {obs_arr.shape} and {mod_arr.shape}"
        ) from err

    return obs_arr, mod_arr

compute_anomalies(obs, mod, climatology=None)

Compute anomalies relative to climatology (Lazy-friendly).

Parameters

obs : numpy.ndarray or xarray.DataArray Observed values. mod : numpy.ndarray or xarray.DataArray Model/predicted values. climatology : numpy.ndarray or xarray.DataArray, optional Climatology to subtract. If None, the mean of each array is used.

Returns

tuple of (numpy.ndarray or xarray.DataArray) (obs_anom, mod_anom)

Examples

>>> import numpy as np
>>> obs = np.array([1, 2, 3, 4, 5])
>>> mod = np.array([1, 2, 3, 4, 5])
>>> obs_anom, _ = compute_anomalies(obs, mod)
>>> np.isclose(np.mean(obs_anom), 0)
True

Source code in src/monet_stats/data_processing.py
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
def compute_anomalies(
    obs: Union[np.ndarray, xr.DataArray],
    mod: Union[np.ndarray, xr.DataArray],
    climatology: Optional[Union[np.ndarray, xr.DataArray]] = None,
) -> Tuple[Union[np.ndarray, xr.DataArray], Union[np.ndarray, xr.DataArray]]:
    """
    Subtract a reference state from observations and model output.

    Both inputs are first passed through ``align_arrays``. The reference is
    either the supplied ``climatology`` (subtracted from both arrays exactly
    as given) or, when no climatology is supplied, each array's own overall
    mean.

    Parameters
    ----------
    obs : numpy.ndarray or xarray.DataArray
        Observed values.
    mod : numpy.ndarray or xarray.DataArray
        Model/predicted values.
    climatology : numpy.ndarray or xarray.DataArray, optional
        Reference to subtract from both arrays. If None, ``obs.mean()`` and
        ``mod.mean()`` are used for the respective array.

    Returns
    -------
    tuple of (numpy.ndarray or xarray.DataArray)
        (obs_anom, mod_anom), each passed through ``_update_history`` with
        the label "Anomaly computation".

    Examples
    --------
    >>> import numpy as np
    >>> obs = np.array([1, 2, 3, 4, 5])
    >>> mod = np.array([1, 2, 3, 4, 5])
    >>> obs_anom, _ = compute_anomalies(obs, mod)
    >>> np.isclose(np.mean(obs_anom), 0)
    True
    """
    obs, mod = align_arrays(obs, mod)

    # Pick the reference for each array first, then subtract once below.
    if climatology is None:
        obs_reference = obs.mean()
        mod_reference = mod.mean()
    else:
        obs_reference = mod_reference = climatology

    obs_anom = obs - obs_reference
    mod_anom = mod - mod_reference

    label = "Anomaly computation"
    return _update_history(obs_anom, label), _update_history(mod_anom, label)

detrend_data(obs, mod, method='linear', dim=None, axis=-1)

Remove trend from data (Lazy-friendly).

Parameters

obs : numpy.ndarray or xarray.DataArray
    Observed values.
mod : numpy.ndarray or xarray.DataArray
    Model/predicted values.
method : str, optional
    Detrending method ('linear', 'constant').
    - 'linear': least-squares linear detrend.
    - 'constant': subtract mean.
dim : str, optional
    Dimension along which to detrend (xarray only).
axis : int, optional
    Axis along which to detrend (numpy only, or if dim is None). Default is -1.

Returns

tuple of (numpy.ndarray or xarray.DataArray) Detrended (obs, mod) arrays.

Examples

>>> import numpy as np
>>> obs = np.array([1, 2, 3])
>>> mod = np.array([1, 2, 3])
>>> obs_d, mod_d = detrend_data(obs, mod, method='linear')
>>> np.allclose(obs_d, 0)
True

Source code in src/monet_stats/data_processing.py
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
def _linear_detrend_dataarray(da: xr.DataArray, dim: str) -> xr.DataArray:
    """Linearly detrend a single DataArray along ``dim`` using ``xr.apply_ufunc``."""
    from scipy.signal import detrend

    # Core dimensions for apply_ufunc must be a single chunk if using dask
    if hasattr(da.data, "chunks"):
        da = da.chunk({dim: -1})

    # scipy.signal.detrend keeps float32/float64/complex64/complex128 and
    # computes everything else (e.g. integers) in float64, so declaring the
    # input dtype would mislabel lazy results for non-float inputs.
    # NOTE(review): mirrors SciPy's documented/implemented promotion - confirm
    # against the SciPy version in use.
    out_dtype = da.dtype if da.dtype.char in "dfDF" else np.dtype(np.float64)

    # Use xr.apply_ufunc for laziness and dask support. apply_ufunc moves the
    # core dimension to the last axis, hence axis=-1 for the wrapped function.
    detrended = xr.apply_ufunc(
        detrend,
        da,
        input_core_dims=[[dim]],
        output_core_dims=[[dim]],
        kwargs={"axis": -1},
        dask="parallelized",
        output_dtypes=[out_dtype],
    )

    # apply_ufunc returns the core dimension last; restore the caller's
    # dimension order so the result lines up positionally with the input.
    return detrended.transpose(*da.dims)


def detrend_data(
    obs: Union[np.ndarray, xr.DataArray],
    mod: Union[np.ndarray, xr.DataArray],
    method: str = "linear",
    dim: Optional[str] = None,
    axis: int = -1,
) -> Tuple[Union[np.ndarray, xr.DataArray], Union[np.ndarray, xr.DataArray]]:
    """
    Remove trend from data (Lazy-friendly).

    Inputs are first passed through ``align_arrays``. Linear detrending uses
    ``scipy.signal.detrend``; for DataArrays it is applied through
    ``xr.apply_ufunc`` with ``dask="parallelized"``.

    Parameters
    ----------
    obs : numpy.ndarray or xarray.DataArray
        Observed values.
    mod : numpy.ndarray or xarray.DataArray
        Model/predicted values.
    method : str, optional
        Detrending method ('linear', 'constant').
        - 'linear': least-squares linear detrend along ``dim``/``axis``.
        - 'constant': subtract the overall mean of each array (``dim`` and
          ``axis`` are not used by this method).
    dim : str, optional
        Dimension along which to detrend (xarray only). If None, the
        dimension at position ``axis`` of ``obs`` is used.
    axis : int, optional
        Axis along which to detrend (numpy only, or if dim is None). Default is -1.

    Returns
    -------
    tuple of (numpy.ndarray or xarray.DataArray)
        Detrended (obs, mod) arrays, each passed through ``_update_history``
        with the label "Detrending (<method>)". DataArray results keep the
        dimension order of the aligned inputs.

    Raises
    ------
    ValueError
        If ``method`` is neither 'linear' nor 'constant'.

    Examples
    --------
    >>> import numpy as np
    >>> obs = np.array([1, 2, 3])
    >>> mod = np.array([1, 2, 3])
    >>> obs_d, mod_d = detrend_data(obs, mod, method='linear')
    >>> np.allclose(obs_d, 0)
    True
    """
    obs, mod = align_arrays(obs, mod)

    if method == "linear":
        if isinstance(obs, xr.DataArray):
            if dim is None:
                dim = obs.dims[axis]

            obs_detrended = _linear_detrend_dataarray(obs, dim)
            mod_detrended = _linear_detrend_dataarray(mod, dim)
        else:
            # Imported lazily so SciPy is only required for linear detrending.
            from scipy.signal import detrend

            obs_detrended = detrend(obs, axis=axis)
            mod_detrended = detrend(mod, axis=axis)

    elif method == "constant":
        obs_detrended = obs - obs.mean()
        mod_detrended = mod - mod.mean()
    else:
        raise ValueError(f"Unknown detrending method: {method}")

    return _update_history(obs_detrended, f"Detrending ({method})"), _update_history(
        mod_detrended, f"Detrending ({method})"
    )

handle_missing_values(obs, mod, strategy='pairwise')

Handle missing values in arrays (Aero Protocol: Lazy-friendly).

Parameters

obs : numpy.ndarray or xarray.DataArray Observed values. mod : numpy.ndarray or xarray.DataArray Model/predicted values. strategy : str, optional Strategy for handling missing values ('pairwise', 'listwise'). For xarray, ensures NaNs are matched across both arrays without dropping coordinates. For numpy, returns flattened arrays with NaNs removed.

Returns

tuple of (numpy.ndarray or xarray.DataArray) (obs, mod) with missing values handled.

Examples

>>> import numpy as np
>>> obs = np.array([1, np.nan, 3])
>>> mod = np.array([1, 2, np.nan])
>>> handle_missing_values(obs, mod)
(array([1.]), array([1.]))

Source code in src/monet_stats/data_processing.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
def handle_missing_values(
    obs: Union[np.ndarray, xr.DataArray], mod: Union[np.ndarray, xr.DataArray], strategy: str = "pairwise"
) -> Tuple[Union[np.ndarray, xr.DataArray], Union[np.ndarray, xr.DataArray]]:
    """
    Handle missing values in arrays (Aero Protocol: Lazy-friendly).

    Inputs are first passed through ``align_arrays``. A position counts as
    missing when it is null/NaN in either array.

    Parameters
    ----------
    obs : numpy.ndarray or xarray.DataArray
        Observed values.
    mod : numpy.ndarray or xarray.DataArray
        Model/predicted values.
    strategy : str, optional
        Strategy for handling missing values ('pairwise', 'listwise'). Both
        accepted values currently produce the same result.
        For xarray, positions missing in either array are masked in both
        (via ``where``), so coordinates are kept.
        For numpy, boolean-mask indexing returns flattened arrays with the
        missing positions removed.

    Returns
    -------
    tuple of (numpy.ndarray or xarray.DataArray)
        (obs, mod) with missing values handled. DataArray results are passed
        through ``_update_history`` with the label "Missing value handling".

    Raises
    ------
    ValueError
        If ``strategy`` is not 'pairwise' or 'listwise', regardless of the
        input types.

    Examples
    --------
    >>> import numpy as np
    >>> obs = np.array([1, np.nan, 3])
    >>> mod = np.array([1, 2, np.nan])
    >>> handle_missing_values(obs, mod)
    (array([1.]), array([1.]))
    """
    # Validate up front so the xarray and numpy paths reject the same inputs
    # and no alignment/masking work is done for an invalid request.
    if strategy not in ("pairwise", "listwise"):
        raise ValueError(f"Unknown strategy: {strategy}")

    obs, mod = align_arrays(obs, mod)

    if isinstance(obs, xr.DataArray) and isinstance(mod, xr.DataArray):
        missing = obs.isnull() | mod.isnull()
        res_obs = obs.where(~missing)
        res_mod = mod.where(~missing)
        return _update_history(res_obs, "Missing value handling"), _update_history(res_mod, "Missing value handling")

    missing = np.isnan(obs) | np.isnan(mod)
    return obs[~missing], mod[~missing]

normalize_data(obs, mod, method='zscore')

Normalize data using various methods (Lazy-friendly).

Parameters

obs : numpy.ndarray or xarray.DataArray
    Observed values.
mod : numpy.ndarray or xarray.DataArray
    Model/predicted values.
method : str, optional
    Normalization method ('zscore', 'minmax', 'robust').
    - 'zscore': (x - mean) / std
    - 'minmax': (x - min) / (max - min)
    - 'robust': (x - median) / MAD (Median Absolute Deviation)

Returns

tuple of (numpy.ndarray or xarray.DataArray) Normalized (obs, mod) arrays.

Examples

>>> import xarray as xr
>>> import numpy as np
>>> obs = xr.DataArray(np.random.rand(10, 10))
>>> mod = xr.DataArray(np.random.rand(10, 10))
>>> obs_norm, mod_norm = normalize_data(obs, mod, method='zscore')

Source code in src/monet_stats/data_processing.py
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
def normalize_data(
    obs: Union[np.ndarray, xr.DataArray],
    mod: Union[np.ndarray, xr.DataArray],
    method: str = "zscore",
) -> Tuple[Union[np.ndarray, xr.DataArray], Union[np.ndarray, xr.DataArray]]:
    """
    Rescale observations and model output independently (Lazy-friendly).

    Inputs are first passed through ``align_arrays``; each array is then
    normalized using statistics computed over that whole array.

    Parameters
    ----------
    obs : numpy.ndarray or xarray.DataArray
        Observed values.
    mod : numpy.ndarray or xarray.DataArray
        Model/predicted values.
    method : str, optional
        Normalization method ('zscore', 'minmax', 'robust').
        - 'zscore': (x - mean) / std
        - 'minmax': (x - min) / (max - min)
        - 'robust': (x - median) / MAD (Median Absolute Deviation)

    Returns
    -------
    tuple of (numpy.ndarray or xarray.DataArray)
        Normalized (obs, mod) arrays, each passed through ``_update_history``
        with the label "Normalization (<method>)".

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.

    Examples
    --------
    >>> import xarray as xr
    >>> import numpy as np
    >>> obs = xr.DataArray(np.random.rand(10, 10))
    >>> mod = xr.DataArray(np.random.rand(10, 10))
    >>> obs_norm, mod_norm = normalize_data(obs, mod, method='zscore')
    """

    def _zscore(arr):
        # Center on the mean, scale by the standard deviation.
        return (arr - arr.mean()) / arr.std()

    def _minmax(arr):
        # Shift by the minimum, scale by the value range.
        return (arr - arr.min()) / (arr.max() - arr.min())

    def _robust(arr):
        # Center on the median, scale by the median absolute deviation.
        if isinstance(arr, xr.DataArray):
            center = arr.median()
            spread = abs(arr - center).median()
        else:
            center = np.median(arr)
            spread = np.median(np.abs(arr - center))
        return (arr - center) / spread

    obs, mod = align_arrays(obs, mod)

    if method == "zscore":
        scale = _zscore
    elif method == "minmax":
        scale = _minmax
    elif method == "robust":
        scale = _robust
    else:
        raise ValueError(f"Unknown normalization method: {method}")

    obs_norm = scale(obs)
    mod_norm = scale(mod)

    label = f"Normalization ({method})"
    return _update_history(obs_norm, label), _update_history(mod_norm, label)

to_numpy(data)

Convert data to numpy array (Eager operation).

.. warning:: This operation triggers immediate computation if the input is a Dask-backed xarray object. Use with caution in lazy pipelines.

Parameters

data : Any Input data to convert (xarray.DataArray, xarray.Dataset, pandas.Series/DataFrame, list, etc.).

Returns

numpy.ndarray Converted numpy array.

Examples

>>> import xarray as xr
>>> da = xr.DataArray([1, 2, 3])
>>> to_numpy(da)
array([1, 2, 3])

Source code in src/monet_stats/data_processing.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def to_numpy(
    data: Any,
) -> np.ndarray:
    """
    Return ``data`` as a numpy array (Eager operation).

    .. warning::
        This operation triggers immediate computation if the input is a Dask-backed
        xarray object. Use with caution in lazy pipelines.

    Parameters
    ----------
    data : Any
        Input data to convert (xarray.DataArray, xarray.Dataset, pandas.Series/DataFrame, list, etc.).

    Returns
    -------
    numpy.ndarray
        ``data.to_array().values`` for a Dataset, ``data.values`` for a
        DataArray or pandas Series/DataFrame, and ``np.asarray(data)`` for
        anything else (including lists).

    Examples
    --------
    >>> import xarray as xr
    >>> da = xr.DataArray([1, 2, 3])
    >>> to_numpy(da)
    array([1, 2, 3])
    """
    # A Dataset has no single array; stack its variables first.
    if isinstance(data, xr.Dataset):
        return data.to_array().values

    # Labelled containers expose their underlying array via ``.values``.
    if isinstance(data, (xr.DataArray, pd.Series, pd.DataFrame)):
        return data.values

    # Lists and every other array-like go through numpy's own conversion.
    return np.asarray(data)