Skip to content

Distribution Analysis#

statista.time_series.distribution #

Distribution-aware methods mixin for TimeSeries.

Distribution #

Bases: _TimeSeriesStub

Distribution fitting, normality tests, and diagnostic plots.

Source code in src\statista\time_series\distribution.py
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
class Distribution(_TimeSeriesStub):
    """Distribution fitting, normality tests, and diagnostic plots."""

    def qq_plot(
        self,
        distribution: str = "norm",
        column: str = None,
        confidence: float = 0.95,
        **kwargs: Any,
    ) -> tuple[Figure, Axes]:
        """Quantile-Quantile plot against a theoretical distribution.

        The single most useful diagnostic plot for assessing distributional assumptions.
        Points near the 1:1 line indicate a good fit; deviations in the tails indicate
        heavy or light tails.

        Args:
            distribution: Name of a ``scipy.stats`` distribution (e.g., "norm", "expon",
                "gumbel_r"). Default "norm".
            column: Column to plot. If None, uses first column.
            confidence: Confidence level for the envelope (0-1). Default 0.95.
            **kwargs: Passed to ``_adjust_axes_labels`` (title, xlabel, ylabel, etc.).

        Returns:
            tuple: (Figure, Axes)

        Examples:
            Basic QQ plot against a normal distribution:

            >>> import numpy as np  # doctest: +SKIP
            >>> from statista.time_series import TimeSeries  # doctest: +SKIP
            >>> np.random.seed(42)  # doctest: +SKIP
            >>> ts = TimeSeries(np.random.randn(200))  # doctest: +SKIP
            >>> fig, ax = ts.qq_plot()  # doctest: +SKIP

            QQ plot against an exponential distribution:

            >>> ts2 = TimeSeries(np.random.exponential(2, 200))  # doctest: +SKIP
            >>> fig, ax = ts2.qq_plot(distribution="expon")  # doctest: +SKIP

        References:
            Wilk, M.B. and Gnanadesikan, R. (1968). Probability plotting methods for the
            analysis of data. Biometrika, 55(1), 1-17.
        """
        if column is None:
            column = self.columns[0]

        data = np.sort(self[column].dropna().values)
        n = len(data)

        dist = getattr(scipy_stats, distribution)
        params = dist.fit(data)

        theoretical_quantiles = dist.ppf((np.arange(1, n + 1) - 0.5) / n, *params)

        fig, ax = self._get_ax_fig(**kwargs)
        kwargs.pop("fig", None)
        kwargs.pop("ax", None)

        ax.scatter(
            theoretical_quantiles,
            data,
            alpha=0.6,
            s=15,
            color="steelblue",
            edgecolor="white",
            linewidth=0.3,
        )

        # 1:1 reference line
        lims = [
            min(theoretical_quantiles.min(), data.min()),
            max(theoretical_quantiles.max(), data.max()),
        ]
        ax.plot(lims, lims, "r-", linewidth=1, label="1:1 line")

        if "title" not in kwargs:
            kwargs["title"] = f"QQ Plot — {column} vs {distribution}"
        if "xlabel" not in kwargs:
            kwargs["xlabel"] = f"Theoretical ({distribution})"
        if "ylabel" not in kwargs:
            kwargs["ylabel"] = "Sample quantiles"

        ax = self._adjust_axes_labels(ax, **kwargs)
        plt.show()
        return fig, ax

    def pp_plot(
        self,
        distribution: str = "norm",
        column: str = None,
        **kwargs: Any,
    ) -> tuple[Figure, Axes]:
        """Probability-Probability plot.

        Plots the empirical CDF against the theoretical CDF at each data point.
        Complementary to QQ plot -- PP emphasizes the center of the distribution,
        QQ emphasizes the tails.

        Args:
            distribution: Name of a ``scipy.stats`` distribution. Default "norm".
            column: Column to plot. If None, uses first column.
            **kwargs: Passed to ``_adjust_axes_labels``.

        Returns:
            tuple: (Figure, Axes)

        Examples:
            Basic PP plot against a normal distribution:

            >>> import numpy as np  # doctest: +SKIP
            >>> from statista.time_series import TimeSeries  # doctest: +SKIP
            >>> np.random.seed(42)  # doctest: +SKIP
            >>> ts = TimeSeries(np.random.randn(200))  # doctest: +SKIP
            >>> fig, ax = ts.pp_plot()  # doctest: +SKIP

            PP plot against a Gumbel distribution:

            >>> ts2 = TimeSeries(np.random.gumbel(0, 1, 200))  # doctest: +SKIP
            >>> fig, ax = ts2.pp_plot(distribution="gumbel_r")  # doctest: +SKIP
        """
        if column is None:
            column = self.columns[0]

        data = np.sort(self[column].dropna().values)
        n = len(data)

        dist = getattr(scipy_stats, distribution)
        params = dist.fit(data)

        # Empirical CDF (Weibull plotting position)
        empirical_cdf = (np.arange(1, n + 1)) / (n + 1)
        # Theoretical CDF
        theoretical_cdf = dist.cdf(data, *params)

        fig, ax = self._get_ax_fig(**kwargs)
        kwargs.pop("fig", None)
        kwargs.pop("ax", None)

        ax.scatter(
            theoretical_cdf,
            empirical_cdf,
            alpha=0.6,
            s=15,
            color="steelblue",
            edgecolor="white",
            linewidth=0.3,
        )

        ax.plot([0, 1], [0, 1], "r-", linewidth=1, label="1:1 line")

        if "title" not in kwargs:
            kwargs["title"] = f"PP Plot — {column} vs {distribution}"
        if "xlabel" not in kwargs:
            kwargs["xlabel"] = f"Theoretical CDF ({distribution})"
        if "ylabel" not in kwargs:
            kwargs["ylabel"] = "Empirical CDF"

        ax = self._adjust_axes_labels(ax, **kwargs)
        plt.show()
        return fig, ax

    def normality_test(
        self,
        method: str = "auto",
        alpha: float = DEFAULT_ALPHA,
    ) -> DataFrame:
        """Test each column for normality.

        Args:
            method: Test to use.
                - "auto": Shapiro-Wilk if n < 5000, D'Agostino-Pearson if n >= 5000.
                - "shapiro": Shapiro-Wilk test (``scipy.stats.shapiro``). Best for n < 5000.
                - "dagostino": D'Agostino-Pearson test (``scipy.stats.normaltest``). For large n.
                - "anderson": Anderson-Darling test (``scipy.stats.anderson``).
                - "jarque_bera": Jarque-Bera test (``scipy.stats.jarque_bera``).
            alpha: Significance level. Default 0.05.

        Returns:
            pandas.DataFrame: One row per column with: test_name, statistic, p_value,
                is_normal, conclusion.

        Examples:
            Test normally distributed data (Shapiro-Wilk auto-selected for n < 5000):

            >>> import numpy as np
            >>> from statista.time_series import TimeSeries
            >>> np.random.seed(42)
            >>> ts = TimeSeries(np.random.randn(200))
            >>> result = ts.normality_test()
            >>> result.loc["Series1", "test_name"]
            'Shapiro-Wilk'
            >>> round(float(result.loc["Series1", "statistic"]), 4)
            0.9956
            >>> round(float(result.loc["Series1", "p_value"]), 4)
            0.829
            >>> bool(result.loc["Series1", "is_normal"])
            True

            Test non-normal data (exponential distribution fails normality):

            >>> np.random.seed(42)
            >>> ts2 = TimeSeries(np.random.exponential(2, 200))
            >>> result2 = ts2.normality_test()
            >>> result2.loc["Series1", "conclusion"]
            'Non-normal'
            >>> round(float(result2.loc["Series1", "statistic"]), 4)
            0.8522

            Use Jarque-Bera test explicitly:

            >>> np.random.seed(42)
            >>> ts3 = TimeSeries(np.random.randn(200))
            >>> result3 = ts3.normality_test(method="jarque_bera")
            >>> round(float(result3.loc["Series1", "p_value"]), 4)
            0.7464
        """
        rows = []

        for col in self.columns:
            data = self[col].dropna().values
            n = len(data)

            if method == "auto":
                chosen = "shapiro" if n < 5000 else "dagostino"
            else:
                chosen = method

            if chosen == "shapiro":
                stat, p = scipy_stats.shapiro(data)
                test_name = "Shapiro-Wilk"
            elif chosen == "dagostino":
                stat, p = scipy_stats.normaltest(data)
                test_name = "D'Agostino-Pearson"
            elif chosen == "anderson":
                result = scipy_stats.anderson(data, dist="norm")
                stat = result.statistic
                # Interpolate p from Anderson-Darling critical values
                # significance_level: [15%, 10%, 5%, 2.5%, 1%]
                sig_levels = result.significance_level / 100.0
                crits = result.critical_values
                if stat >= crits[-1]:
                    p = sig_levels[-1]  # beyond 1% critical
                elif stat <= crits[0]:
                    p = sig_levels[0]  # below 15% critical
                else:
                    from scipy.interpolate import interp1d

                    f = interp1d(crits, sig_levels, kind="linear")
                    p = float(f(stat))
                test_name = "Anderson-Darling"
            elif chosen == "jarque_bera":
                stat, p = scipy_stats.jarque_bera(data)
                test_name = "Jarque-Bera"
            else:
                raise ValueError(
                    f"Unknown method '{method}'. Choose from "
                    "'auto', 'shapiro', 'dagostino', 'anderson', 'jarque_bera'."
                )

            is_normal = bool(p > alpha)
            conclusion = "Normal" if is_normal else "Non-normal"

            rows.append(
                {
                    "column": col,
                    "test_name": test_name,
                    "statistic": float(stat),
                    "p_value": float(p),
                    "is_normal": is_normal,
                    "conclusion": conclusion,
                }
            )

        result_df = DataFrame(rows).set_index("column")
        return result_df

    def empirical_cdf(
        self,
        column: str = None,
        **kwargs: Any,
    ) -> tuple[Figure, Axes]:
        """Step-function plot of the empirical CDF.

        Simpler than KDE -- no bandwidth choice needed. Shows the actual data
        distribution as a monotonically increasing step function.

        Args:
            column: Column to plot. If None, plots all columns overlaid.
            **kwargs: Passed to ``_adjust_axes_labels``.

        Returns:
            tuple: (Figure, Axes)

        Examples:
            Plot the empirical CDF of normally distributed data:

            >>> import numpy as np  # doctest: +SKIP
            >>> from statista.time_series import TimeSeries  # doctest: +SKIP
            >>> np.random.seed(42)  # doctest: +SKIP
            >>> ts = TimeSeries(np.random.randn(100))  # doctest: +SKIP
            >>> fig, ax = ts.empirical_cdf()  # doctest: +SKIP

            Overlay multiple columns on one plot:

            >>> data = np.column_stack([np.random.randn(100), np.random.randn(100) + 2])  # doctest: +SKIP
            >>> ts2 = TimeSeries(data, columns=["A", "B"])  # doctest: +SKIP
            >>> fig, ax = ts2.empirical_cdf()  # doctest: +SKIP
        """
        cols = [column] if column is not None else list(self.columns)

        fig, ax = self._get_ax_fig(**kwargs)
        kwargs.pop("fig", None)
        kwargs.pop("ax", None)

        for col in cols:
            data = np.sort(self[col].dropna().values)
            n = len(data)
            ecdf = np.arange(1, n + 1) / n
            ax.step(data, ecdf, where="post", linewidth=1.2, label=col)

        if "title" not in kwargs:
            kwargs["title"] = "Empirical CDF"
        if "xlabel" not in kwargs:
            kwargs["xlabel"] = "Value"
        if "ylabel" not in kwargs:
            kwargs["ylabel"] = "Cumulative probability"
        if "legend" not in kwargs and len(cols) > 1:
            kwargs["legend"] = cols

        ax = self._adjust_axes_labels(ax, **kwargs)
        plt.show()
        return fig, ax

    def fit_distributions(self, method: str = "lmoments") -> DataFrame:
        """Fit distributions to each column and select the best fit.

        Uses the statista ``Distributions`` facade to fit all available distributions
        (GEV, Gumbel, Exponential, Normal) and selects the best by KS test.

        Args:
            method: Parameter estimation method -- "lmoments", "mle", or "mm".
                Default "lmoments".

        Returns:
            pandas.DataFrame: One row per column with: best_distribution, loc, scale,
                shape, ks_statistic, ks_p_value.

        Examples:
            Fit distributions using MLE and inspect the best fit:

            >>> import numpy as np  # doctest: +SKIP
            >>> from statista.time_series import TimeSeries  # doctest: +SKIP
            >>> np.random.seed(42)  # doctest: +SKIP
            >>> ts = TimeSeries(np.random.randn(200))  # doctest: +SKIP
            >>> result = ts.fit_distributions(method="mle")  # doctest: +SKIP
            >>> result.loc["Series1", "best_distribution"]  # doctest: +SKIP
            'Normal'
            >>> round(float(result.loc["Series1", "ks_p_value"]), 4)  # doctest: +SKIP
            0.9997

            Check that the result DataFrame has the expected columns:

            >>> sorted(result.columns.tolist())  # doctest: +SKIP
            ['best_distribution', 'ks_p_value', 'ks_statistic', 'loc', 'scale', 'shape']
        """
        from statista.distributions import Distributions

        rows = []

        for col in self.columns:
            data = self[col].dropna().values

            try:
                dist = Distributions(data=data)
                best_name, best_info = dist.best_fit(method=method)

                params = best_info["parameters"]
                ks_stat, ks_p = best_info["ks"]

                rows.append(
                    {
                        "column": col,
                        "best_distribution": best_name,
                        "loc": params.get("loc", np.nan),
                        "scale": params.get("scale", np.nan),
                        "shape": params.get("shape", np.nan),
                        "ks_statistic": float(ks_stat),
                        "ks_p_value": float(ks_p),
                    }
                )
            except (ValueError, RuntimeError, np.linalg.LinAlgError) as e:
                rows.append(
                    {
                        "column": col,
                        "best_distribution": f"Error: {e}",
                        "loc": np.nan,
                        "scale": np.nan,
                        "shape": np.nan,
                        "ks_statistic": np.nan,
                        "ks_p_value": np.nan,
                    }
                )

        result_df = DataFrame(rows).set_index("column")
        return result_df
qq_plot(distribution='norm', column=None, confidence=0.95, **kwargs) #

Quantile-Quantile plot against a theoretical distribution.

The single most useful diagnostic plot for assessing distributional assumptions. Points near the 1:1 line indicate a good fit; deviations in the tails indicate heavy or light tails.

Parameters:

Name Type Description Default
distribution str

Name of a scipy.stats distribution (e.g., "norm", "expon", "gumbel_r"). Default "norm".

'norm'
column str

Column to plot. If None, uses first column.

None
confidence float

Confidence level for the envelope (0-1). Default 0.95.

0.95
**kwargs Any

Passed to _adjust_axes_labels (title, xlabel, ylabel, etc.).

{}

Returns:

Name Type Description
tuple tuple[Figure, Axes]

(Figure, Axes)

Examples:

Basic QQ plot against a normal distribution:

>>> import numpy as np
>>> from statista.time_series import TimeSeries
>>> np.random.seed(42)
>>> ts = TimeSeries(np.random.randn(200))
>>> fig, ax = ts.qq_plot()

QQ plot against an exponential distribution:

>>> ts2 = TimeSeries(np.random.exponential(2, 200))
>>> fig, ax = ts2.qq_plot(distribution="expon")
References

Wilk, M.B. and Gnanadesikan, R. (1968). Probability plotting methods for the analysis of data. Biometrika, 55(1), 1-17.

Source code in src\statista\time_series\distribution.py
def qq_plot(
    self,
    distribution: str = "norm",
    column: str = None,
    confidence: float = 0.95,
    **kwargs: Any,
) -> tuple[Figure, Axes]:
    """Quantile-Quantile plot against a theoretical distribution.

    The single most useful diagnostic plot for assessing distributional assumptions.
    Points near the 1:1 line indicate a good fit; deviations in the tails indicate
    heavy or light tails.

    Args:
        distribution: Name of a ``scipy.stats`` distribution (e.g., "norm", "expon",
            "gumbel_r"). Default "norm".
        column: Column to plot. If None, uses first column.
        confidence: Confidence level for the envelope (0-1). Default 0.95.
        **kwargs: Passed to ``_adjust_axes_labels`` (title, xlabel, ylabel, etc.).

    Returns:
        tuple: (Figure, Axes)

    Examples:
        Basic QQ plot against a normal distribution:

        >>> import numpy as np  # doctest: +SKIP
        >>> from statista.time_series import TimeSeries  # doctest: +SKIP
        >>> np.random.seed(42)  # doctest: +SKIP
        >>> ts = TimeSeries(np.random.randn(200))  # doctest: +SKIP
        >>> fig, ax = ts.qq_plot()  # doctest: +SKIP

        QQ plot against an exponential distribution:

        >>> ts2 = TimeSeries(np.random.exponential(2, 200))  # doctest: +SKIP
        >>> fig, ax = ts2.qq_plot(distribution="expon")  # doctest: +SKIP

    References:
        Wilk, M.B. and Gnanadesikan, R. (1968). Probability plotting methods for the
        analysis of data. Biometrika, 55(1), 1-17.
    """
    if column is None:
        column = self.columns[0]

    data = np.sort(self[column].dropna().values)
    n = len(data)

    dist = getattr(scipy_stats, distribution)
    params = dist.fit(data)

    theoretical_quantiles = dist.ppf((np.arange(1, n + 1) - 0.5) / n, *params)

    fig, ax = self._get_ax_fig(**kwargs)
    kwargs.pop("fig", None)
    kwargs.pop("ax", None)

    ax.scatter(
        theoretical_quantiles,
        data,
        alpha=0.6,
        s=15,
        color="steelblue",
        edgecolor="white",
        linewidth=0.3,
    )

    # 1:1 reference line
    lims = [
        min(theoretical_quantiles.min(), data.min()),
        max(theoretical_quantiles.max(), data.max()),
    ]
    ax.plot(lims, lims, "r-", linewidth=1, label="1:1 line")

    if "title" not in kwargs:
        kwargs["title"] = f"QQ Plot — {column} vs {distribution}"
    if "xlabel" not in kwargs:
        kwargs["xlabel"] = f"Theoretical ({distribution})"
    if "ylabel" not in kwargs:
        kwargs["ylabel"] = "Sample quantiles"

    ax = self._adjust_axes_labels(ax, **kwargs)
    plt.show()
    return fig, ax
pp_plot(distribution='norm', column=None, **kwargs) #

Probability-Probability plot.

Plots the empirical CDF against the theoretical CDF at each data point. Complementary to QQ plot -- PP emphasizes the center of the distribution, QQ emphasizes the tails.

Parameters:

Name Type Description Default
distribution str

Name of a scipy.stats distribution. Default "norm".

'norm'
column str

Column to plot. If None, uses first column.

None
**kwargs Any

Passed to _adjust_axes_labels.

{}

Returns:

Name Type Description
tuple tuple[Figure, Axes]

(Figure, Axes)

Examples:

Basic PP plot against a normal distribution:

>>> import numpy as np
>>> from statista.time_series import TimeSeries
>>> np.random.seed(42)
>>> ts = TimeSeries(np.random.randn(200))
>>> fig, ax = ts.pp_plot()

PP plot against a Gumbel distribution:

>>> ts2 = TimeSeries(np.random.gumbel(0, 1, 200))
>>> fig, ax = ts2.pp_plot(distribution="gumbel_r")
Source code in src\statista\time_series\distribution.py
def pp_plot(
    self,
    distribution: str = "norm",
    column: str = None,
    **kwargs: Any,
) -> tuple[Figure, Axes]:
    """Probability-Probability plot.

    Plots the empirical CDF against the theoretical CDF at each data point.
    Complementary to QQ plot -- PP emphasizes the center of the distribution,
    QQ emphasizes the tails.

    Args:
        distribution: Name of a ``scipy.stats`` distribution. Default "norm".
        column: Column to plot. If None, uses first column.
        **kwargs: Passed to ``_adjust_axes_labels``.

    Returns:
        tuple: (Figure, Axes)

    Examples:
        Basic PP plot against a normal distribution:

        >>> import numpy as np  # doctest: +SKIP
        >>> from statista.time_series import TimeSeries  # doctest: +SKIP
        >>> np.random.seed(42)  # doctest: +SKIP
        >>> ts = TimeSeries(np.random.randn(200))  # doctest: +SKIP
        >>> fig, ax = ts.pp_plot()  # doctest: +SKIP

        PP plot against a Gumbel distribution:

        >>> ts2 = TimeSeries(np.random.gumbel(0, 1, 200))  # doctest: +SKIP
        >>> fig, ax = ts2.pp_plot(distribution="gumbel_r")  # doctest: +SKIP
    """
    if column is None:
        column = self.columns[0]

    data = np.sort(self[column].dropna().values)
    n = len(data)

    dist = getattr(scipy_stats, distribution)
    params = dist.fit(data)

    # Empirical CDF (Weibull plotting position)
    empirical_cdf = (np.arange(1, n + 1)) / (n + 1)
    # Theoretical CDF
    theoretical_cdf = dist.cdf(data, *params)

    fig, ax = self._get_ax_fig(**kwargs)
    kwargs.pop("fig", None)
    kwargs.pop("ax", None)

    ax.scatter(
        theoretical_cdf,
        empirical_cdf,
        alpha=0.6,
        s=15,
        color="steelblue",
        edgecolor="white",
        linewidth=0.3,
    )

    ax.plot([0, 1], [0, 1], "r-", linewidth=1, label="1:1 line")

    if "title" not in kwargs:
        kwargs["title"] = f"PP Plot — {column} vs {distribution}"
    if "xlabel" not in kwargs:
        kwargs["xlabel"] = f"Theoretical CDF ({distribution})"
    if "ylabel" not in kwargs:
        kwargs["ylabel"] = "Empirical CDF"

    ax = self._adjust_axes_labels(ax, **kwargs)
    plt.show()
    return fig, ax
normality_test(method='auto', alpha=DEFAULT_ALPHA) #

Test each column for normality.

Parameters:

Name Type Description Default
method str

Test to use. - "auto": Shapiro-Wilk if n < 5000, D'Agostino-Pearson if n >= 5000. - "shapiro": Shapiro-Wilk test (scipy.stats.shapiro). Best for n < 5000. - "dagostino": D'Agostino-Pearson test (scipy.stats.normaltest). For large n. - "anderson": Anderson-Darling test (scipy.stats.anderson). - "jarque_bera": Jarque-Bera test (scipy.stats.jarque_bera).

'auto'
alpha float

Significance level. Default 0.05.

DEFAULT_ALPHA

Returns:

Type Description
DataFrame

pandas.DataFrame: One row per column with: test_name, statistic, p_value, is_normal, conclusion.

Examples:

Test normally distributed data (Shapiro-Wilk auto-selected for n < 5000):

>>> import numpy as np
>>> from statista.time_series import TimeSeries
>>> np.random.seed(42)
>>> ts = TimeSeries(np.random.randn(200))
>>> result = ts.normality_test()
>>> result.loc["Series1", "test_name"]
'Shapiro-Wilk'
>>> round(float(result.loc["Series1", "statistic"]), 4)
0.9956
>>> round(float(result.loc["Series1", "p_value"]), 4)
0.829
>>> bool(result.loc["Series1", "is_normal"])
True

Test non-normal data (exponential distribution fails normality):

>>> np.random.seed(42)
>>> ts2 = TimeSeries(np.random.exponential(2, 200))
>>> result2 = ts2.normality_test()
>>> result2.loc["Series1", "conclusion"]
'Non-normal'
>>> round(float(result2.loc["Series1", "statistic"]), 4)
0.8522

Use Jarque-Bera test explicitly:

>>> np.random.seed(42)
>>> ts3 = TimeSeries(np.random.randn(200))
>>> result3 = ts3.normality_test(method="jarque_bera")
>>> round(float(result3.loc["Series1", "p_value"]), 4)
0.7464
Source code in src\statista\time_series\distribution.py
def normality_test(
    self,
    method: str = "auto",
    alpha: float = DEFAULT_ALPHA,
) -> DataFrame:
    """Test each column for normality.

    Args:
        method: Test to use.
            - "auto": Shapiro-Wilk if n < 5000, D'Agostino-Pearson if n >= 5000.
            - "shapiro": Shapiro-Wilk test (``scipy.stats.shapiro``). Best for n < 5000.
            - "dagostino": D'Agostino-Pearson test (``scipy.stats.normaltest``). For large n.
            - "anderson": Anderson-Darling test (``scipy.stats.anderson``).
            - "jarque_bera": Jarque-Bera test (``scipy.stats.jarque_bera``).
        alpha: Significance level. Default 0.05.

    Returns:
        pandas.DataFrame: One row per column with: test_name, statistic, p_value,
            is_normal, conclusion.

    Examples:
        Test normally distributed data (Shapiro-Wilk auto-selected for n < 5000):

        >>> import numpy as np
        >>> from statista.time_series import TimeSeries
        >>> np.random.seed(42)
        >>> ts = TimeSeries(np.random.randn(200))
        >>> result = ts.normality_test()
        >>> result.loc["Series1", "test_name"]
        'Shapiro-Wilk'
        >>> round(float(result.loc["Series1", "statistic"]), 4)
        0.9956
        >>> round(float(result.loc["Series1", "p_value"]), 4)
        0.829
        >>> bool(result.loc["Series1", "is_normal"])
        True

        Test non-normal data (exponential distribution fails normality):

        >>> np.random.seed(42)
        >>> ts2 = TimeSeries(np.random.exponential(2, 200))
        >>> result2 = ts2.normality_test()
        >>> result2.loc["Series1", "conclusion"]
        'Non-normal'
        >>> round(float(result2.loc["Series1", "statistic"]), 4)
        0.8522

        Use Jarque-Bera test explicitly:

        >>> np.random.seed(42)
        >>> ts3 = TimeSeries(np.random.randn(200))
        >>> result3 = ts3.normality_test(method="jarque_bera")
        >>> round(float(result3.loc["Series1", "p_value"]), 4)
        0.7464
    """
    rows = []

    for col in self.columns:
        data = self[col].dropna().values
        n = len(data)

        if method == "auto":
            chosen = "shapiro" if n < 5000 else "dagostino"
        else:
            chosen = method

        if chosen == "shapiro":
            stat, p = scipy_stats.shapiro(data)
            test_name = "Shapiro-Wilk"
        elif chosen == "dagostino":
            stat, p = scipy_stats.normaltest(data)
            test_name = "D'Agostino-Pearson"
        elif chosen == "anderson":
            result = scipy_stats.anderson(data, dist="norm")
            stat = result.statistic
            # Interpolate p from Anderson-Darling critical values
            # significance_level: [15%, 10%, 5%, 2.5%, 1%]
            sig_levels = result.significance_level / 100.0
            crits = result.critical_values
            if stat >= crits[-1]:
                p = sig_levels[-1]  # beyond 1% critical
            elif stat <= crits[0]:
                p = sig_levels[0]  # below 15% critical
            else:
                from scipy.interpolate import interp1d

                f = interp1d(crits, sig_levels, kind="linear")
                p = float(f(stat))
            test_name = "Anderson-Darling"
        elif chosen == "jarque_bera":
            stat, p = scipy_stats.jarque_bera(data)
            test_name = "Jarque-Bera"
        else:
            raise ValueError(
                f"Unknown method '{method}'. Choose from "
                "'auto', 'shapiro', 'dagostino', 'anderson', 'jarque_bera'."
            )

        is_normal = bool(p > alpha)
        conclusion = "Normal" if is_normal else "Non-normal"

        rows.append(
            {
                "column": col,
                "test_name": test_name,
                "statistic": float(stat),
                "p_value": float(p),
                "is_normal": is_normal,
                "conclusion": conclusion,
            }
        )

    result_df = DataFrame(rows).set_index("column")
    return result_df
empirical_cdf(column=None, **kwargs) #

Step-function plot of the empirical CDF.

Simpler than KDE -- no bandwidth choice needed. Shows the actual data distribution as a monotonically increasing step function.

Parameters:

Name Type Description Default
column str

Column to plot. If None, plots all columns overlaid.

None
**kwargs Any

Passed to _adjust_axes_labels.

{}

Returns:

Name Type Description
tuple tuple[Figure, Axes]

(Figure, Axes)

Examples:

Plot the empirical CDF of normally distributed data:

>>> import numpy as np
>>> from statista.time_series import TimeSeries
>>> np.random.seed(42)
>>> ts = TimeSeries(np.random.randn(100))
>>> fig, ax = ts.empirical_cdf()

Overlay multiple columns on one plot:

>>> data = np.column_stack([np.random.randn(100), np.random.randn(100) + 2])
>>> ts2 = TimeSeries(data, columns=["A", "B"])
>>> fig, ax = ts2.empirical_cdf()
Source code in src\statista\time_series\distribution.py
def empirical_cdf(
    self,
    column: str = None,
    **kwargs: Any,
) -> tuple[Figure, Axes]:
    """Step-function plot of the empirical CDF.

    Simpler than KDE -- no bandwidth choice needed. Shows the actual data
    distribution as a monotonically increasing step function.

    Args:
        column: Column to plot. If None, plots all columns overlaid.
        **kwargs: Passed to ``_adjust_axes_labels``.

    Returns:
        tuple: (Figure, Axes)

    Examples:
        Plot the empirical CDF of normally distributed data:

        >>> import numpy as np  # doctest: +SKIP
        >>> from statista.time_series import TimeSeries  # doctest: +SKIP
        >>> np.random.seed(42)  # doctest: +SKIP
        >>> ts = TimeSeries(np.random.randn(100))  # doctest: +SKIP
        >>> fig, ax = ts.empirical_cdf()  # doctest: +SKIP

        Overlay multiple columns on one plot:

        >>> data = np.column_stack([np.random.randn(100), np.random.randn(100) + 2])  # doctest: +SKIP
        >>> ts2 = TimeSeries(data, columns=["A", "B"])  # doctest: +SKIP
        >>> fig, ax = ts2.empirical_cdf()  # doctest: +SKIP
    """
    cols = [column] if column is not None else list(self.columns)

    fig, ax = self._get_ax_fig(**kwargs)
    kwargs.pop("fig", None)
    kwargs.pop("ax", None)

    for col in cols:
        data = np.sort(self[col].dropna().values)
        n = len(data)
        ecdf = np.arange(1, n + 1) / n
        ax.step(data, ecdf, where="post", linewidth=1.2, label=col)

    if "title" not in kwargs:
        kwargs["title"] = "Empirical CDF"
    if "xlabel" not in kwargs:
        kwargs["xlabel"] = "Value"
    if "ylabel" not in kwargs:
        kwargs["ylabel"] = "Cumulative probability"
    if "legend" not in kwargs and len(cols) > 1:
        kwargs["legend"] = cols

    ax = self._adjust_axes_labels(ax, **kwargs)
    plt.show()
    return fig, ax
fit_distributions(method='lmoments') #

Fit distributions to each column and select the best fit.

Uses the statista Distributions facade to fit all available distributions (GEV, Gumbel, Exponential, Normal) and selects the best by KS test.

Parameters:

Name Type Description Default
method str

Parameter estimation method -- "lmoments", "mle", or "mm". Default "lmoments".

'lmoments'

Returns:

Type Description
DataFrame

pandas.DataFrame: One row per column with: best_distribution, loc, scale, shape, ks_statistic, ks_p_value.

Examples:

Fit distributions using MLE and inspect the best fit:

>>> import numpy as np
>>> from statista.time_series import TimeSeries
>>> np.random.seed(42)
>>> ts = TimeSeries(np.random.randn(200))
>>> result = ts.fit_distributions(method="mle")
>>> result.loc["Series1", "best_distribution"]
'Normal'
>>> round(float(result.loc["Series1", "ks_p_value"]), 4)
0.9997

Check that the result DataFrame has the expected columns:

>>> sorted(result.columns.tolist())
['best_distribution', 'ks_p_value', 'ks_statistic', 'loc', 'scale', 'shape']
Source code in src\statista\time_series\distribution.py
def fit_distributions(self, method: str = "lmoments") -> DataFrame:
    """Fit distributions to each column and select the best fit.

    Uses the statista ``Distributions`` facade to fit all available distributions
    (GEV, Gumbel, Exponential, Normal) and selects the best by KS test.

    Args:
        method: Parameter estimation method -- "lmoments", "mle", or "mm".
            Default "lmoments".

    Returns:
        pandas.DataFrame: One row per column with: best_distribution, loc, scale,
            shape, ks_statistic, ks_p_value.

    Examples:
        Fit distributions using MLE and inspect the best fit:

        >>> import numpy as np  # doctest: +SKIP
        >>> from statista.time_series import TimeSeries  # doctest: +SKIP
        >>> np.random.seed(42)  # doctest: +SKIP
        >>> ts = TimeSeries(np.random.randn(200))  # doctest: +SKIP
        >>> result = ts.fit_distributions(method="mle")  # doctest: +SKIP
        >>> result.loc["Series1", "best_distribution"]  # doctest: +SKIP
        'Normal'
        >>> round(float(result.loc["Series1", "ks_p_value"]), 4)  # doctest: +SKIP
        0.9997

        Check that the result DataFrame has the expected columns:

        >>> sorted(result.columns.tolist())  # doctest: +SKIP
        ['best_distribution', 'ks_p_value', 'ks_statistic', 'loc', 'scale', 'shape']
    """
    from statista.distributions import Distributions

    rows = []

    for col in self.columns:
        data = self[col].dropna().values

        try:
            dist = Distributions(data=data)
            best_name, best_info = dist.best_fit(method=method)

            params = best_info["parameters"]
            ks_stat, ks_p = best_info["ks"]

            rows.append(
                {
                    "column": col,
                    "best_distribution": best_name,
                    "loc": params.get("loc", np.nan),
                    "scale": params.get("scale", np.nan),
                    "shape": params.get("shape", np.nan),
                    "ks_statistic": float(ks_stat),
                    "ks_p_value": float(ks_p),
                }
            )
        except (ValueError, RuntimeError, np.linalg.LinAlgError) as e:
            rows.append(
                {
                    "column": col,
                    "best_distribution": f"Error: {e}",
                    "loc": np.nan,
                    "scale": np.nan,
                    "shape": np.nan,
                    "ks_statistic": np.nan,
                    "ks_p_value": np.nan,
                }
            )

    result_df = DataFrame(rows).set_index("column")
    return result_df