Normal Distribution #

`statista.distributions.Normal` #

Bases: AbstractDistribution

Normal Distribution.

The probability density function (PDF) of the Normal distribution is:

\[ f(x; \mu, \sigma) = \frac{1}{\sigma \sqrt{2\pi}} \exp\left(-\frac{(x - \mu)^2}{2\sigma^2}\right) \]

Where \(\mu\) is the location (mean) parameter and \(\sigma\) is the scale (standard deviation) parameter.
The cumulative distribution function (CDF) is:

\[ F(x; \mu, \sigma) = \frac{1}{2}\left[1 + \mathrm{erf} \left(\frac{x - \mu}{\sigma \sqrt{2}}\right)\right] \]

Source code in src/statista/distributions/normal.py

class Normal(AbstractDistribution):
    """Normal (Gaussian) distribution.

    - The probability density function (PDF) of the Normal distribution is:

        $$
        f(x; \\mu, \\sigma) = \\frac{1}{\\sigma \\sqrt{2\\pi}}
        \\exp\\left(-\\frac{(x - \\mu)^2}{2\\sigma^2}\\right)
        $$

        Where \\(\\mu\\) is the location (mean) parameter and \\(\\sigma\\) is the scale
        (standard deviation) parameter.

    - The cumulative distribution function (CDF) is:

        $$
        F(x; \\mu, \\sigma) = \\frac{1}{2}\\left[1 + \\mathrm{erf}
        \\left(\\frac{x - \\mu}{\\sigma \\sqrt{2}}\\right)\\right]
        $$
    """

    def __init__(
        self,
        data: list | np.ndarray | None = None,
        parameters: Parameters | dict[str, float] | None = None,
    ):
        """Create a Normal distribution object.

        Both arguments are forwarded unchanged to the base-class constructor.

        Args:
            data (list | np.ndarray, optional):
                data time series. Default is None.
            parameters (Parameters | dict[str, float], optional):
                distribution parameters. Default is None.
                - loc: [numeric]
                    location (mean) parameter of the Normal distribution.
                - scale: [numeric]
                    scale (standard deviation) parameter of the Normal distribution.
                ```python
                Parameters(loc=val, scale=val)
                ```
        """
        super().__init__(data, parameters)

    @staticmethod
    def _pdf_eq(data: list | np.ndarray, parameters: Parameters) -> np.ndarray:
        """Evaluate the Normal pdf at `data` using `scipy.stats.norm.pdf`.

        Args:
            data (list | np.ndarray):
                values at which the density is evaluated.
            parameters (Parameters):
                object providing the `loc` and `scale` attributes.

        Returns:
            np.ndarray:
                density values returned by `norm.pdf`.

        Raises:
            ValueError: if `scale` is None or not strictly positive.
        """
        loc = parameters.loc
        scale = parameters.scale
        if scale is None or scale <= 0:
            raise ValueError(SCALE_PARAMETER_ERROR)

        return norm.pdf(data, loc=loc, scale=scale)

    def pdf(  # type: ignore[override]
        self,
        plot_figure: bool = False,
        parameters: Parameters | dict[str, float] | None = None,
        data: list[float] | np.ndarray | None = None,
        *args: Any,
        **kwargs: Any,
    ) -> tuple[np.ndarray, Figure, Any] | np.ndarray:
        """pdf.

        Returns the value of the Normal distribution's pdf with parameters loc and scale at x.
        The call is delegated to the base-class `pdf` with all arguments passed through.

        Args:
            parameters (Parameters, optional):
                if not provided, the parameters provided in the class initialization will be used. default is None.
                - loc: [numeric]
                    location parameter of the normal distribution.
                - scale: [numeric]
                    scale parameter of the normal distribution.
                ```python
                Parameters(loc=val, scale=val)
                ```
            data (np.ndarray):
                array if you want to calculate the pdf for different data than the time series given to the constructor
                method. default is None.
            plot_figure (bool):
                Default is False.
            kwargs (dict[str, Any]):
                fig_size: [tuple]
                    Default is (6, 5).
                xlabel: [str]
                    Default is "Actual data".
                ylabel: [str]
                    Default is "pdf".
                fontsize: [int]
                    Default is 15

        Returns:
            pdf (array):
                probability density function values.
            fig (matplotlib.figure.Figure):
                Figure object is returned only if `plot_figure` is True.
            ax (matplotlib.axes.Axes):
                Axes object is returned only if `plot_figure` is True.
        """
        return super().pdf(
            *args,
            parameters=parameters,
            data=data,
            plot_figure=plot_figure,
            **kwargs,
        )  # type: ignore[misc]

    @staticmethod
    def _cdf_eq(data: list | np.ndarray, parameters: Parameters) -> np.ndarray:
        """Evaluate the Normal cdf at `data` using `scipy.stats.norm.cdf`.

        Args:
            data (list | np.ndarray):
                values at which the cumulative probability is evaluated.
            parameters (Parameters):
                object providing the `loc` and `scale` attributes.

        Returns:
            np.ndarray:
                cumulative probabilities returned by `norm.cdf`.

        Raises:
            ValueError: if `scale` is None or not strictly positive.
        """
        loc = parameters.loc
        scale = parameters.scale
        if scale is None or scale <= 0:
            raise ValueError(SCALE_PARAMETER_ERROR)

        return norm.cdf(data, loc=loc, scale=scale)

    def cdf(  # type: ignore[override]
        self,
        plot_figure: bool = False,
        parameters: Parameters | dict[str, float] | None = None,
        data: list[float] | np.ndarray | None = None,
        *args: Any,
        **kwargs: Any,
    ) -> tuple[np.ndarray, Figure, Any] | np.ndarray:
        """cdf.

        cdf calculates the value of Normal distribution cdf with parameters loc and scale at x.
        The call is delegated to the base-class `cdf` with all arguments passed through.

        Args:
            parameters (Parameters, optional):
                if not provided, the parameters provided in the class initialization will be used. default is None.
                - loc (numeric):
                    location parameter of the Normal distribution.
                - scale (numeric):
                    scale parameter of the Normal distribution.
                ```python
                Parameters(loc=val, scale=val)
                ```
            data (np.ndarray):
                array if you want to calculate the cdf for different data than the time series given to the
                constructor method. default is None.
            plot_figure (bool):
                Default is False.
            kwargs (dict[str, Any]):
                fig_size (tuple):
                    Default is (6, 5).
                xlabel (str):
                    Default is "Actual data".
                ylabel (str):
                    Default is "cdf".
                fontsize (int):
                    Default is 15.

        Returns:
            cdf (array):
                cumulative distribution function values.
            fig (matplotlib.figure.Figure):
                Figure object is returned only if `plot_figure` is True.
            ax (matplotlib.axes.Axes):
                Axes object is returned only if `plot_figure` is True.
        """
        return super().cdf(
            *args,
            parameters=parameters,
            data=data,
            plot_figure=plot_figure,
            **kwargs,
        )  # type: ignore[misc]

    def fit_model(
        self,
        method: str = "mle",
        obj_func=None,
        threshold: int | float | None = None,
        test: bool = True,
    ) -> Parameters:
        """fit_model.

        fit_model estimates the loc and scale parameters of the Normal distribution from
        `self.data`, stores them in `self.parameters` and returns them.

        - 'mle' / 'mm': `scipy.stats.norm.fit` is called with the given method.
        - 'lmoments': the L-moments of the data are computed and passed to `Lmoments.normal`.
        - 'optimization': `obj_func` is minimised with `scipy.optimize.fmin`, starting from
          `[threshold, loc, scale]` where loc and scale come from a plain 'mle' fit; the
          second and third elements of the optimum are used as loc and scale.

        Background for the 'optimization' method: there are two likelihood functions (L1 and L2),
        one for values above some threshold (x>=C) and one for the values below (x < C), and the
        likeliest parameters are those at the max value of multiplication between the two
        functions max(L1*L2). In this case, L1 is still the product of the probability density
        function's values at xi, but L2 is the probability that the threshold value C will be
        exceeded (1-F(C)).

        Args:
            obj_func (function):
                objective function minimised by the 'optimization' method; called by
                `scipy.optimize.fmin` with the parameter vector and `self.data`.
            threshold (numeric):
                Value you want to consider only the greater values; used as the first element
                of the optimizer's starting vector.
            method (str):
                'mle', 'mm', 'lmoments', or 'optimization'. Default is 'mle'.
            test (bool):
                if True, the `ks` and `chisquare` tests are run after fitting. Default is True.

        Returns:
            Parameters:
                fitted loc and scale parameters of the Normal distribution.

        Raises:
            TypeError: if method is 'optimization' and `obj_func` or `threshold` is None.
            ValueError: if the method is not one of the supported names.
        """
        # the base class returns the method name to dispatch on
        method = super().fit_model(method=method)  # type: ignore[assignment]

        param_list: Any
        if method in ("mle", "mm"):
            param_list = list(norm.fit(self.data, method=method))
        elif method == "lmoments":
            lmu = Lmoments(self.data).calculate()
            param_list = Lmoments.normal(lmu)
        elif method == "optimization":
            if obj_func is None or threshold is None:
                raise TypeError(OBJ_FUNCTION_THRESHOLD_ERROR)

            # a plain maximum-likelihood fit supplies the optimizer's starting point
            initial = norm.fit(self.data, method="mle")
            optimum = so.fmin(
                obj_func,
                [threshold, initial[0], initial[1]],
                args=(self.data,),
                maxiter=500,
                maxfun=500,
            )
            # element 0 belongs to the threshold slot; loc and scale follow it
            param_list = [optimum[1], optimum[2]]
        else:
            raise ValueError(f"The given: {method} does not exist")

        param = Parameters(loc=param_list[0], scale=param_list[1])
        self.parameters = param

        if test:
            self.ks()
            self.chisquare()

        return param

    def inverse_cdf(
        self,
        cdf: np.ndarray | list[float] | None = None,
        parameters: Parameters | dict[str, float] | None = None,
    ) -> np.ndarray:
        """Theoretical Estimate.

        Calculates the theoretical values (quantiles) for the given non-exceedance
        probabilities using `scipy.stats.norm.ppf`.

        Args:
            parameters (Parameters):
                Parameters(loc=val, scale=val). If None, `self.parameters` is used; a dict is
                converted with `Parameters(**parameters)`.

                - loc (numeric):
                    location parameter of the Normal distribution.
                - scale (numeric):
                    scale parameter of the Normal distribution.
            cdf (list):
                cumulative distribution function / non-exceedance probability, each value in [0, 1].

        Returns:
            np.ndarray:
                Values based on the theoretical distribution.

        Raises:
            ValueError: if `scale` is None or not strictly positive, or if any probability is
                outside [0, 1].
            TypeError: if `cdf` is None.
        """
        if parameters is None:
            parameters = self.parameters
        elif isinstance(parameters, dict):
            parameters = Parameters(**parameters)

        loc = parameters.loc
        scale = parameters.scale

        if scale is None or scale <= 0:
            raise ValueError(SCALE_PARAMETER_ERROR)

        # Comparing np.array(None) with a number fails with an opaque TypeError;
        # raise the same exception type with a message that names the problem.
        if cdf is None:
            raise TypeError("inverse_cdf requires the `cdf` probabilities, got None.")

        cdf = np.array(cdf)
        if np.any(cdf < 0) or np.any(cdf > 1):
            raise ValueError(CDF_INVALID_VALUE_ERROR)

        # the main equation from scipy
        return norm.ppf(cdf, loc=loc, scale=scale)

    def ks(self):
        """Kolmogorov-Smirnov (KS) test.

        Delegates to the base-class implementation.

        The smaller the D statistic, the more likely that the two samples are drawn from the
        same distribution. If Pvalue < significance level, reject the null hypothesis.

        Returns:
            Dstatic (numeric):
                The smaller the D statistic the more likely that the two samples are drawn from the same
                distribution
            Pvalue (numeric):
                IF Pvalue < significance level ------ reject the null hypothesis
        """
        return super().ks()

    def chisquare(self) -> tuple:
        """Chi-square test.

        Delegates to the base-class implementation and returns its result unchanged.
        """
        return super().chisquare()

`__init__(data=None, parameters=None)` #

Normal.

Parameters:

Name	Type	Description	Default
`data`	`list`	data time series.	`None`
`parameters`	`Parameters`	loc: [numeric] location (mean) parameter of the Normal distribution. scale: [numeric] scale (standard deviation) parameter of the Normal distribution. `Parameters(loc=val, scale=val)`	`None`

Source code in src/statista/distributions/normal.py

def __init__(
    self,
    data: list | np.ndarray | None = None,
    parameters: Parameters | dict[str, float] | None = None,
):
    """Create a Normal distribution object.

    Both arguments are forwarded unchanged to the base-class constructor.

    Args:
        data (list | np.ndarray, optional):
            data time series. Default is None.
        parameters (Parameters | dict[str, float], optional):
            distribution parameters. Default is None.
            - loc: [numeric]
                location (mean) parameter of the Normal distribution.
            - scale: [numeric]
                scale (standard deviation) parameter of the Normal distribution.
            ```python
            Parameters(loc=val, scale=val)
            ```
    """
    super().__init__(data, parameters)

`pdf(plot_figure=False, parameters=None, data=None, *args, **kwargs)` #

pdf.

Returns the value of the Normal distribution's pdf with parameters loc and scale at x.

Parameters:

Name	Type	Description	Default
`parameters`	`Parameters`	if not provided, the parameters provided in the class initialization will be used. default is None. - loc: [numeric] location parameter of the normal distribution. - scale: [numeric] scale parameter of the normal distribution. `Parameters(loc=val, scale=val)`	`None`
`data`	`ndarray`	array if you want to calculate the pdf for different data than the time series given to the constructor method. default is None.	`None`
`plot_figure`	`bool`	Default is False.	`False`
`kwargs`	`dict[str, Any]`	fig_size: [tuple] Default is (6, 5). xlabel: [str] Default is "Actual data". ylabel: [str] Default is "pdf". fontsize: [int] Default is 15	`{}`

Returns:

Name	Type	Description
`pdf`	`array`	probability density function pdf.
`fig`	`Figure`	Figure object is returned only if `plot_figure` is True.
`ax`	`Axes`	Axes object is returned only if `plot_figure` is True.

Source code in src/statista/distributions/normal.py

def pdf(  # type: ignore[override]
    self,
    plot_figure: bool = False,
    parameters: Parameters | dict[str, float] | None = None,
    data: list[float] | np.ndarray | None = None,
    *args: Any,
    **kwargs: Any,
) -> tuple[np.ndarray, Figure, Any] | np.ndarray:
    """pdf.

    Returns the value of the Normal distribution's pdf with parameters loc and scale at x.
    The call is delegated to the base-class `pdf` with all arguments passed through.

    Args:
        parameters (Parameters, optional):
            if not provided, the parameters provided in the class initialization will be used. default is None.
            - loc: [numeric]
                location parameter of the normal distribution.
            - scale: [numeric]
                scale parameter of the normal distribution.
            ```python
            Parameters(loc=val, scale=val)
            ```
        data (np.ndarray):
            values at which to calculate the pdf, if different from the time series given to the constructor.
            default is None.
        plot_figure (bool):
            Default is False.
        kwargs (dict[str, Any]):
            fig_size: [tuple]
                Default is (6, 5).
            xlabel: [str]
                Default is "Actual data".
            ylabel: [str]
                Default is "pdf".
            fontsize: [int]
                Default is 15

    Returns:
        pdf (array):
            probability density function values.
        fig (matplotlib.figure.Figure):
            Figure object is returned only if `plot_figure` is True.
        ax (matplotlib.axes.Axes):
            Axes object is returned only if `plot_figure` is True.
    """
    return super().pdf(
        *args,
        parameters=parameters,
        data=data,
        plot_figure=plot_figure,
        **kwargs,
    )  # type: ignore[misc]

`cdf(plot_figure=False, parameters=None, data=None, *args, **kwargs)` #

cdf.

cdf calculates the value of Normal distribution cdf with parameters loc and scale at x.

Parameters:

Name	Type	Description	Default
`parameters`	`Parameters`	if not provided, the parameters provided in the class initialization will be used. default is None. - loc (numeric): location parameter of the Normal distribution. - scale (numeric): scale parameter of the Normal distribution. `Parameters(loc=val, scale=val)`	`None`
`data`	`ndarray`	array if you want to calculate the cdf for different data than the time series given to the constructor method. default is None.	`None`
`plot_figure`	`bool`	Default is False.	`False`
`kwargs`	`dict[str, Any]`	fig_size (tuple): Default is (6, 5). xlabel (str): Default is "Actual data". ylabel (str): Default is "cdf". fontsize (int): Default is 15.	`{}`

Returns:

Name	Type	Description
`cdf`	`array`	cumulative distribution function values.
`fig`	`Figure`	Figure object is returned only if `plot_figure` is True.
`ax`	`Axes`	Axes object is returned only if `plot_figure` is True.

Source code in src/statista/distributions/normal.py

def cdf(  # type: ignore[override]
    self,
    plot_figure: bool = False,
    parameters: Parameters | dict[str, float] | None = None,
    data: list[float] | np.ndarray | None = None,
    *args: Any,
    **kwargs: Any,
) -> tuple[np.ndarray, Figure, Any] | np.ndarray:
    """cdf.

    cdf calculates the value of Normal distribution cdf with parameters loc and scale at x.
    The call is delegated to the base-class `cdf` with all arguments passed through.

    Args:
        parameters (Parameters, optional):
            if not provided, the parameters provided in the class initialization will be used. default is None.
            - loc (numeric):
                location parameter of the Normal distribution.
            - scale (numeric):
                scale parameter of the Normal distribution.
            ```python
            Parameters(loc=val, scale=val)
            ```
        data (np.ndarray):
            values at which to calculate the cdf, if different from the time series given to the constructor.
            default is None.
        plot_figure (bool):
            Default is False.
        kwargs (dict[str, Any]):
            fig_size (tuple):
                Default is (6, 5).
            xlabel (str):
                Default is "Actual data".
            ylabel (str):
                Default is "cdf".
            fontsize (int):
                Default is 15.

    Returns:
        cdf (array):
            cumulative distribution function values.
        fig (matplotlib.figure.Figure):
            Figure object is returned only if `plot_figure` is True.
        ax (matplotlib.axes.Axes):
            Axes object is returned only if `plot_figure` is True.
    """
    return super().cdf(
        *args,
        parameters=parameters,
        data=data,
        plot_figure=plot_figure,
        **kwargs,
    )  # type: ignore[misc]

`fit_model(method='mle', obj_func=None, threshold=None, test=True)` #

fit_model.

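fit_model estimates the distribution parameters using maximum likelihood ('mle'), the method of moments ('mm'), L-moments ('lmoments'), or, if an objective function and a threshold are entered as inputs, numerical optimization ('optimization').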

There are two likelihood functions (L1 and L2), one for values above some threshold (x>=C) and one for the values below (x < C), now the likeliest parameters are those at the max value of multiplication between two functions max(L1*L2).

In this case, the L1 is still the product of multiplication of probability density function's values at xi, but the L2 is the probability that threshold value C will be exceeded (1-F(C)).

Parameters:

Name	Type	Description	Default
`obj_func`	`function`	function to be used to get the distribution parameters.	`None`
`threshold`	`numeric`	Value you want to consider only the greater values.	`None`
`method`	`str`	'mle', 'mm', 'lmoments', or 'optimization'.	`'mle'`
`test`	`bool`	Default is True	`True`

Returns:

Name	Type	Description
`parameters`	`Parameters`	fitted loc and scale parameters of the Normal distribution.

Source code in src/statista/distributions/normal.py

def fit_model(
    self,
    method: str = "mle",
    obj_func=None,
    threshold: int | float | None = None,
    test: bool = True,
) -> Parameters:
    """fit_model.

    fit_model estimates the loc and scale parameters of the Normal distribution from
    `self.data`, stores them in `self.parameters` and returns them.

    - 'mle' / 'mm': `scipy.stats.norm.fit` is called with the given method.
    - 'lmoments': the L-moments of the data are computed and passed to `Lmoments.normal`.
    - 'optimization': `obj_func` is minimised with `scipy.optimize.fmin`, starting from
      `[threshold, loc, scale]` where loc and scale come from a plain 'mle' fit; the
      second and third elements of the optimum are used as loc and scale.

    Background for the 'optimization' method: there are two likelihood functions (L1 and L2),
    one for values above some threshold (x>=C) and one for the values below (x < C), and the
    likeliest parameters are those at the max value of multiplication between the two
    functions max(L1*L2). In this case, L1 is still the product of the probability density
    function's values at xi, but L2 is the probability that the threshold value C will be
    exceeded (1-F(C)).

    Args:
        obj_func (function):
            objective function minimised by the 'optimization' method; called by
            `scipy.optimize.fmin` with the parameter vector and `self.data`.
        threshold (numeric):
            Value you want to consider only the greater values; used as the first element
            of the optimizer's starting vector.
        method (str):
            'mle', 'mm', 'lmoments', or 'optimization'. Default is 'mle'.
        test (bool):
            if True, the `ks` and `chisquare` tests are run after fitting. Default is True.

    Returns:
        Parameters:
            fitted loc and scale parameters of the Normal distribution.

    Raises:
        TypeError: if method is 'optimization' and `obj_func` or `threshold` is None.
        ValueError: if the method is not one of the supported names.
    """
    # the base class returns the method name to dispatch on
    method = super().fit_model(method=method)  # type: ignore[assignment]

    estimates: Any
    if method in ("mle", "mm"):
        estimates = list(norm.fit(self.data, method=method))
    elif method == "lmoments":
        l_moments = Lmoments(self.data).calculate()
        estimates = Lmoments.normal(l_moments)
    elif method == "optimization":
        if obj_func is None or threshold is None:
            raise TypeError(OBJ_FUNCTION_THRESHOLD_ERROR)

        # a plain maximum-likelihood fit supplies the optimizer's starting point
        initial = norm.fit(self.data, method="mle")
        optimum = so.fmin(
            obj_func,
            [threshold, initial[0], initial[1]],
            args=(self.data,),
            maxiter=500,
            maxfun=500,
        )
        # element 0 belongs to the threshold slot; loc and scale follow it
        estimates = [optimum[1], optimum[2]]
    else:
        raise ValueError(f"The given: {method} does not exist")

    fitted = Parameters(loc=estimates[0], scale=estimates[1])
    self.parameters = fitted

    if test:
        self.ks()
        self.chisquare()

    return fitted

`inverse_cdf(cdf=None, parameters=None)` #

Theoretical Estimate.

The Theoretical Estimate method calculates the theoretical values (quantiles) for the given non-exceedance probabilities.

Parameters:

Name	Type	Description	Default
`parameters`	`Parameters`	Parameters(loc=val, scale=val) loc (numeric): location parameter of the Normal distribution. scale (numeric): scale parameter of the Normal distribution.	`None`
`cdf`	`list`	cumulative distribution function/ Non-Exceedance probability.	`None`

Returns:

Name	Type	Description
`numeric`	`ndarray`	Value based on the theoretical distribution

Source code in src/statista/distributions/normal.py

def inverse_cdf(
    self,
    cdf: np.ndarray | list[float] | None = None,
    parameters: Parameters | dict[str, float] | None = None,
) -> np.ndarray:
    """Theoretical Estimate.

    Calculates the theoretical values (quantiles) for the given non-exceedance
    probabilities using `scipy.stats.norm.ppf`.

    Args:
        parameters (Parameters):
            Parameters(loc=val, scale=val). If None, `self.parameters` is used; a dict is
            converted with `Parameters(**parameters)`.

            - loc (numeric):
                location parameter of the Normal distribution.
            - scale (numeric):
                scale parameter of the Normal distribution.
        cdf (list):
            cumulative distribution function / non-exceedance probability, each value in [0, 1].

    Returns:
        np.ndarray:
            Values based on the theoretical distribution.

    Raises:
        ValueError: if `scale` is None or not strictly positive, or if any probability is
            outside [0, 1].
        TypeError: if `cdf` is None.
    """
    if parameters is None:
        parameters = self.parameters
    elif isinstance(parameters, dict):
        parameters = Parameters(**parameters)

    loc = parameters.loc
    scale = parameters.scale

    if scale is None or scale <= 0:
        raise ValueError(SCALE_PARAMETER_ERROR)

    # Comparing np.array(None) with a number fails with an opaque TypeError;
    # raise the same exception type with a message that names the problem.
    if cdf is None:
        raise TypeError("inverse_cdf requires the `cdf` probabilities, got None.")

    cdf = np.array(cdf)
    if np.any(cdf < 0) or np.any(cdf > 1):
        raise ValueError(CDF_INVALID_VALUE_ERROR)

    # the main equation from scipy
    return norm.ppf(cdf, loc=loc, scale=scale)

`ks()` #

Kolmogorov-Smirnov (KS) test.

The smaller the D statistic, the more likely that the two samples are drawn from the same distribution. If Pvalue < significance level, reject the null hypothesis.

Returns:

Name	Type	Description
`Dstatic`	`numeric`	The smaller the D statistic, the more likely that the two samples are drawn from the same distribution
`Pvalue`	`numeric`	IF Pvalue < significance level ------ reject the null hypothesis

Source code in src/statista/distributions/normal.py

def ks(self):
    """Kolmogorov-Smirnov (KS) test.

    Delegates to the base-class implementation and returns its result unchanged.

    The smaller the D statistic, the more likely that the two samples are drawn from the same distribution.
    If Pvalue < significance level, reject the null hypothesis.

    Returns:
        Dstatic (numeric):
            The smaller the D statistic the more likely that the two samples are drawn from the same distribution
        Pvalue (numeric):
            IF Pvalue < significance level ------ reject the null hypothesis
    """
    return super().ks()

`chisquare()` #

chisquare test

Source code in src/statista/distributions/normal.py

def chisquare(self) -> tuple:
    """Chi-square test.

    Delegates to the base-class implementation and returns its result unchanged.
    """
    return super().chisquare()