# Source code for simba.model.regression.metrics

from typing import Optional

import numpy as np

from simba.utils.checks import check_float, check_valid_array
from simba.utils.enums import Formats


def mean_absolute_percentage_error(y_true: np.ndarray,
                                   y_pred: np.ndarray,
                                   epsilon: float = 1e-10,
                                   weights: Optional[np.ndarray] = None) -> float:
    """
    Compute the Mean Absolute Percentage Error (MAPE), optionally weighted.

    Each element contributes ``abs((y_true - y_pred) / y_true)``. Exact zeros in ``y_true`` are
    replaced by ``epsilon`` *before* the error is computed, so the replacement value is used in
    both the numerator and the denominator.

    :param np.ndarray y_true: The array containing the true values (dependent variable) of the dataset. Should be a 1D numeric array of shape (n,).
    :param np.ndarray y_pred: The array containing the predicted values for the dataset. Should be a 1D numeric array of shape (n,) and of the same length as `y_true`.
    :param float epsilon: A small pseudovalue that replaces exact zeros in `y_true` to avoid division by zero errors. Default: 1e-10.
    :param Optional[np.ndarray] weights: An optional 1D array of weights to apply to each error. If provided, the weighted MAPE ``sum(error * weights) / sum(weights) * 100`` is returned.
    :return: The Mean Absolute Percentage Error (MAPE) as a float, in percentage format (i.e., multiplied by 100). A lower value indicates better prediction accuracy.
    :rtype: float

    .. note::
       Inputs are validated through ``check_valid_array`` / ``check_float``; invalid input is presumably
       reported by those helpers (their error types are defined outside this module).
       Weights that sum to zero are not guarded against and lead to a division by zero.

    :example:

    >>> x, y = np.random.random(size=(100000,)), np.random.random(size=(100000,))
    >>> mean_absolute_percentage_error(y_true=x, y_pred=y)
    """

    source = mean_absolute_percentage_error.__name__
    check_valid_array(data=y_true, source=source, accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    check_valid_array(data=y_pred, source=source, accepted_ndims=(1,), min_axis_0=y_true.shape[0], accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    check_float(name=source, value=epsilon)

    # Replace exact zeros so the division below is always defined. The substituted array is
    # deliberately used for the numerator as well, matching the established behaviour.
    safe_true = np.where(y_true == 0, epsilon, y_true)
    abs_pct_error = np.abs((safe_true - y_pred) / safe_true)

    if weights is None:
        return np.mean(abs_pct_error * 100)

    check_valid_array(data=weights, source=source, accepted_ndims=(1,), min_axis_0=y_true.shape[0], accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    # Weighted mean: normalise by the total weight, then express as a percentage.
    return (np.sum(abs_pct_error * weights) / np.sum(weights)) * 100


def mean_squared_error(y_true: np.ndarray,
                       y_pred: np.ndarray,
                       weights: Optional[np.ndarray] = None) -> float:
    """
    Compute the Mean Squared Error (MSE) between the true and predicted values.

    :param np.ndarray y_true: The array containing the true values (dependent variable) of the dataset. Should be a 1D numeric array of shape (n,).
    :param np.ndarray y_pred: The array containing the predicted values for the dataset. Should be a 1D numeric array of shape (n,) and of the same length as `y_true`.
    :param Optional[np.ndarray] weights: An optional 1D array of weights to apply to each squared error. If provided, the weighted MSE ``sum(squared_error * weights) / sum(weights)`` is returned.
    :return: The Mean Squared Error (MSE) as a float. A lower value indicates better model accuracy.
    :rtype: float

    .. note::
       Inputs are validated through ``check_valid_array``. Weights that sum to zero are not guarded
       against and lead to a division by zero.

    :example:

    >>> x, y = np.random.random(size=(100000,)), np.random.random(size=(100000,))
    >>> mean_squared_error(y_true=x, y_pred=y)
    """

    # Report validation problems under this function's own name (previously mis-attributed to MAPE).
    source = mean_squared_error.__name__
    check_valid_array(data=y_true, source=source, accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    check_valid_array(data=y_pred, source=source, accepted_ndims=(1,), min_axis_0=y_true.shape[0], accepted_dtypes=Formats.NUMERIC_DTYPES.value)

    squared_error = (y_true - y_pred) ** 2
    if weights is None:
        return np.mean(squared_error)

    check_valid_array(data=weights, source=source, accepted_ndims=(1,), min_axis_0=y_true.shape[0], accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    # Weighted mean: normalise the weighted squared errors by the total weight.
    return np.sum(squared_error * weights) / np.sum(weights)

def mean_absolute_error(y_true: np.ndarray,
                        y_pred: np.ndarray,
                        weights: Optional[np.ndarray] = None) -> float:
    """
    Compute the Mean Absolute Error (MAE) between the true and predicted values.

    :param np.ndarray y_true: A 1D numeric array of true values (ground truth).
    :param np.ndarray y_pred: A 1D numeric array of predicted values, of the same length as `y_true`.
    :param Optional[np.ndarray] weights: An optional 1D array of weights for each observation. If provided, the weighted MAE ``sum(abs_error * weights) / sum(weights)`` is returned.
    :return: The Mean Absolute Error (MAE) as a float. A lower value indicates a better fit.
    :rtype: float

    .. note::
       Inputs are validated through ``check_valid_array``. Weights that sum to zero are not guarded
       against and lead to a division by zero.

    :example:

    >>> x, y = np.random.random(size=(100000,)), np.random.random(size=(100000,))
    >>> mean_absolute_error(y_true=x, y_pred=y)
    """

    # Report validation problems under this function's own name (previously mis-attributed to MAPE).
    source = mean_absolute_error.__name__
    check_valid_array(data=y_true, source=source, accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    check_valid_array(data=y_pred, source=source, accepted_ndims=(1,), min_axis_0=y_true.shape[0], accepted_dtypes=Formats.NUMERIC_DTYPES.value)

    absolute_error = np.abs(y_true - y_pred)
    if weights is None:
        return np.mean(absolute_error)

    check_valid_array(data=weights, source=source, accepted_ndims=(1,), min_axis_0=y_true.shape[0], accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    # Weighted mean: normalise the weighted absolute errors by the total weight.
    return np.sum(absolute_error * weights) / np.sum(weights)


def r2_score(y_true: np.ndarray, y_pred: np.ndarray, weights: Optional[np.ndarray] = None) -> float:
    """
    Compute the R^2 (coefficient of determination) score.

    The score is ``1 - SS_residual / SS_total`` where ``SS_residual = sum((y_true - y_pred) ** 2)`` and
    ``SS_total = sum((y_true - mean(y_true)) ** 2)``. When `weights` is given, both sums and the mean
    of `y_true` are weighted.

    :param np.ndarray y_true: 1D numeric array of true values (dependent variable).
    :param np.ndarray y_pred: 1D numeric array of predicted values, same length as `y_true`.
    :param Optional[np.ndarray] weights: Optional 1D numeric array of weights for each observation.
    :return: The R^2 score as a float. A value closer to 1 indicates better fit.
    :rtype: float

    .. note::
       If `y_true` is constant, ``SS_total`` is zero and the division is undefined; no special-casing is
       applied, so the result is then non-finite (NaN or -inf).

    :example:

    >>> x, y = np.random.random(size=(100000,)), np.random.random(size=(100000,))
    >>> r2_score(y_true=x, y_pred=y)
    """

    source = r2_score.__name__
    check_valid_array(data=y_true, source=source, accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    check_valid_array(data=y_pred, source=source, accepted_ndims=(1,), min_axis_0=y_true.shape[0], accepted_dtypes=Formats.NUMERIC_DTYPES.value)

    if weights is None:
        y_mean = np.mean(y_true)
        ss_residual = np.sum((y_true - y_pred) ** 2)
        ss_total = np.sum((y_true - y_mean) ** 2)
    else:
        # Validate the weights dtype as well, consistent with the other metrics in this module.
        check_valid_array(data=weights, source=source, accepted_ndims=(1,), min_axis_0=y_true.shape[0], accepted_dtypes=Formats.NUMERIC_DTYPES.value)
        y_mean = np.average(y_true, weights=weights)
        ss_residual = np.sum(((y_true - y_pred) ** 2) * weights)
        ss_total = np.sum(((y_true - y_mean) ** 2) * weights)

    return 1 - (ss_residual / ss_total)


def root_mean_squared_error(y_true: np.ndarray,
                            y_pred: np.ndarray,
                            weights: Optional[np.ndarray] = None) -> float:
    """
    Compute the Root Mean Squared Error (RMSE) between the true and predicted values.

    :param np.ndarray y_true: The array containing the true values (dependent variable) of the dataset. Should be a 1D numeric array of shape (n,).
    :param np.ndarray y_pred: The array containing the predicted values for the dataset. Should be a 1D numeric array of shape (n,) and of the same length as `y_true`.
    :param Optional[np.ndarray] weights: An optional 1D array of weights to apply to each squared error. If provided, the square root of the weighted mean squared error ``sum(squared_error * weights) / sum(weights)`` is returned.
    :return: The Root Mean Squared Error (RMSE) as a float. A lower value indicates better model accuracy.
    :rtype: float

    .. note::
       Inputs are validated through ``check_valid_array``. Weights that sum to zero are not guarded
       against and lead to a division by zero.

    :example:

    >>> x, y = np.random.random(size=(100000,)), np.random.random(size=(100000,))
    >>> root_mean_squared_error(y_true=x, y_pred=y)
    """

    # Report validation problems under this function's own name (previously mis-attributed to MAPE).
    source = root_mean_squared_error.__name__
    check_valid_array(data=y_true, source=source, accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    check_valid_array(data=y_pred, source=source, accepted_ndims=(1,), min_axis_0=y_true.shape[0], accepted_dtypes=Formats.NUMERIC_DTYPES.value)

    squared_error = (y_true - y_pred) ** 2
    if weights is None:
        return np.sqrt(np.mean(squared_error))

    check_valid_array(data=weights, source=source, accepted_ndims=(1,), min_axis_0=y_true.shape[0], accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    # Weighted MSE first (normalised by total weight), then the square root.
    weighted_mse = np.sum(squared_error * weights) / np.sum(weights)
    return np.sqrt(weighted_mse)