Source code for simba.model.regression.model

from itertools import product
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

from simba.model.regression.metrics import (mean_absolute_error,
                                            mean_absolute_percentage_error,
                                            mean_squared_error, r2_score,
                                            root_mean_squared_error)
from simba.utils.checks import (check_float, check_instance, check_int,
                                check_str, check_valid_array,
                                check_valid_dataframe, check_valid_lst)
from simba.utils.enums import Formats
from simba.utils.errors import DataHeaderError


def fit_xgb(x: pd.DataFrame, y: np.ndarray, mdl: xgb.XGBRegressor) -> xgb.XGBRegressor:
    """
    Fits an XGBoost regressor model to the given data.

    :param pd.DataFrame x: Input feature matrix where each row represents a sample and each column a feature. The data must have numeric types.
    :param np.ndarray y: Target values, must be a 1-dimensional array of numeric types with the same number of rows as `x`.
    :param xgb.XGBRegressor mdl: Defined xgb.XGBRegressor. E.g., can be defined with :func:`simba.model.regression.model.xgb_define`.
    :return: Trained XGBoost regressor model (the value returned by ``mdl.fit``).
    :rtype: xgb.XGBRegressor

    :example:
    >>> x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
    >>> y = np.random.randint(1, 6, (100,))
    >>> mdl = fit_xgb(x=x, y=y, mdl=xgb_define())
    """

    check_valid_dataframe(df=x, source=f'{fit_xgb.__name__} x', valid_dtypes=Formats.NUMERIC_DTYPES.value)
    check_valid_array(data=y, source=f'{fit_xgb.__name__} y', accepted_ndims=(1,), accepted_axis_0_shape=[x.shape[0]], accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    # Validate the model that was actually passed in; the previous code referenced an undefined name here.
    check_instance(source=f'{fit_xgb.__name__} mdl', instance=mdl, accepted_types=(xgb.XGBRegressor,))
    return mdl.fit(X=x, y=y)

def transform_xgb(x: pd.DataFrame, mdl: xgb.XGBRegressor) -> np.ndarray:
    """
    Transforms the input data using the provided XGBoost model by making predictions.

    Before predicting, the column names of ``x`` (cast to ``str``) are compared against the feature
    names stored in the model's booster: the names must match exactly and be in the same order.

    :param pd.DataFrame x: Input feature matrix where each row represents a sample and each column a feature. The data must have numeric types.
    :param xgb.XGBRegressor mdl: Trained XGBoost model to use for making predictions.
    :return: Predictions rounded to 2 decimal places.
    :rtype: np.ndarray
    :raises DataHeaderError: If ``x`` has features the model does not expect, lacks features the model expects, or has the expected features in a different order.

    :example:
    >>> x, y = pd.DataFrame(np.random.randint(0, 500, (100, 20))), np.random.randint(1, 6, (100,))
    >>> mdl = fit_xgb(x=x, y=y, mdl=xgb_define())
    >>> new_x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
    >>> results = transform_xgb(x=new_x, mdl=mdl)
    """

    check_instance(source=transform_xgb.__name__, instance=mdl, accepted_types=(xgb.XGBRegressor,))
    check_valid_dataframe(df=x, source=f'{transform_xgb.__name__} x', valid_dtypes=Formats.NUMERIC_DTYPES.value)
    expected_x_names = mdl.get_booster().feature_names
    # The booster may carry no feature names (feature_names is None), e.g. when the model was fitted
    # outside of fit_xgb on a bare array. There is then nothing to validate the header against.
    if expected_x_names is not None:
        expected_x_names = list(expected_x_names)
        new_x_names = [str(i) for i in x.columns]
        # Sets are used for O(1) membership tests only; order is checked separately below.
        expected_lookup, new_lookup = set(expected_x_names), set(new_x_names)
        missing_x_names = {i for i in expected_x_names if i not in new_lookup}
        additional_x_names = {i for i in new_x_names if i not in expected_lookup}
        if len(additional_x_names) > 0:
            raise DataHeaderError(msg=f'The new data has {len(additional_x_names)} features not expected by the model: {additional_x_names}', source=transform_xgb.__name__)
        if len(missing_x_names) > 0:
            raise DataHeaderError(msg=f'The new data are missing {len(missing_x_names)} features expected by the model: {missing_x_names}', source=transform_xgb.__name__)
        # Same names on both sides but a different sequence (or duplicated columns).
        if expected_x_names != new_x_names:
            raise DataHeaderError(msg='The new data contains features in the wrong order from the expected features', source=transform_xgb.__name__)
    return np.round(mdl.predict(x), 2)

def evaluate_xgb(y_pred: np.ndarray, y_true: np.ndarray, metrics: List[str], stratified: Optional[bool] = False) -> dict:
    """
    Evaluates the performance of a regression model (e.g., XGBoost) by calculating selected metrics.

    Optionally, the evaluation can be stratified by unique values in the true target variable (`y_true`),
    where performance is computed separately for each class/level.

    :param np.ndarray y_pred: Predicted values generated by the model. Must be 1-dimensional, numeric, and have the same length as `y_true`.
    :param np.ndarray y_true: True target values to compare the predictions against. Must be 1-dimensional and numeric.
    :param List[str] metrics: List of metrics to compute. Accepted values: 'MAPE', 'MSE', 'MAE', 'R2', 'RMSE'.
    :param stratified: If True, computes each metric for each unique class/level in `y_true`. If False (default), computes each metric for the entire dataset.
    :return: A dictionary keyed by metric name. Values are the metric results when `stratified` is False, or a nested dictionary ``{level: result}`` when `stratified` is True.
    :rtype: dict

    :example:
    >>> x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
    >>> y = np.random.randint(1, 6, (100,))
    >>> mdl = fit_xgb(x=x, y=y, mdl=xgb_define())
    >>> new_x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
    >>> y_pred = transform_xgb(x=new_x, mdl=mdl)
    >>> evaluate_xgb(y_pred=y_pred, y_true=y, metrics=['MAE', 'MAPE', 'RMSE', 'MSE'])
    """

    METRICS = {'MAPE': mean_absolute_percentage_error,
               'MSE': mean_squared_error,
               'MAE': mean_absolute_error,
               'R2': r2_score,
               'RMSE': root_mean_squared_error}

    check_valid_array(data=y_true, source=evaluate_xgb.__name__, accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    # Predictions must pair one-to-one with the targets. Enforce an exact length match rather than
    # a minimum length, otherwise surplus predictions would be silently ignored when stratifying.
    check_valid_array(data=y_pred, source=evaluate_xgb.__name__, accepted_ndims=(1,), accepted_axis_0_shape=[y_true.shape[0]], accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    check_valid_lst(data=metrics, source=f'{evaluate_xgb.__name__} metrics', valid_values=list(METRICS.keys()))

    if not stratified:
        return {metric: METRICS[metric](y_true=y_true, y_pred=y_pred) for metric in metrics}

    # The per-level sample masks do not depend on the metric, so build them once.
    level_masks = [(level, y_true == level) for level in np.unique(y_true)]
    results = {}
    for metric in metrics:
        metric_fn = METRICS[metric]
        results[metric] = {level: metric_fn(y_true=y_true[mask], y_pred=y_pred[mask]) for level, mask in level_masks}
    return results

def xgb_define(objective: str = 'reg:squarederror',
               n_estimators: int = 100,
               max_depth: int = 6,
               verbosity: int = 1,
               learning_rate: float = 0.3,
               eta: float = 0.3,
               gamma: float = 0.0,
               tree_method: str = 'auto') -> xgb.XGBRegressor:
    """
    Defines an XGBoost regressor.

    :param str objective: The learning objective for the model. One of 'reg:squarederror', 'reg:squaredlogerror', 'reg:logistic', 'reg:pseudohubererror'.
    :param int n_estimators: Number of boosting rounds. Must be greater than or equal to 1. Default is 100.
    :param int max_depth: Maximum depth of a tree. Increasing this value makes the model more complex and more likely to overfit. Must be greater than or equal to 1. Default is 6.
    :param int verbosity: Verbosity of the training process (0-3).
    :param float learning_rate: Step size shrinkage used to prevent overfitting. Lower values make the model more robust but require more boosting rounds. Must be between 0.1 and 1.0. Default is 0.3.
    :param float eta: Learning rate alias. Must be between 0.0 and 1.0. Default is 0.3.
    :param float gamma: Minimum loss reduction required to make a further partition on a leaf node of the tree. Larger values prevent overfitting. Must be greater than or equal to 0.0. Default is 0.0.
    :param str tree_method: The tree construction algorithm used in XGBoost. One of 'auto', 'exact', 'approx', 'hist', 'gpu_hist'.
    :return: An initialized (unfitted) XGBoost Regressor with the specified configuration.
    :rtype: xgb.XGBRegressor

    :example:
    >>> mdl = xgb_define(n_estimators=50, max_depth=3)
    """

    OBJECTIVES = ('reg:squarederror', 'reg:squaredlogerror', 'reg:logistic', 'reg:pseudohubererror')
    TREE_METHODS = ('auto', 'exact', 'approx', 'hist', 'gpu_hist')
    # Label validation errors with this function's name so that they point at the right call site.
    fn_name = xgb_define.__name__
    check_str(name=f'{fn_name} objective', value=objective, options=OBJECTIVES)
    check_str(name=f'{fn_name} tree_method', value=tree_method, options=TREE_METHODS)
    check_int(name=f'{fn_name} n_estimators', value=n_estimators, min_value=1)
    check_int(name=f'{fn_name} max_depth', value=max_depth, min_value=1)
    check_int(name=f'{fn_name} verbosity', value=verbosity, min_value=0, max_value=3)
    check_float(name=f'{fn_name} learning_rate', value=learning_rate, min_value=0.1, max_value=1.0)
    check_float(name=f'{fn_name} eta', value=eta, min_value=0.0, max_value=1.0)
    check_float(name=f'{fn_name} gamma', value=gamma, min_value=0.0)
    # NOTE(review): `eta` is described above as an alias of `learning_rate`, yet both are forwarded
    # unchanged - confirm which one XGBoost honours when the two values differ.
    return xgb.XGBRegressor(objective=objective,
                            max_depth=max_depth,
                            n_estimators=n_estimators,
                            verbosity=verbosity,
                            learning_rate=learning_rate,
                            eta=eta,
                            gamma=gamma,
                            tree_method=tree_method)

def xgb_grid_define(objective: Tuple[str, ...] = ('reg:squarederror',),
                    n_estimators: Tuple[int, ...] = (100,),
                    max_depth: Tuple[int, ...] = (6,),
                    verbosity: Tuple[int, ...] = (1,),
                    learning_rate: Tuple[float, ...] = (0.3,),
                    eta: Tuple[float, ...] = (0.3,),
                    gamma: Tuple[float, ...] = (0.0,),
                    tree_method: Tuple[str, ...] = ('auto',)) -> List[xgb.XGBRegressor]:
    """
    Defines one XGBoost regressor per combination of the passed hyperparameter values.

    Every argument is a tuple of candidate values for the identically named parameter of
    :func:`simba.model.regression.model.xgb_define`. The cartesian product of all tuples is taken and
    each combination is passed to ``xgb_define`` (which validates the individual values).

    :param Tuple[str, ...] objective: Candidate learning objectives.
    :param Tuple[int, ...] n_estimators: Candidate numbers of boosting rounds.
    :param Tuple[int, ...] max_depth: Candidate maximum tree depths.
    :param Tuple[int, ...] verbosity: Candidate verbosity levels.
    :param Tuple[float, ...] learning_rate: Candidate learning rates.
    :param Tuple[float, ...] eta: Candidate eta values.
    :param Tuple[float, ...] gamma: Candidate gamma values.
    :param Tuple[str, ...] tree_method: Candidate tree construction algorithms.
    :return: List of defined regressors, one per combination, in ``itertools.product`` order.
    :rtype: List[xgb.XGBRegressor]

    :example:
    >>> mdls = xgb_grid_define(max_depth=(6, 3), gamma=(0.0, 0.3))
    """

    grid = product(objective, n_estimators, max_depth, verbosity, learning_rate, eta, gamma, tree_method)
    return [xgb_define(objective=obj,
                       n_estimators=n_est,
                       max_depth=depth,
                       verbosity=verb,
                       learning_rate=lr,
                       eta=eta_val,
                       gamma=gamma_val,
                       tree_method=method)
            for obj, n_est, depth, verb, lr, eta_val, gamma_val, method in grid]


def xgb_grid_fit(x: pd.DataFrame, y: np.ndarray, mdls: List[xgb.XGBRegressor]) -> List[xgb.XGBRegressor]:
    """
    Fits every regressor in a list of defined XGBoost regressors to the same data.

    :param pd.DataFrame x: Input feature matrix where each row represents a sample and each column a feature. The data must have numeric types.
    :param np.ndarray y: Target values, must be a 1-dimensional array of numeric types with the same number of rows as `x`.
    :param List[xgb.XGBRegressor] mdls: Defined regressors. E.g., can be created with :func:`simba.model.regression.model.xgb_grid_define`.
    :return: The results of calling :func:`simba.model.regression.model.fit_xgb` on each regressor, in the same order as `mdls`.
    :rtype: List[xgb.XGBRegressor]

    :example:
    >>> x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
    >>> y = np.random.randint(1, 6, (100,))
    >>> mdls = xgb_grid_fit(x=x, y=y, mdls=xgb_grid_define(max_depth=(6, 3)))
    """

    # Label validation errors with this function's name rather than fit_xgb's.
    check_valid_dataframe(df=x, source=f'{xgb_grid_fit.__name__} x', valid_dtypes=Formats.NUMERIC_DTYPES.value)
    check_valid_array(data=y, source=f'{xgb_grid_fit.__name__} y', accepted_ndims=(1,), accepted_axis_0_shape=[x.shape[0]], accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    check_valid_lst(data=mdls, source=xgb_grid_fit.__name__, valid_dtypes=(xgb.XGBRegressor,))
    return [fit_xgb(x=x, y=y, mdl=mdl) for mdl in mdls]


# grid_df = pd.DataFrame(grid, columns=['objective', 'n_estimators', 'max_depth', 'learning_rate', 'eta', 'gamma'])
# grid_df['verbosity'], grid_df['tree_method'] = verbosity, tree_method
#
# grid_df.apply(fit, axis=1, result_type='expand')

#xgb_grid_define(max_depth=(6, 3), gamma=(0, 0.3))

# x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
# y = np.random.randint(1, 6, (100,))
# mdl = fit_xgb(x=x, y=y)
# new_x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
# y_pred = transform_xgb(x=new_x, model=mdl)
# evaluate_xgb(y_pred=y_pred, y_true=y, metrics=['MAE', 'MAPE', 'RMSE', 'MSE'], stratified=True)
#
# x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
# y = np.random.randint(1, 6, (100,))
# #mdl = fit_xgb(x=x, y=y)
#
# kfold_fit_xgb(x=x, y=y)

#for fold_cnt, (train_index, test_index) in enumerate(k_fold.split(x_data, y_data)):