Source code for gamspy.formulations.ml.gradient_boosting

from __future__ import annotations

import importlib
import uuid
from typing import TYPE_CHECKING

import gamspy as gp
import gamspy.formulations.utils as utils
from gamspy.exceptions import ValidationError
from gamspy.formulations.ml.decision_tree_struct import DecisionTreeStruct
from gamspy.formulations.ml.regression_tree import RegressionTree

if TYPE_CHECKING:
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.tree import DecisionTreeRegressor


class GradientBoosting:
    """
    Formulation generator for Gradient Boosted Trees in GAMSPy.

    Parameters
    ----------
    container : Container
        Container that will contain the new variable and equations.
    ensemble : GradientBoostingRegressor | list[DecisionTreeStruct]
        - A fitted `sklearn.ensemble.GradientBoostingRegressor` instance,
        - If `sklearn.ensemble.GradientBoostingRegressor` is not utilized,
          the ensembled trees information can be supplied via a non-empty
          list of `DecisionTreeStruct` dataclass instances, which represents
          the same components as those in `sklearn.tree`.
          See :meth:`DecisionTreeStruct <gamspy.formulations.DecisionTreeStruct>`
          for details on required attributes.
    name_prefix : str | None
        Prefix for generated GAMSPy symbols, by default None which means
        random prefix. Using the same name_prefix in different formulations
        causes name conflicts. Do not use the same name_prefix again.
    bias : float
        Bias term used to consolidate the final output using the contribution
        of each tree, by default 1. This is generally the average of the
        output data used for training and it is useful when `ensemble` is a
        `list[DecisionTreeStruct]`. Otherwise, this is deduced from
        `ensemble` itself.
    learning_rate : float
        Rate at which each tree's contribution is reduced, by default 0.1.
        This is useful when `ensemble` is a `list[DecisionTreeStruct]`.
        Otherwise, this is deduced from `ensemble` itself.

    Raises
    ------
    ValidationError
        If `container` is not a `gp.Container`; if `ensemble` is neither a
        fitted `GradientBoostingRegressor` nor a list of `DecisionTreeStruct`;
        if `ensemble` contains no trees; or if `sklearn.ensemble` is needed
        to validate `ensemble` but cannot be imported.

    Examples
    --------
    >>> import gamspy as gp
    >>> import numpy as np
    >>> from gamspy.math import dim
    >>> np.random.seed(42)
    >>> m = gp.Container()
    >>> in_data = np.random.randint(0, 10, size=(5, 2))
    >>> out_data = np.random.randint(1, 3, size=(5, 1))
    >>> tree1_attribute = {
    ...     "capacity": 3,
    ...     "children_left": np.array([1, -1, -1]),
    ...     "children_right": np.array([2, -1, -1]),
    ...     "feature": np.array([0, -2, -2]),
    ...     "n_features": 2,
    ...     "threshold": np.array([4.0, -2.0, -2.0]),
    ...     "value": np.array([[-4.4408921e-17], [-8.0000000e-01], [2.0000000e-01]]),
    ... }
    >>> tree2_attribute = {
    ...     "capacity": 3,
    ...     "children_left": np.array([1, -1, -1]),
    ...     "children_right": np.array([2, -1, -1]),
    ...     "feature": np.array([0, -2, -2]),
    ...     "n_features": 2,
    ...     "threshold": np.array([4.0, -2.0, -2.0]),
    ...     "value": np.array([[-8.8817842e-17], [-6.4000000e-01], [1.6000000e-01]]),
    ... }
    >>> gb_trees = [gp.formulations.DecisionTreeStruct(**tree1_attribute), gp.formulations.DecisionTreeStruct(**tree2_attribute)]
    >>> dt_model = gp.formulations.GradientBoosting(m, gb_trees)
    >>> x = gp.Variable(m, "x", domain=dim((5, 2)), type="positive")
    >>> x.up[:, :] = 10
    >>> y, eqns = dt_model(x)
    >>> set_of_samples = y.domain[0]
    >>> set_of_samples.name
    'DenseDim5_1'

    """

    def __init__(
        self,
        container: gp.Container,
        ensemble: GradientBoostingRegressor | list[DecisionTreeStruct],
        name_prefix: str | None = None,
        bias: float = 1,
        learning_rate: float = 0.1,
    ):
        if not isinstance(container, gp.Container):
            raise ValidationError(f"{container} is not a gp.Container.")

        self.container = container

        if name_prefix is None:
            # First group of a random UUID serves as a short random prefix.
            name_prefix = str(uuid.uuid4()).split("-")[0]

        self._name_prefix = name_prefix

        # One RegressionTree formulation per tree of the ensemble.
        self._list_of_trees: list[RegressionTree] = [
            RegressionTree(
                self.container,
                regressor=tree,
                name_prefix=self._name_prefix,
            )
            for tree in self._validate_ensemble(ensemble)
        ]

        if isinstance(ensemble, list):
            # list[DecisionTreeStruct]: the caller supplies both constants.
            self._bias = bias
            self._learning_rate = learning_rate
        else:
            # GradientBoostingRegressor: both constants come from the model.
            init_estimator = ensemble.init_
            if isinstance(init_estimator, str) and init_estimator == "zero":
                # With init="zero" sklearn stores the string itself in
                # `init_` and starts from zero raw predictions, so there is
                # no `constant_` attribute to read.
                self._bias = 0.0
            else:
                self._bias = init_estimator.constant_.flatten()[0]
            self._learning_rate = ensemble.learning_rate

    @staticmethod
    def _validate_ensemble(
        ensemble,
    ) -> list[DecisionTreeRegressor | DecisionTreeStruct]:
        """Return the individual trees of `ensemble` or raise ValidationError."""
        if isinstance(ensemble, list):
            # `all()` is True for an empty list, so emptiness has to be
            # rejected explicitly; otherwise `__call__` has no tree to
            # derive the output dimension from.
            if not ensemble:
                raise ValidationError(
                    "`ensemble` must contain at least one tree."
                )
            # A list never needs sklearn to be validated, so a list with
            # wrong items is rejected before attempting the import.
            if not all(
                isinstance(item, DecisionTreeStruct) for item in ensemble
            ):
                raise ValidationError(
                    f"{ensemble} must be an instance of either >sklearn.ensemble.GradientBoostingRegressor< or a list of >DecisionTreeStruct<"
                )
            return ensemble

        # sklearn is an optional dependency; import it only when required.
        try:
            sklearn_boosting = importlib.import_module("sklearn.ensemble")
        except ModuleNotFoundError:
            raise ValidationError(
                ">sklearn.ensemble< module not found."
            ) from None

        if not isinstance(
            ensemble, sklearn_boosting.GradientBoostingRegressor
        ):
            raise ValidationError(
                f"{ensemble} must be an instance of either >sklearn.ensemble.GradientBoostingRegressor< or a list of >DecisionTreeStruct<"
            )

        # `estimators_` is only looked up on the instance here; a missing
        # attribute is treated as "not fitted".
        if not hasattr(ensemble, "estimators_"):
            raise ValidationError(
                f"{ensemble} must be a trained/fitted instance of >sklearn.ensemble.GradientBoostingRegressor<."
            )

        return ensemble.estimators_.flatten()

    def __call__(
        self,
        input: gp.Parameter | gp.Variable,
        M: float | None = None,
    ) -> tuple[gp.Variable, list[gp.Equation]]:
        """
        Generate the output variable and equations of the ensemble.

        Every stored `RegressionTree` is called with `input` and `M`
        (and `is_ensemble=True`). The tree outputs are then combined into a
        new output variable through the equation
        ``bias + learning_rate * sum(tree outputs) == out``.

        Parameters
        ----------
        input : gp.Parameter | gp.Variable
            Input of the formulation. Its first domain entry is used as the
            set of samples of the output.
        M : float | None
            Forwarded unchanged to each `RegressionTree`, by default None.
            NOTE(review): presumably the big-M constant of the tree
            formulation; confirm against `RegressionTree`.

        Returns
        -------
        tuple[gp.Variable, list[gp.Equation]]
            The output variable and the list of all generated equations
            (those of every tree followed by the combining equation).

        """
        gb_out_list: list[gp.Variable] = []
        gb_eqn_list: list[gp.Equation] = []

        for regression_tree in self._list_of_trees:
            # The output-dimension set returned by the last tree is the one
            # used for the domain of the ensemble output below.
            dt_out, dt_eqn, set_of_output_dim = regression_tree(
                input, M, is_ensemble=True
            )
            gb_out_list.append(dt_out)
            gb_eqn_list += dt_eqn

        set_of_samples = input.domain[0]

        out = gp.Variable._constructor_bypass(
            self.container,
            name=utils._generate_name("v", self._name_prefix, "real_output"),
            domain=[set_of_samples, set_of_output_dim],
        )

        gb_eqn = gp.Equation._constructor_bypass(
            self.container,
            name=utils._generate_name("e", self._name_prefix, "gb_eqn"),
            domain=[set_of_samples, set_of_output_dim],
            description="perdicted out should be equal to the sum of gradient descent out times the learning rate.",
        )

        self.container._synch_with_gams(gams_to_gamspy=True)

        gb_eqn[...] = (
            self._bias + self._learning_rate * sum(gb_out_list) == out
        )
        gb_eqn_list.append(gb_eqn)

        return out, gb_eqn_list