support base_margin for xgboost #309

Merged: 13 commits, Sep 6, 2024
3 changes: 2 additions & 1 deletion NEWS.md
@@ -1,7 +1,8 @@
# mlr3learners (development version)

+ * feat: use `base_margin` in xgboost learners (#205)
* bugfix: validation for learner `lrn("regr.xgboost")` now works properly. Previously the training data was used.
- * feat: add weights for logistic regression again, which were incorrectlu removed
+ * feat: add weights for logistic regression again, which were incorrectly removed
in a previous release (#265)
* BREAKING_CHANGE: When using internal tuning for xgboost learners, the `eval_metric` must now be set.
This ensures that one makes a conscious decision about which performance metric to use for
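For context, a minimal usage sketch of the new parameter (hypothetical data and column name "offset"): `base_margin` names a feature column whose values are per-row raw-score offsets; the column stays in the feature set and is additionally passed to xgboost as the margin.

library(mlr3)
library(mlr3learners)

# "offset" holds a per-row margin on the logit scale
dat = data.frame(
  x1 = rnorm(100),
  x2 = rnorm(100),
  offset = rnorm(100),
  y = factor(sample(c("pos", "neg"), 100, replace = TRUE))
)
task = as_task_classif(dat, target = "y")

learner = lrn("classif.xgboost",
  objective = "binary:logistic", # the margin is only applied for binary objectives
  nrounds = 20,
  base_margin = "offset" # name of the feature column, forwarded via setinfo()
)
learner$train(task)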
25 changes: 19 additions & 6 deletions R/LearnerClassifXgboost.R
@@ -95,6 +95,7 @@ LearnerClassifXgboost = R6Class("LearnerClassifXgboost",
alpha = p_dbl(0, default = 0, tags = "train"),
approxcontrib = p_lgl(default = FALSE, tags = "predict"),
base_score = p_dbl(default = 0.5, tags = "train"),
+ base_margin = p_uty(default = NULL, special_vals = list(NULL), tags = "train"),
booster = p_fct(c("gbtree", "gblinear", "dart"), default = "gbtree", tags = c("train", "control")),
callbacks = p_uty(default = list(), tags = "train"),
colsample_bylevel = p_dbl(0, 1, default = 1, tags = "train"),
@@ -244,13 +245,22 @@ LearnerClassifXgboost = R6Class("LearnerClassifXgboost",
)

data = task$data(cols = task$feature_names)
- # recode to 0:1 to that for the binary case the positive class translates to 1 (#32)
+ # recode to 0:1 so that for the binary case the positive class translates to 1 (#32)
# note that task$truth() is guaranteed to have the factor levels in the right order
label = nlvls - as.integer(task$truth())
- data = xgboost::xgb.DMatrix(data = as_numeric_matrix(data), label = label)
+ xgb_data = xgboost::xgb.DMatrix(data = as_numeric_matrix(data), label = label)

if ("weights" %in% task$properties) {
- xgboost::setinfo(data, "weight", task$weights$weight)
+ xgboost::setinfo(xgb_data, "weight", task$weights$weight)
}

+ bm = pv$base_margin
+ pv$base_margin = NULL # silence xgb.train message
+ bm_is_feature = !is.null(bm) && is.character(bm) && (bm %in% task$feature_names)
+ # works only with binary classification objectives
+ obj_is_binary = startsWith(pv$objective, "binary")
+ if (bm_is_feature && obj_is_binary) {
+ xgboost::setinfo(xgb_data, "base_margin", data[[bm]])
+ }

# the last element in the watchlist is used as the early stopping set
@@ -262,8 +272,11 @@ LearnerClassifXgboost = R6Class("LearnerClassifXgboost",
if (!is.null(internal_valid_task)) {
test_data = internal_valid_task$data(cols = internal_valid_task$feature_names)
test_label = nlvls - as.integer(internal_valid_task$truth())
- test_data = xgboost::xgb.DMatrix(data = as_numeric_matrix(test_data), label = test_label)
- pv$watchlist = c(pv$watchlist, list(test = test_data))
+ xgb_test_data = xgboost::xgb.DMatrix(data = as_numeric_matrix(test_data), label = test_label)
+ if (bm_is_feature && obj_is_binary) {
+ xgboost::setinfo(xgb_test_data, "base_margin", test_data[[bm]])
+ }
+ pv$watchlist = c(pv$watchlist, list(test = xgb_test_data))
}

# set internal validation measure
@@ -293,7 +306,7 @@ LearnerClassifXgboost = R6Class("LearnerClassifXgboost",
pv$maximize = !measure$minimize
}

- invoke(xgboost::xgb.train, data = data, .args = pv)
+ invoke(xgboost::xgb.train, data = xgb_data, .args = pv)
},

.predict = function(task) {
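The setinfo() calls above use xgboost's standard mechanism for supplying an offset. A standalone sketch outside mlr3 (synthetic data, hypothetical names) of what the learner now does under the hood, including why the diff also attaches the margin to the watchlist matrix:

library(xgboost)

X = matrix(rnorm(200), ncol = 2)
y = rbinom(100, 1, plogis(X[, 1]))
margin = rep(0.5, 100) # raw logit-scale offset, added before the sigmoid

dtrain = xgboost::xgb.DMatrix(data = X, label = y)
xgboost::setinfo(dtrain, "base_margin", margin)

booster = xgboost::xgb.train(
  params = list(objective = "binary:logistic"),
  data = dtrain,
  nrounds = 10
)

# the margin is not stored in the model: it must be set again on every
# DMatrix used for evaluation or prediction, hence the extra setinfo()
# call on the validation matrix in the diff
dtest = xgboost::xgb.DMatrix(data = X)
xgboost::setinfo(dtest, "base_margin", margin)
head(predict(booster, dtest))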
21 changes: 16 additions & 5 deletions R/LearnerRegrXgboost.R
@@ -72,6 +72,7 @@ LearnerRegrXgboost = R6Class("LearnerRegrXgboost",
alpha = p_dbl(0, default = 0, tags = "train"),
approxcontrib = p_lgl(default = FALSE, tags = "predict"),
base_score = p_dbl(default = 0.5, tags = "train"),
+ base_margin = p_uty(default = NULL, special_vals = list(NULL), tags = "train"),
booster = p_fct(c("gbtree", "gblinear", "dart"), default = "gbtree", tags = "train"),
callbacks = p_uty(default = list(), tags = "train"),
colsample_bylevel = p_dbl(0, 1, default = 1, tags = "train"),
@@ -200,10 +201,17 @@ LearnerRegrXgboost = R6Class("LearnerRegrXgboost",

data = task$data(cols = task$feature_names)
target = task$data(cols = task$target_names)
- data = xgboost::xgb.DMatrix(data = as_numeric_matrix(data), label = data.matrix(target))
+ xgb_data = xgboost::xgb.DMatrix(data = as_numeric_matrix(data), label = data.matrix(target))

if ("weights" %in% task$properties) {
- xgboost::setinfo(data, "weight", task$weights$weight)
+ xgboost::setinfo(xgb_data, "weight", task$weights$weight)
}

+ bm = pv$base_margin
+ pv$base_margin = NULL # silence xgb.train message
+ bm_is_feature = !is.null(bm) && is.character(bm) && (bm %in% task$feature_names)
+ if (bm_is_feature) {
+ xgboost::setinfo(xgb_data, "base_margin", data[[bm]])
+ }

# the last element in the watchlist is used as the early stopping set
@@ -214,8 +222,11 @@ LearnerRegrXgboost = R6Class("LearnerRegrXgboost",
if (!is.null(internal_valid_task)) {
test_data = internal_valid_task$data(cols = task$feature_names)
test_target = internal_valid_task$data(cols = task$target_names)
- test_data = xgboost::xgb.DMatrix(data = as_numeric_matrix(test_data), label = data.matrix(test_target))
- pv$watchlist = c(pv$watchlist, list(test = test_data))
+ xgb_test_data = xgboost::xgb.DMatrix(data = as_numeric_matrix(test_data), label = data.matrix(test_target))
+ if (bm_is_feature) {
+ xgboost::setinfo(xgb_test_data, "base_margin", test_data[[bm]])
+ }
+ pv$watchlist = c(pv$watchlist, list(test = xgb_test_data))
}

# set internal validation measure
@@ -235,7 +246,7 @@ LearnerRegrXgboost = R6Class("LearnerRegrXgboost",
pv$maximize = !measure$minimize
}

- invoke(xgboost::xgb.train, data = data, .args = pv)
+ invoke(xgboost::xgb.train, data = xgb_data, .args = pv)
},
#' Returns the `$best_iteration` when early stopping is activated.
.predict = function(task) {
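In the regression learner the margin is applied unconditionally (no objective check) and acts as an additive offset on the raw prediction scale, much like an offset term in a GLM. A minimal sketch with synthetic data, all names hypothetical:

library(xgboost)

X = matrix(rnorm(200), ncol = 2)
known_offset = 2 * X[, 2] # component of the target we do not want to re-learn
y = 3 * X[, 1] + known_offset + rnorm(100, sd = 0.1)

dtrain = xgboost::xgb.DMatrix(data = X, label = y)
xgboost::setinfo(dtrain, "base_margin", known_offset)

# the booster only has to model the residual signal 3 * x1
booster = xgboost::xgb.train(
  params = list(objective = "reg:squarederror"),
  data = dtrain,
  nrounds = 20
)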
3 changes: 2 additions & 1 deletion inst/paramtest/test_paramtest_classif.xgboost.R
@@ -42,7 +42,8 @@ test_that("classif.xgboost", {
"label", # handled by mlr3
"weight", # handled by mlr3
"nthread", # handled by mlr3
"feval" # handled via eval_metric parameter
"feval", # handled via eval_metric parameter
"base_margin" # handled by mlr3
)

ParamTest = run_paramtest(learner, fun, exclude, tag = "train")
3 changes: 2 additions & 1 deletion inst/paramtest/test_paramtest_regr.xgboost.R
@@ -42,7 +42,8 @@ test_that("regr.xgboost", {
"label", # handled by mlr3
"weight", # handled by mlr3
"nthread", # handled by mlr3
"feval" # handled via eval_metric parameter
"feval", # handled via eval_metric parameter
"base_margin" # handled by mlr3
)

ParamTest = run_paramtest(learner, fun, exclude, tag = "train")
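Beyond the parameter-coverage tests, a quick behavioral check (hypothetical, not part of the diff or test suite) that the margin actually shifts the raw scores: fit two one-round boosters that differ only in base_margin and compare their untransformed predictions.

library(xgboost)

X = matrix(rnorm(200), ncol = 2)
y = rbinom(100, 1, 0.5)

d0 = xgboost::xgb.DMatrix(data = X, label = y)
d1 = xgboost::xgb.DMatrix(data = X, label = y)
xgboost::setinfo(d1, "base_margin", rep(1, 100))

params = list(objective = "binary:logistic")
b0 = xgboost::xgb.train(params = params, data = d0, nrounds = 1)
b1 = xgboost::xgb.train(params = params, data = d1, nrounds = 1)

# outputmargin = TRUE returns raw scores; the two differ both because d1's
# margin is added back at predict time and because training saw different
# effective starting scores
head(predict(b0, d0, outputmargin = TRUE))
head(predict(b1, d1, outputmargin = TRUE))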