stan-dev · fweber144 · Jun 27, 2023 · Jun 19, 2023 · Jun 26, 2023 · Jun 26, 2023
diff --git a/NEWS.md b/NEWS.md
@@ -9,6 +9,7 @@ If you read this from a place other than <https://mc-stan.org/projpred/news/inde
 * If an L1 search selects an interaction term before all involved lower-order interaction terms (including main-effect terms) have been selected, the predictor ranking is now automatically modified so that the lower-order interaction terms come before this interaction term. A corresponding warning is thrown, which may be deactivated by setting the global option `projpred.warn_L1_interactions` to `FALSE`. Previously, beginning with version 2.5.0, only a warning was thrown and this only if an L1 search selected an interaction term before all involved *main-effect* terms had been selected. (GitHub: #420)
 * Added a progress bar for `project()` (when using the built-in divergence minimizers). For this, `project()` has gained a new argument `verbose` which can also be controlled via the global option `projpred.verbose_project`. By default, the new progress bar is activated. (GitHub: #421)
 * Added a new argument `parallel` to `cv_varsel()`. With `parallel = TRUE`, costly parts of **projpred**'s cross-validation (CV) can be run in parallel. See the documentation of that new argument (and section "Note" of `cv_varsel()`'s documentation) for details. (GitHub: #422)
+* Throw a warning for issue #323 (for multilevel Gaussian models, the projection onto the full model can be instable). (GitHub: #426)
 
 ## Bug fixes
 

diff --git a/R/cv_varsel.R b/R/cv_varsel.R
@@ -173,10 +173,12 @@ cv_varsel.refmodel <- function(
   }
 
   refmodel <- object
+  nterms_all <- count_terms_in_formula(refmodel$formula) - 1L
   # Parse arguments which also exist in varsel():
   args <- parse_args_varsel(
     refmodel = refmodel, method = method, refit_prj = refit_prj,
-    nterms_max = nterms_max, nclusters = nclusters, search_terms = search_terms
+    nterms_max = nterms_max, nclusters = nclusters, search_terms = search_terms,
+    nterms_all = nterms_all
   )
   method <- args$method
   refit_prj <- args$refit_prj
@@ -260,7 +262,7 @@ cv_varsel.refmodel <- function(
               y_wobs_test,
               nobs_test = nrow(y_wobs_test),
               summaries = sel_cv$summaries,
-              nterms_all = count_terms_in_formula(refmodel$formula) - 1L,
+              nterms_all,
               nterms_max,
               method,
               cv_method,

diff --git a/R/project.R b/R/project.R
@@ -252,6 +252,18 @@ project <- function(object, nterms = NULL, solution_terms = NULL,
     nclusters <- 1
   }
 
+  nterms_max <- max(nterms)
+  nterms_all <- count_terms_in_formula(refmodel$formula) - 1L
+  if (nterms_max == nterms_all &&
+      formula_contains_group_terms(refmodel$formula) &&
+      (refmodel$family$family == "gaussian" || refmodel$family$for_latent)) {
+    warning(
+      "In case of the Gaussian family (also in case of the latent projection) ",
+      "and multilevel terms, the projection onto the full model can be ",
+      "instable and even lead to an error, see GitHub issue #323."
+    )
+  }
+
   ## get the clustering or thinning
   if (refit_prj) {
     p_ref <- get_refdist(refmodel, ndraws = ndraws, nclusters = nclusters)

diff --git a/R/varsel.R b/R/varsel.R
@@ -213,11 +213,13 @@ varsel.refmodel <- function(object, d_test = NULL, method = NULL,
   }
 
   refmodel <- object
+  nterms_all <- count_terms_in_formula(refmodel$formula) - 1L
 
   # Parse arguments:
   args <- parse_args_varsel(
     refmodel = refmodel, method = method, refit_prj = refit_prj,
-    nterms_max = nterms_max, nclusters = nclusters, search_terms = search_terms
+    nterms_max = nterms_max, nclusters = nclusters, search_terms = search_terms,
+    nterms_all = nterms_all
   )
   method <- args$method
   refit_prj <- args$refit_prj
@@ -379,7 +381,7 @@ varsel.refmodel <- function(object, d_test = NULL, method = NULL,
               y_wobs_test,
               nobs_test,
               summaries = nlist(sub, ref),
-              nterms_all = count_terms_in_formula(refmodel$formula) - 1L,
+              nterms_all,
               nterms_max,
               method,
               cv_method = NULL,
@@ -425,7 +427,7 @@ select <- function(method, p_sel, refmodel, nterms_max, penalty, verbose, opt,
 # them in with the default values. The purpose of this function is to avoid
 # repeating the same code both in varsel() and cv_varsel().
 parse_args_varsel <- function(refmodel, method, refit_prj, nterms_max,
-                              nclusters, search_terms) {
+                              nclusters, search_terms, nterms_all) {
   search_terms_was_null <- is.null(search_terms)
   if (search_terms_was_null) {
     search_terms <- split_formula(refmodel$formula,
@@ -486,5 +488,18 @@ parse_args_varsel <- function(refmodel, method, refit_prj, nterms_max,
   }
   nterms_max <- min(max_nv_possible, nterms_max)
 
+  if (nterms_max == nterms_all && has_group_features &&
+      (refmodel$family$family == "gaussian" || refmodel$family$for_latent)) {
+    warning(
+      "In case of the Gaussian family (also in case of the latent projection) ",
+      "and multilevel terms, the projection onto the full model can be ",
+      "instable and even lead to an error, see GitHub issue #323. If you ",
+      "experience this and may refrain from the projection onto the full ",
+      "model, set `nterms_max` to the number of predictor terms in the full ",
+      "model minus 1 (possibly accounting for submodel sizes skipped by ",
+      "custom `search_terms`)."
+    )
+  }
+
   return(nlist(method, refit_prj, nterms_max, nclusters, search_terms))
 }
diff --git a/tests/testthat/test_varsel.R b/tests/testthat/test_varsel.R
@@ -1105,9 +1105,11 @@ test_that("invalid `method` fails", {
 
 test_that("invalid `cv_method` fails", {
   for (tstsetup in names(refmods)) {
-    expect_error(cv_varsel(refmods[[tstsetup]], cv_method = "k-fold"),
-                 "^Unknown `cv_method`\\.$",
-                 info = tstsetup)
+    expect_error(
+      suppressWarnings(cv_varsel(refmods[[tstsetup]], cv_method = "k-fold")),
+      "^Unknown `cv_method`\\.$",
+      info = tstsetup
+    )
   }
 })