From 468c9aed58b099516b608d81fdc8888b63de75c6 Mon Sep 17 00:00:00 2001 From: Yann McLatchie Date: Fri, 6 Oct 2023 12:54:32 +0100 Subject: [PATCH 01/17] add skeleton LFO varsel method --- NAMESPACE | 1 + R/cv_varsel.R | 208 ++++++++++++++++++++++++++++++ R/methods.R | 28 ++++ R/misc.R | 12 ++ man/as_draws_matrix.projection.Rd | 4 +- man/cv-indices.Rd | 7 + man/cv_varsel.Rd | 2 +- man/projpred-package.Rd | 2 +- 8 files changed, 260 insertions(+), 4 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index b88477458..6c47d2d51 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -63,6 +63,7 @@ export(extend_family) export(force_search_terms) export(get_refmodel) export(init_refmodel) +export(lfo_folds) export(predictor_terms) export(proj_linpred) export(proj_predict) diff --git a/R/cv_varsel.R b/R/cv_varsel.R index 8546474da..fc5dcd168 100644 --- a/R/cv_varsel.R +++ b/R/cv_varsel.R @@ -1367,6 +1367,214 @@ run_cvfun.refmodel <- function(object, return(structure(cvfits, folds = folds)) } +# Exact LFO-CV --------------------------------------------------------------- + +lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters, + ndraws_pred, nclusters_pred, refit_prj, penalty, + verbose, opt, L, cvfits, search_terms, parallel, ...) { + # Fetch the K reference model fits (or fit them now if not already done) and + # create objects of class `refmodel` from them (and also store the `omitted` + # indices): + list_cv <- get_lfo(refmodel, L = L, cvfits = cvfits, verbose = verbose) + K <- length(list_cv) + + if (refmodel$family$for_latent) { + # Need to set the latent response values in `refmodel$y` to `NA`s because + # `refmodel$y` resulted from applying `colMeans(posterior_linpred())` to the + # original (full-data) reference model fit, so using the `fold$omitted` + # subset of `refmodel$y` as (latent) response values in fold k of K would + # induce a dependency between training and test data: + refmodel$y <- rep(NA, refmodel$nobs) + } + y_wobs_test <- as.data.frame(refmodel[nms_y_wobs_test()]) + + verb_out("-----\nRunning the search and the performance evaluation for ", + "each of the K = ", K, " CV folds separately ...", verbose = verbose) + one_fold <- function(fold, + verbose_search = verbose && + getOption("projpred.extra_verbose", FALSE), + ...) { + # Run the search for the current fold: + search_path <- select( + refmodel = fold$refmodel, ndraws = ndraws, nclusters = nclusters, + method = method, nterms_max = nterms_max, penalty = penalty, + verbose = verbose_search, opt = opt, search_terms = search_terms, + est_runtime = FALSE, ... + ) + + # Run the performance evaluation for the submodels along the predictor + # ranking: + perf_eval_out <- perf_eval( + search_path = search_path, refmodel = fold$refmodel, regul = opt$regul, + refit_prj = refit_prj, ndraws = ndraws_pred, nclusters = nclusters_pred, + refmodel_fulldata = refmodel, indices_test = fold$omitted, ... + ) + + # Performance evaluation for the reference model of the current fold: + eta_test <- fold$refmodel$ref_predfun( + fold$refmodel$fit, + newdata = refmodel$fetch_data(obs = fold$omitted), + excl_offs = FALSE + ) + mu_test <- fold$refmodel$family$linkinv(eta_test) + summaries_ref <- weighted_summary_means( + y_wobs_test = y_wobs_test[fold$omitted, , drop = FALSE], + family = fold$refmodel$family, + wdraws = fold$refmodel$wdraws_ref, + mu = mu_test, + dis = fold$refmodel$dis, + cl_ref = seq_along(fold$refmodel$wdraws_ref) + ) + + return(nlist(predictor_ranking = search_path[["solution_terms"]], + summaries_sub = perf_eval_out[["sub_summaries"]], + summaries_ref, clust_used_eval = perf_eval_out[["clust_used"]], + nprjdraws_eval = perf_eval_out[["nprjdraws"]])) + } + if (!parallel) { + # Sequential case. Actually, we could simply use ``%do_projpred%` <- + # foreach::`%do%`` here and then proceed as in the parallel case, but that + # would require adding more "hard" dependencies (because packages 'foreach' + # and 'doRNG' would have to be moved from `Suggests:` to `Imports:`). + if (verbose) { + pb <- utils::txtProgressBar(min = 0, max = K, style = 3, initial = 0) + } + res_cv <- lapply(seq_along(list_cv), function(k) { + if (verbose) { + on.exit(utils::setTxtProgressBar(pb, k)) + } + one_fold(list_cv[[k]], ...) + }) + if (verbose) { + close(pb) + } + } else { + # Parallel case. + if (!requireNamespace("foreach", quietly = TRUE)) { + stop("Please install the 'foreach' package.") + } + if (!requireNamespace("doRNG", quietly = TRUE)) { + stop("Please install the 'doRNG' package.") + } + dot_args <- list(...) + `%do_projpred%` <- doRNG::`%dorng%` + res_cv <- foreach::foreach( + list_cv_k = list_cv, + .export = c("one_fold", "dot_args"), + .noexport = c("list_cv") + ) %do_projpred% { + do.call(one_fold, c(list(fold = list_cv_k, verbose_search = FALSE), + dot_args)) + } + } + verb_out("-----", verbose = verbose) + solution_terms_cv <- do.call(rbind, lapply(res_cv, "[[", "predictor_ranking")) + clust_used_eval <- element_unq(res_cv, nm = "clust_used_eval") + nprjdraws_eval <- element_unq(res_cv, nm = "nprjdraws_eval") + + # Handle the submodels' performance evaluation results: + sub_foldwise <- lapply(res_cv, "[[", "summaries_sub") + if (getRversion() >= package_version("4.2.0")) { + sub_foldwise <- simplify2array(sub_foldwise, higher = FALSE, except = NULL) + } else { + sub_foldwise <- simplify2array(sub_foldwise, higher = FALSE) + if (is.null(dim(sub_foldwise))) { + sub_dim <- dim(solution_terms_cv) + sub_dim[2] <- sub_dim[2] + 1L # +1 is for the empty model + dim(sub_foldwise) <- rev(sub_dim) + } + } + sub <- apply(sub_foldwise, 1, rbind2list) + idxs_sorted_by_fold <- unlist(lapply(list_cv, function(fold) { + fold$omitted + })) + idxs_sorted_by_fold_aug <- idxs_sorted_by_fold + if (!is.null(refmodel$family$cats)) { + idxs_sorted_by_fold_aug <- idxs_sorted_by_fold_aug + rep( + (seq_along(refmodel$family$cats) - 1L) * refmodel$nobs, + each = length(idxs_sorted_by_fold_aug) + ) + } + idxs_sorted_by_fold_flx <- idxs_sorted_by_fold_aug + if (refmodel$family$for_latent && !is.null(refmodel$family$cats)) { + idxs_sorted_by_fold_flx <- idxs_sorted_by_fold + } + sub <- lapply(sub, function(summ) { + summ$mu <- summ$mu[order(idxs_sorted_by_fold_flx)] + summ$lppd <- summ$lppd[order(idxs_sorted_by_fold)] + + # Add fold-specific weights (see the discussion at GitHub issue #94 for why + # this might have to be changed): + summ$wcv <- rep(1, length(summ$lppd)) + summ$wcv <- summ$wcv / sum(summ$wcv) + + if (!is.null(summ$oscale)) { + summ$oscale$mu <- summ$oscale$mu[order(idxs_sorted_by_fold_aug)] + summ$oscale$lppd <- summ$oscale$lppd[order(idxs_sorted_by_fold)] + summ$oscale$wcv <- summ$wcv + } + return(summ) + }) + + # Handle the reference model's performance evaluation results: + ref <- rbind2list(lapply(res_cv, "[[", "summaries_ref")) + ref$mu <- ref$mu[order(idxs_sorted_by_fold_flx)] + ref$lppd <- ref$lppd[order(idxs_sorted_by_fold)] + if (!is.null(ref$oscale)) { + ref$oscale$mu <- ref$oscale$mu[order(idxs_sorted_by_fold_aug)] + ref$oscale$lppd <- ref$oscale$lppd[order(idxs_sorted_by_fold)] + } + + return(nlist(solution_terms_cv, summaries = nlist(sub, ref), y_wobs_test, + clust_used_eval, nprjdraws_eval)) +} + +# Re-fit the reference model K times (once for each fold; `cvfun` case) or fetch +# the K reference model fits if already computed (`cvfits` case). This function +# will return a list of length K, where each element is a list with elements +# `refmodel` (output of init_refmodel()) and `omitted` (vector of indices of +# those observations which were left out for the corresponding fold). +get_lfo <- function(refmodel, L, cvfits, verbose) { + if (is.null(cvfits)) { + if (!is.null(refmodel$cvfun)) { + # In this case, cvfun() provided (and `cvfits` not), so run cvfun() now. + if (verbose && !inherits(refmodel, "datafit")) { + verb_out("-----\nRefitting the reference model K = ", K, " times ", + "(using the fold-wise training data) ...") + } + folds <- lfo_folds(refmodel$nobs, L = L, + seed = sample.int(.Machine$integer.max, 1)) + if (getOption("projpred.warn_kfold_refits", TRUE)) { + cvfits <- refmodel$cvfun(folds) + } else { + cvfits <- suppressWarnings(refmodel$cvfun(folds)) + } + verb_out("-----", verbose = verbose) + } else { + stop("For a reference model which is not of class `datafit`, either ", + "`cvfits` or `cvfun` needs to be provided for K-fold CV (see ", + "`?init_refmodel`).") + } + } else { + folds <- attr(cvfits, "folds") + } + return(lapply(seq_len(K), function(k) { + cvfit <- cvfits[[k]] + # Add the omitted observation indices for this fold (and the fold index `k` + # itself): + omitted_idxs <- which(folds == k) + if (is.list(cvfit)) { + cvfit$omitted <- omitted_idxs + cvfit$projpred_k <- k + } else { + attr(cvfit, "omitted") <- omitted_idxs + attr(cvfit, "projpred_k") <- k + } + return(list(refmodel = refmodel$cvrefbuilder(cvfit), + omitted = omitted_idxs)) + })) +} + # PSIS-LOO CV helpers ----------------------------------------------------- # ## decide which points to go through in the validation (i.e., which points diff --git a/R/methods.R b/R/methods.R index 4c102bd3b..4cecb6fea 100644 --- a/R/methods.R +++ b/R/methods.R @@ -2331,6 +2331,34 @@ cv_ids <- function(n, K, out = c("foldwise", "indices"), seed = NA) { return(cv) } +#' @rdname cv-indices +#' @export +#' @param T the total number of time observations +#' @param L the number of observations to fit the initial fold with +lfo_folds <- function(T, L, seed = NA) { + validate_lfo_folds(T, L) + + if (exists(".Random.seed", envir = .GlobalEnv)) { + rng_state_old <- get(".Random.seed", envir = .GlobalEnv) + } + if (!is.na(seed)) { + # Set seed, but ensure the old RNG state is restored on exit: + if (exists(".Random.seed", envir = .GlobalEnv)) { + on.exit(assign(".Random.seed", rng_state_old, envir = .GlobalEnv)) + } + set.seed(seed) + } + + ## create fold indices + # these indices, unlike for K-fold indicate the observations to use to the + # fit the model at the i-th fold + folds <- rep(1, T) + for (t in (L + 1) : (T)) { + folds[t] <- t - L + } + return(folds) +} + #' Retrieve the full-data solution path from a [varsel()] or [cv_varsel()] run #' or the predictor combination from a [project()] run #' diff --git a/R/misc.R b/R/misc.R index 62fde8991..dfda1685b 100644 --- a/R/misc.R +++ b/R/misc.R @@ -133,6 +133,18 @@ validate_num_folds <- function(k, n) { } } +validate_lfo_folds <- function(T, L) { + if (!is.numeric(L) || length(L) != 1 || !is_wholenumber(L)) { + stop("Number of folds must be a single integer value.") + } + if (L < 1) { + stop("Number of initial observations must be at least 1.") + } + if (L > T) { + stop("Number of initial observations cannot exceed total observations.") + } +} + validate_vsel_object_stats <- function(object, stats, resp_oscale = TRUE) { if (!inherits(object, c("vsel"))) { stop("The object is not a variable selection object. Run variable ", diff --git a/man/as_draws_matrix.projection.Rd b/man/as_draws_matrix.projection.Rd index 6432a207a..9d5b657d6 100644 --- a/man/as_draws_matrix.projection.Rd +++ b/man/as_draws_matrix.projection.Rd @@ -6,9 +6,9 @@ \title{Extract projected parameter draws and coerce to \code{draws_matrix} (see package \pkg{posterior})} \usage{ -\method{as_draws_matrix}{projection}(x, ...) +as_draws_matrix.projection(x, ...) -\method{as_draws}{projection}(x, ...) +as_draws.projection(x, ...) } \arguments{ \item{x}{An object of class \code{projection} (returned by \code{\link[=project]{project()}}, possibly diff --git a/man/cv-indices.Rd b/man/cv-indices.Rd index 9385bc750..52891c747 100644 --- a/man/cv-indices.Rd +++ b/man/cv-indices.Rd @@ -5,6 +5,7 @@ \alias{cv_folds} \alias{cvfolds} \alias{cv_ids} +\alias{lfo_folds} \title{Create cross-validation folds} \usage{ cv_folds(n, K, seed = NA) @@ -12,6 +13,8 @@ cv_folds(n, K, seed = NA) cvfolds(n, K, seed = NA) cv_ids(n, K, out = c("foldwise", "indices"), seed = NA) + +lfo_folds(T, L, seed = NA) } \arguments{ \item{n}{Number of observations.} @@ -26,6 +29,10 @@ results can be obtained again if needed. Passed to argument \code{seed} of \item{out}{Format of the output, either \code{"foldwise"} or \code{"indices"}. See below for details.} + +\item{T}{the total number of time observations} + +\item{L}{the number of observations to fit the initial fold with} } \value{ \code{\link[=cv_folds]{cv_folds()}} returns a vector of length \code{n} such that each element is diff --git a/man/cv_varsel.Rd b/man/cv_varsel.Rd index fa8ddb6e4..0b1f1d500 100644 --- a/man/cv_varsel.Rd +++ b/man/cv_varsel.Rd @@ -265,7 +265,7 @@ the backends from packages \pkg{doParallel}, \pkg{doMPI}, or \pkg{doFuture}. For GLMs, this CV parallelization should work reliably, but for other models (such as GLMMs), it may lead to excessive memory usage which in turn may crash the R session (on Unix systems, setting an -appropriate memory limit via \code{\link[unix:rlimit]{unix::rlimit_as()}} may avoid crashing the +appropriate memory limit via \code{\link[unix:rlimit_as]{unix::rlimit_as()}} may avoid crashing the whole machine). However, the problem of excessive memory usage is less pronounced for the CV parallelization than for the projection parallelization described in \link{projpred-package}. In that regard, the CV diff --git a/man/projpred-package.Rd b/man/projpred-package.Rd index f16ff53fa..dd410f922 100644 --- a/man/projpred-package.Rd +++ b/man/projpred-package.Rd @@ -86,7 +86,7 @@ submodel has no multilevel or additive predictor terms), but for all other types of submodels, the fitted submodel objects are quite big, which---when running in parallel---may lead to excessive memory usage which in turn may crash the R session (on Unix systems, setting an appropriate memory limit via -\code{\link[unix:rlimit]{unix::rlimit_as()}} may avoid crashing the whole machine). Thus, we currently +\code{\link[unix:rlimit_as]{unix::rlimit_as()}} may avoid crashing the whole machine). Thus, we currently cannot recommend parallelizing projections onto submodels which are GLMs (in this context, the latent projection onto a submodel without multilevel and without additive terms may be regarded as a projection onto a submodel which From 99f728cc1a89ab9dee95b4f47c6f9f6b5bf4dd1e Mon Sep 17 00:00:00 2001 From: Yann McLatchie Date: Fri, 6 Oct 2023 15:27:27 +0100 Subject: [PATCH 02/17] update comment --- R/cv_varsel.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/R/cv_varsel.R b/R/cv_varsel.R index fc5dcd168..6af179c5d 100644 --- a/R/cv_varsel.R +++ b/R/cv_varsel.R @@ -1529,11 +1529,12 @@ lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters, clust_used_eval, nprjdraws_eval)) } -# Re-fit the reference model K times (once for each fold; `cvfun` case) or fetch -# the K reference model fits if already computed (`cvfits` case). This function -# will return a list of length K, where each element is a list with elements -# `refmodel` (output of init_refmodel()) and `omitted` (vector of indices of -# those observations which were left out for the corresponding fold). +# Re-fit the reference model T-L-1 times (once for each observation from L to +# the second-to-last observation; `cvfun` case) or fetch the fits if already +# computed (`cvfits` case). This function will return a list of length T-L-1, +# where each element is a list with elements `refmodel` (output of +# init_refmodel()) and `omitted` (vector of indices of those observations which +# were left out for the corresponding fold). get_lfo <- function(refmodel, L, cvfits, verbose) { if (is.null(cvfits)) { if (!is.null(refmodel$cvfun)) { From 63b0475ccaf25a899e54e279b8ed8fc12841bc57 Mon Sep 17 00:00:00 2001 From: fweber144 Date: Thu, 26 Oct 2023 12:31:32 +0200 Subject: [PATCH 03/17] re-document --- man/as_draws_matrix.projection.Rd | 4 ++-- man/cv_varsel.Rd | 2 +- man/projpred-package.Rd | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/man/as_draws_matrix.projection.Rd b/man/as_draws_matrix.projection.Rd index 9d5b657d6..6432a207a 100644 --- a/man/as_draws_matrix.projection.Rd +++ b/man/as_draws_matrix.projection.Rd @@ -6,9 +6,9 @@ \title{Extract projected parameter draws and coerce to \code{draws_matrix} (see package \pkg{posterior})} \usage{ -as_draws_matrix.projection(x, ...) +\method{as_draws_matrix}{projection}(x, ...) -as_draws.projection(x, ...) +\method{as_draws}{projection}(x, ...) } \arguments{ \item{x}{An object of class \code{projection} (returned by \code{\link[=project]{project()}}, possibly diff --git a/man/cv_varsel.Rd b/man/cv_varsel.Rd index 0b1f1d500..fa8ddb6e4 100644 --- a/man/cv_varsel.Rd +++ b/man/cv_varsel.Rd @@ -265,7 +265,7 @@ the backends from packages \pkg{doParallel}, \pkg{doMPI}, or \pkg{doFuture}. For GLMs, this CV parallelization should work reliably, but for other models (such as GLMMs), it may lead to excessive memory usage which in turn may crash the R session (on Unix systems, setting an -appropriate memory limit via \code{\link[unix:rlimit_as]{unix::rlimit_as()}} may avoid crashing the +appropriate memory limit via \code{\link[unix:rlimit]{unix::rlimit_as()}} may avoid crashing the whole machine). However, the problem of excessive memory usage is less pronounced for the CV parallelization than for the projection parallelization described in \link{projpred-package}. In that regard, the CV diff --git a/man/projpred-package.Rd b/man/projpred-package.Rd index dd410f922..f16ff53fa 100644 --- a/man/projpred-package.Rd +++ b/man/projpred-package.Rd @@ -86,7 +86,7 @@ submodel has no multilevel or additive predictor terms), but for all other types of submodels, the fitted submodel objects are quite big, which---when running in parallel---may lead to excessive memory usage which in turn may crash the R session (on Unix systems, setting an appropriate memory limit via -\code{\link[unix:rlimit_as]{unix::rlimit_as()}} may avoid crashing the whole machine). Thus, we currently +\code{\link[unix:rlimit]{unix::rlimit_as()}} may avoid crashing the whole machine). Thus, we currently cannot recommend parallelizing projections onto submodels which are GLMs (in this context, the latent projection onto a submodel without multilevel and without additive terms may be regarded as a projection onto a submodel which From 2dcdc2bfa8b208e07fc5db3dad53cebc52b877a1 Mon Sep 17 00:00:00 2001 From: fweber144 Date: Thu, 26 Oct 2023 12:35:39 +0200 Subject: [PATCH 04/17] minor cleaning --- R/cv_varsel.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/cv_varsel.R b/R/cv_varsel.R index 6af179c5d..d5238d9b3 100644 --- a/R/cv_varsel.R +++ b/R/cv_varsel.R @@ -1367,7 +1367,7 @@ run_cvfun.refmodel <- function(object, return(structure(cvfits, folds = folds)) } -# Exact LFO-CV --------------------------------------------------------------- +# Exact LFO CV ------------------------------------------------------------ lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters, ndraws_pred, nclusters_pred, refit_prj, penalty, From 7050b822d01af02327792f815eb389fe2abcd4e3 Mon Sep 17 00:00:00 2001 From: fweber144 Date: Thu, 26 Oct 2023 12:44:34 +0200 Subject: [PATCH 05/17] update `lfo_varsel()` to the most recent version of `kfold_varsel()` --- R/cv_varsel.R | 59 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/R/cv_varsel.R b/R/cv_varsel.R index a235274ff..3fb8f7c56 100644 --- a/R/cv_varsel.R +++ b/R/cv_varsel.R @@ -1531,13 +1531,20 @@ run_cvfun.refmodel <- function(object, lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters, ndraws_pred, nclusters_pred, refit_prj, penalty, - verbose, opt, L, cvfits, search_terms, parallel, ...) { + verbose, opt, L, cvfits, validate_search, + search_path_fulldata, search_terms, search_out_rks, + parallel, ...) { # Fetch the K reference model fits (or fit them now if not already done) and # create objects of class `refmodel` from them (and also store the `omitted` # indices): list_cv <- get_lfo(refmodel, L = L, cvfits = cvfits, verbose = verbose) K <- length(list_cv) + search_out_rks_was_null <- is.null(search_out_rks) + if (search_out_rks_was_null) { + search_out_rks <- replicate(K, NULL, simplify = FALSE) + } + if (refmodel$family$for_latent) { # Need to set the latent response values in `refmodel$y` to `NA`s because # `refmodel$y` resulted from applying `colMeans(posterior_linpred())` to the @@ -1548,19 +1555,34 @@ lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters, } y_wobs_test <- as.data.frame(refmodel[nms_y_wobs_test()]) - verb_out("-----\nRunning the search and the performance evaluation for ", - "each of the K = ", K, " CV folds separately ...", verbose = verbose) + if (verbose) { + verb_txt_start <- "-----\nRunning " + if (!search_out_rks_was_null || !validate_search) { + verb_txt_mid <- "" + } else { + verb_txt_mid <- "the search and " + } + verb_out(verb_txt_start, verb_txt_mid, "the performance evaluation for ", + "each of the K = ", K, " CV folds separately ...") + } one_fold <- function(fold, + rk, verbose_search = verbose && getOption("projpred.extra_verbose", FALSE), ...) { # Run the search for the current fold: - search_path <- select( - refmodel = fold$refmodel, ndraws = ndraws, nclusters = nclusters, - method = method, nterms_max = nterms_max, penalty = penalty, - verbose = verbose_search, opt = opt, search_terms = search_terms, - est_runtime = FALSE, ... - ) + if (!validate_search) { + search_path <- search_path_fulldata + } else if (!search_out_rks_was_null) { + search_path <- list(solution_terms = rk) + } else { + search_path <- select( + refmodel = fold$refmodel, ndraws = ndraws, nclusters = nclusters, + method = method, nterms_max = nterms_max, penalty = penalty, + verbose = verbose_search, opt = opt, search_terms = search_terms, + est_runtime = FALSE, ... + ) + } # Run the performance evaluation for the submodels along the predictor # ranking: @@ -1603,7 +1625,7 @@ lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters, if (verbose) { on.exit(utils::setTxtProgressBar(pb, k)) } - one_fold(list_cv[[k]], ...) + one_fold(fold = list_cv[[k]], rk = search_out_rks[[k]], ...) }) if (verbose) { close(pb) @@ -1620,10 +1642,12 @@ lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters, `%do_projpred%` <- doRNG::`%dorng%` res_cv <- foreach::foreach( list_cv_k = list_cv, + search_out_rks_k = search_out_rks, .export = c("one_fold", "dot_args"), - .noexport = c("list_cv") + .noexport = c("list_cv", "search_out_rks") ) %do_projpred% { - do.call(one_fold, c(list(fold = list_cv_k, verbose_search = FALSE), + do.call(one_fold, c(list(fold = list_cv_k, rk = search_out_rks_k, + verbose_search = FALSE), dot_args)) } } @@ -1685,8 +1709,15 @@ lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters, ref$oscale$lppd <- ref$oscale$lppd[order(idxs_sorted_by_fold)] } - return(nlist(solution_terms_cv, summaries = nlist(sub, ref), y_wobs_test, - clust_used_eval, nprjdraws_eval)) + if (!validate_search) { + out_list <- list() + } else { + out_list <- nlist(solution_terms_cv) + } + out_list <- c(out_list, + nlist(summaries = nlist(sub, ref), y_wobs_test, clust_used_eval, + nprjdraws_eval)) + return(out_list) } # Re-fit the reference model T-L-1 times (once for each observation from L to From 11ba4c8a191413b329be0acc833e2c2f9c020bad Mon Sep 17 00:00:00 2001 From: fweber144 Date: Thu, 26 Oct 2023 13:00:48 +0200 Subject: [PATCH 06/17] Add alias `forced_search_terms()` for `force_search_terms()`. --- NAMESPACE | 1 + R/search.R | 7 ++++++- man/force_search_terms.Rd | 6 +++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 6c47d2d51..8f6f42d30 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -61,6 +61,7 @@ export(cvfolds) export(do_call) export(extend_family) export(force_search_terms) +export(forced_search_terms) export(get_refmodel) export(init_refmodel) export(lfo_folds) diff --git a/R/search.R b/R/search.R index f0e367b96..5bb02b5b1 100644 --- a/R/search.R +++ b/R/search.R @@ -148,7 +148,8 @@ search_forward <- function(p_ref, refmodel, nterms_max, verbose = TRUE, opt, #' [varsel()] or [cv_varsel()] if certain predictor terms should be forced to be #' selected first whereas other predictor terms are optional (i.e., they are #' subject to the variable selection, but only after the inclusion of the -#' "forced" terms). +#' "forced" terms). Function [forced_search_terms()] is simply an alias for +#' [force_search_terms()]. #' #' @param forced_terms A character vector of predictor terms that should be #' selected first. @@ -195,6 +196,10 @@ force_search_terms <- function(forced_terms, optional_terms) { return(c(forced_terms, paste0(forced_terms, " + ", optional_terms))) } +#' @rdname force_search_terms +#' @export +forced_search_terms <- force_search_terms + search_L1_surrogate <- function(p_ref, d_train, family, intercept, nterms_max, penalty, opt) { diff --git a/man/force_search_terms.Rd b/man/force_search_terms.Rd index d0b4349ce..96df41d6c 100644 --- a/man/force_search_terms.Rd +++ b/man/force_search_terms.Rd @@ -2,9 +2,12 @@ % Please edit documentation in R/search.R \name{force_search_terms} \alias{force_search_terms} +\alias{forced_search_terms} \title{Force search terms} \usage{ force_search_terms(forced_terms, optional_terms) + +forced_search_terms(forced_terms, optional_terms) } \arguments{ \item{forced_terms}{A character vector of predictor terms that should be @@ -23,7 +26,8 @@ A helper function to construct the input for argument \code{search_terms} of \code{\link[=varsel]{varsel()}} or \code{\link[=cv_varsel]{cv_varsel()}} if certain predictor terms should be forced to be selected first whereas other predictor terms are optional (i.e., they are subject to the variable selection, but only after the inclusion of the -"forced" terms). +"forced" terms). Function \code{\link[=forced_search_terms]{forced_search_terms()}} is simply an alias for +\code{\link[=force_search_terms]{force_search_terms()}}. } \examples{ \dontshow{if (requireNamespace("rstanarm", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} From 190d0bcb89247c7eefbe75da0bff136778e56232 Mon Sep 17 00:00:00 2001 From: fweber144 Date: Thu, 26 Oct 2023 13:14:27 +0200 Subject: [PATCH 07/17] Add function `ordered_search_terms()`. --- NAMESPACE | 1 + R/search.R | 50 +++++++++++++++++++++++++++++++++++- man/force_search_terms.Rd | 2 +- man/ordered_search_terms.Rd | 51 +++++++++++++++++++++++++++++++++++++ 4 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 man/ordered_search_terms.Rd diff --git a/NAMESPACE b/NAMESPACE index 8f6f42d30..b26ab940f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -65,6 +65,7 @@ export(forced_search_terms) export(get_refmodel) export(init_refmodel) export(lfo_folds) +export(ordered_search_terms) export(predictor_terms) export(proj_linpred) export(proj_predict) diff --git a/R/search.R b/R/search.R index 5bb02b5b1..374ae7a4a 100644 --- a/R/search.R +++ b/R/search.R @@ -160,7 +160,7 @@ search_forward <- function(p_ref, refmodel, nterms_max, verbose = TRUE, opt, #' @return A character vector that may be used as input for argument #' `search_terms` of [varsel()] or [cv_varsel()]. #' -#' @seealso [varsel()], [cv_varsel()] +#' @seealso [varsel()], [cv_varsel()], [ordered_search_terms()] #' #' @examplesIf requireNamespace("rstanarm", quietly = TRUE) #' # Data: @@ -200,6 +200,54 @@ force_search_terms <- function(forced_terms, optional_terms) { #' @export forced_search_terms <- force_search_terms +#' Ordered search terms +#' +#' A helper function to construct the input for argument `search_terms` of +#' [varsel()] or [cv_varsel()] if the predictor terms should be selected in a +#' given order. +#' +#' @param ordered_terms A character vector of predictor terms that should be +#' selected in their given order. +#' +#' @return A character vector that may be used as input for argument +#' `search_terms` of [varsel()] or [cv_varsel()]. +#' +#' @seealso [varsel()], [cv_varsel()], [forced_search_terms()] +#' +#' @examplesIf requireNamespace("rstanarm", quietly = TRUE) +#' # Data: +#' dat_gauss <- data.frame(y = df_gaussian$y, df_gaussian$x) +#' +#' # The "stanreg" fit which will be used as the reference model (with small +#' # values for `chains` and `iter`, but only for technical reasons in this +#' # example; this is not recommended in general): +#' fit <- rstanarm::stan_glm( +#' y ~ X1 + X2 + X3 + X4 + X5, family = gaussian(), data = dat_gauss, +#' QR = TRUE, chains = 2, iter = 500, refresh = 0, seed = 9876 +#' ) +#' +#' # We will select X3, X1, and X2 in this order: +#' search_terms_ordered <- ordered_search_terms( +#' ordered_terms = c("X3", "X1", "X2") +#' ) +#' +#' # Run varsel() (here without cross-validation and with small values for +#' # `nterms_max`, `nclusters`, and `nclusters_pred`, but only for the sake of +#' # speed in this example; this is not recommended in general): +#' vs <- varsel(fit, nclusters = 5, nclusters_pred = 10, +#' search_terms = search_terms_ordered, seed = 5555) +#' # Now see, for example, `?print.vsel`, `?plot.vsel`, `?suggest_size.vsel`, +#' # and `?ranking` for possible post-processing functions. +#' +#' @export +ordered_search_terms <- function(ordered_terms) { + for (idx in utils::tail(seq_along(ordered_terms), -1)) { + ordered_terms[idx] <- paste0(ordered_terms[idx - 1], " + ", + ordered_terms[idx]) + } + return(ordered_terms) +} + search_L1_surrogate <- function(p_ref, d_train, family, intercept, nterms_max, penalty, opt) { diff --git a/man/force_search_terms.Rd b/man/force_search_terms.Rd index 96df41d6c..4243944d4 100644 --- a/man/force_search_terms.Rd +++ b/man/force_search_terms.Rd @@ -58,5 +58,5 @@ vs <- varsel(fit, nclusters = 5, nclusters_pred = 10, \dontshow{\}) # examplesIf} } \seealso{ -\code{\link[=varsel]{varsel()}}, \code{\link[=cv_varsel]{cv_varsel()}} +\code{\link[=varsel]{varsel()}}, \code{\link[=cv_varsel]{cv_varsel()}}, \code{\link[=ordered_search_terms]{ordered_search_terms()}} } diff --git a/man/ordered_search_terms.Rd b/man/ordered_search_terms.Rd new file mode 100644 index 000000000..59a99bc96 --- /dev/null +++ b/man/ordered_search_terms.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/search.R +\name{ordered_search_terms} +\alias{ordered_search_terms} +\title{Ordered search terms} +\usage{ +ordered_search_terms(ordered_terms) +} +\arguments{ +\item{ordered_terms}{A character vector of predictor terms that should be +selected in their given order.} +} +\value{ +A character vector that may be used as input for argument +\code{search_terms} of \code{\link[=varsel]{varsel()}} or \code{\link[=cv_varsel]{cv_varsel()}}. +} +\description{ +A helper function to construct the input for argument \code{search_terms} of +\code{\link[=varsel]{varsel()}} or \code{\link[=cv_varsel]{cv_varsel()}} if the predictor terms should be selected in a +given order. +} +\examples{ +\dontshow{if (requireNamespace("rstanarm", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# Data: +dat_gauss <- data.frame(y = df_gaussian$y, df_gaussian$x) + +# The "stanreg" fit which will be used as the reference model (with small +# values for `chains` and `iter`, but only for technical reasons in this +# example; this is not recommended in general): +fit <- rstanarm::stan_glm( + y ~ X1 + X2 + X3 + X4 + X5, family = gaussian(), data = dat_gauss, + QR = TRUE, chains = 2, iter = 500, refresh = 0, seed = 9876 +) + +# We will select X3, X1, and X2 in this order: +search_terms_ordered <- ordered_search_terms( + ordered_terms = c("X3", "X1", "X2") +) + +# Run varsel() (here without cross-validation and with small values for +# `nterms_max`, `nclusters`, and `nclusters_pred`, but only for the sake of +# speed in this example; this is not recommended in general): +vs <- varsel(fit, nclusters = 5, nclusters_pred = 10, + search_terms = search_terms_ordered, seed = 5555) +# Now see, for example, `?print.vsel`, `?plot.vsel`, `?suggest_size.vsel`, +# and `?ranking` for possible post-processing functions. +\dontshow{\}) # examplesIf} +} +\seealso{ +\code{\link[=varsel]{varsel()}}, \code{\link[=cv_varsel]{cv_varsel()}}, \code{\link[=forced_search_terms]{forced_search_terms()}} +} From 186047bb17d65caf27411add69b2aee24f08741d Mon Sep 17 00:00:00 2001 From: fweber144 Date: Thu, 26 Oct 2023 13:19:19 +0200 Subject: [PATCH 08/17] Docs: Use a consistent location for arguments of functions documented via `@rdname`. --- R/methods.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/methods.R b/R/methods.R index 655fbb61d..ae834df55 100644 --- a/R/methods.R +++ b/R/methods.R @@ -2240,6 +2240,8 @@ mat2drmat <- function(xmat) { #' [set.seed()], but can also be `NA` to not call [set.seed()] at all. If not #' `NA`, then the PRNG state is reset (to the state before calling #' [cv_folds()] or [cv_ids()]) upon exiting [cv_folds()] or [cv_ids()]. +#' @param T the total number of time observations +#' @param L the number of observations to fit the initial fold with #' #' @return [cv_folds()] returns a vector of length `n` such that each element is #' an integer between 1 and `K` denoting which fold the corresponding data @@ -2334,8 +2336,6 @@ cv_ids <- function(n, K, out = c("foldwise", "indices"), seed = NA) { #' @rdname cv-indices #' @export -#' @param T the total number of time observations -#' @param L the number of observations to fit the initial fold with lfo_folds <- function(T, L, seed = NA) { validate_lfo_folds(T, L) From 62276ef71942a397952698fceb1ac9b81d55d977 Mon Sep 17 00:00:00 2001 From: fweber144 Date: Thu, 26 Oct 2023 13:27:23 +0200 Subject: [PATCH 09/17] In R, using `T` as an object name is bad practice (because `T` is identical to `TRUE`). --- R/methods.R | 12 ++++++------ man/cv-indices.Rd | 7 +++---- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/R/methods.R b/R/methods.R index ae834df55..1e02dd85c 100644 --- a/R/methods.R +++ b/R/methods.R @@ -2231,7 +2231,8 @@ mat2drmat <- function(xmat) { #' #' @name cv-indices #' -#' @param n Number of observations. +#' @param n Number of observations (in case of a time-series model, this is the +#' number of time points). #' @param K Number of folds. Must be at least 2 and not exceed `n`. #' @param out Format of the output, either `"foldwise"` or `"indices"`. See #' below for details. @@ -2240,7 +2241,6 @@ mat2drmat <- function(xmat) { #' [set.seed()], but can also be `NA` to not call [set.seed()] at all. If not #' `NA`, then the PRNG state is reset (to the state before calling #' [cv_folds()] or [cv_ids()]) upon exiting [cv_folds()] or [cv_ids()]. -#' @param T the total number of time observations #' @param L the number of observations to fit the initial fold with #' #' @return [cv_folds()] returns a vector of length `n` such that each element is @@ -2336,8 +2336,8 @@ cv_ids <- function(n, K, out = c("foldwise", "indices"), seed = NA) { #' @rdname cv-indices #' @export -lfo_folds <- function(T, L, seed = NA) { - validate_lfo_folds(T, L) +lfo_folds <- function(n, L, seed = NA) { + validate_lfo_folds(n, L) if (exists(".Random.seed", envir = .GlobalEnv)) { rng_state_old <- get(".Random.seed", envir = .GlobalEnv) @@ -2353,8 +2353,8 @@ lfo_folds <- function(T, L, seed = NA) { ## create fold indices # these indices, unlike for K-fold indicate the observations to use to the # fit the model at the i-th fold - folds <- rep(1, T) - for (t in (L + 1) : (T)) { + folds <- rep(1, n) + for (t in (L + 1) : (n)) { folds[t] <- t - L } return(folds) diff --git a/man/cv-indices.Rd b/man/cv-indices.Rd index 52891c747..865c2312e 100644 --- a/man/cv-indices.Rd +++ b/man/cv-indices.Rd @@ -14,10 +14,11 @@ cvfolds(n, K, seed = NA) cv_ids(n, K, out = c("foldwise", "indices"), seed = NA) -lfo_folds(T, L, seed = NA) +lfo_folds(n, L, seed = NA) } \arguments{ -\item{n}{Number of observations.} +\item{n}{Number of observations (in case of a time-series model, this is the +number of time points).} \item{K}{Number of folds. Must be at least 2 and not exceed \code{n}.} @@ -30,8 +31,6 @@ results can be obtained again if needed. Passed to argument \code{seed} of \item{out}{Format of the output, either \code{"foldwise"} or \code{"indices"}. See below for details.} -\item{T}{the total number of time observations} - \item{L}{the number of observations to fit the initial fold with} } \value{ From e8525906b08e9d5139b144a3d114e455c16f4c4e Mon Sep 17 00:00:00 2001 From: fweber144 Date: Thu, 26 Oct 2023 13:35:05 +0200 Subject: [PATCH 10/17] Docs: Minor enhancements; add `TODO`s. --- R/methods.R | 14 +++++++++++--- man/cv-indices.Rd | 14 +++++++++++--- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/R/methods.R b/R/methods.R index 1e02dd85c..521251e32 100644 --- a/R/methods.R +++ b/R/methods.R @@ -2222,7 +2222,8 @@ mat2drmat <- function(xmat) { #' #' These are helper functions to create cross-validation (CV) folds, i.e., to #' split up the indices from 1 to `n` into `K` subsets ("folds") for -#' \eqn{K}-fold CV. These functions are potentially useful when creating the +#' \eqn{K}-fold CV (via [cv_folds()] or [cv_ids()]) or to **TODO** (via +#' [lfo_folds()]). These functions are potentially useful when creating the #' input for arguments `cvfits` and `cvfun` of [init_refmodel()] (or argument #' `cvfits` of [cv_varsel.refmodel()]). Function [cvfolds()] is deprecated; #' please use [cv_folds()] instead (apart from the name, they are the same). The @@ -2241,11 +2242,14 @@ mat2drmat <- function(xmat) { #' [set.seed()], but can also be `NA` to not call [set.seed()] at all. If not #' `NA`, then the PRNG state is reset (to the state before calling #' [cv_folds()] or [cv_ids()]) upon exiting [cv_folds()] or [cv_ids()]. -#' @param L the number of observations to fit the initial fold with +#' @param L Number of observations (time points) to fit the initial fold with. +#' Must be at least 1 and not exceed `n`. #' #' @return [cv_folds()] returns a vector of length `n` such that each element is #' an integer between 1 and `K` denoting which fold the corresponding data -#' point belongs to. The return value of [cv_ids()] depends on the `out` +#' point belongs to. +#' +#' The return value of [cv_ids()] depends on the `out` #' argument. If `out = "foldwise"`, the return value is a `list` with `K` #' elements, each being a `list` with elements `tr` and `ts` giving the #' training and test indices, respectively, for the corresponding fold. If @@ -2253,6 +2257,8 @@ mat2drmat <- function(xmat) { #' each being a `list` with `K` elements giving the training and test indices, #' respectively, for each fold. #' +#' The return value of [lfo_folds()] is **TODO**. +#' #' @examples #' n <- 100 #' set.seed(1234) @@ -2261,6 +2267,8 @@ mat2drmat <- function(xmat) { #' # Mean within the test set of each fold: #' cvmeans <- sapply(cv, function(fold) mean(y[fold$ts])) #' +#' # TODO: lfo_folds() example +#' NULL #' @rdname cv-indices diff --git a/man/cv-indices.Rd b/man/cv-indices.Rd index 865c2312e..98a40c7b6 100644 --- a/man/cv-indices.Rd +++ b/man/cv-indices.Rd @@ -31,23 +31,29 @@ results can be obtained again if needed. Passed to argument \code{seed} of \item{out}{Format of the output, either \code{"foldwise"} or \code{"indices"}. See below for details.} -\item{L}{the number of observations to fit the initial fold with} +\item{L}{Number of observations (time points) to fit the initial fold with. +Must be at least 1 and not exceed \code{n}.} } \value{ \code{\link[=cv_folds]{cv_folds()}} returns a vector of length \code{n} such that each element is an integer between 1 and \code{K} denoting which fold the corresponding data -point belongs to. The return value of \code{\link[=cv_ids]{cv_ids()}} depends on the \code{out} +point belongs to. + +The return value of \code{\link[=cv_ids]{cv_ids()}} depends on the \code{out} argument. If \code{out = "foldwise"}, the return value is a \code{list} with \code{K} elements, each being a \code{list} with elements \code{tr} and \code{ts} giving the training and test indices, respectively, for the corresponding fold. If \code{out = "indices"}, the return value is a \code{list} with elements \code{tr} and \code{ts} each being a \code{list} with \code{K} elements giving the training and test indices, respectively, for each fold. + +The return value of \code{\link[=lfo_folds]{lfo_folds()}} is \strong{TODO}. } \description{ These are helper functions to create cross-validation (CV) folds, i.e., to split up the indices from 1 to \code{n} into \code{K} subsets ("folds") for -\eqn{K}-fold CV. These functions are potentially useful when creating the +\eqn{K}-fold CV (via \code{\link[=cv_folds]{cv_folds()}} or \code{\link[=cv_ids]{cv_ids()}}) or to \strong{TODO} (via +\code{\link[=lfo_folds]{lfo_folds()}}). These functions are potentially useful when creating the input for arguments \code{cvfits} and \code{cvfun} of \code{\link[=init_refmodel]{init_refmodel()}} (or argument \code{cvfits} of \code{\link[=cv_varsel.refmodel]{cv_varsel.refmodel()}}). Function \code{\link[=cvfolds]{cvfolds()}} is deprecated; please use \code{\link[=cv_folds]{cv_folds()}} instead (apart from the name, they are the same). The @@ -62,4 +68,6 @@ cv <- cv_ids(n, K = 5) # Mean within the test set of each fold: cvmeans <- sapply(cv, function(fold) mean(y[fold$ts])) +# TODO: lfo_folds() example + } From bdb6475a898878e76345af04c0aca67fbe3d1af7 Mon Sep 17 00:00:00 2001 From: fweber144 Date: Thu, 26 Oct 2023 13:35:40 +0200 Subject: [PATCH 11/17] fixup! In R, using `T` as an object name is bad practice (because `T` is identical to `TRUE`). --- R/misc.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/misc.R b/R/misc.R index 8795a565f..d1b63ad09 100644 --- a/R/misc.R +++ b/R/misc.R @@ -130,14 +130,14 @@ validate_num_folds <- function(k, n) { } } -validate_lfo_folds <- function(T, L) { +validate_lfo_folds <- function(n, L) { if (!is.numeric(L) || length(L) != 1 || !is_wholenumber(L)) { stop("Number of folds must be a single integer value.") } if (L < 1) { stop("Number of initial observations must be at least 1.") } - if (L > T) { + if (L > n) { stop("Number of initial observations cannot exceed total observations.") } } From 20a06a05d51c3180cdbc3cf709eab0908fa33439 Mon Sep 17 00:00:00 2001 From: fweber144 Date: Thu, 26 Oct 2023 13:36:52 +0200 Subject: [PATCH 12/17] Use a consistent naming scheme for the `validate_num_folds[_lfo]()` functions. --- R/methods.R | 2 +- R/misc.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/methods.R b/R/methods.R index 521251e32..74b4f3999 100644 --- a/R/methods.R +++ b/R/methods.R @@ -2345,7 +2345,7 @@ cv_ids <- function(n, K, out = c("foldwise", "indices"), seed = NA) { #' @rdname cv-indices #' @export lfo_folds <- function(n, L, seed = NA) { - validate_lfo_folds(n, L) + validate_num_folds_lfo(n, L) if (exists(".Random.seed", envir = .GlobalEnv)) { rng_state_old <- get(".Random.seed", envir = .GlobalEnv) diff --git a/R/misc.R b/R/misc.R index d1b63ad09..142963191 100644 --- a/R/misc.R +++ b/R/misc.R @@ -130,7 +130,7 @@ validate_num_folds <- function(k, n) { } } -validate_lfo_folds <- function(n, L) { +validate_num_folds_lfo <- function(n, L) { if (!is.numeric(L) || length(L) != 1 || !is_wholenumber(L)) { stop("Number of folds must be a single integer value.") } From 94ab84855817bb57b85ad425d1ab7fe6688ffb46 Mon Sep 17 00:00:00 2001 From: fweber144 Date: Thu, 26 Oct 2023 13:37:48 +0200 Subject: [PATCH 13/17] Use a consistent argument order for the `validate_num_folds[_lfo]()` functions. --- R/methods.R | 2 +- R/misc.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/methods.R b/R/methods.R index 74b4f3999..91ef99b39 100644 --- a/R/methods.R +++ b/R/methods.R @@ -2345,7 +2345,7 @@ cv_ids <- function(n, K, out = c("foldwise", "indices"), seed = NA) { #' @rdname cv-indices #' @export lfo_folds <- function(n, L, seed = NA) { - validate_num_folds_lfo(n, L) + validate_num_folds_lfo(L = L, n = n) if (exists(".Random.seed", envir = .GlobalEnv)) { rng_state_old <- get(".Random.seed", envir = .GlobalEnv) diff --git a/R/misc.R b/R/misc.R index 142963191..329bf2b13 100644 --- a/R/misc.R +++ b/R/misc.R @@ -130,7 +130,7 @@ validate_num_folds <- function(k, n) { } } -validate_num_folds_lfo <- function(n, L) { +validate_num_folds_lfo <- function(L, n) { if (!is.numeric(L) || length(L) != 1 || !is_wholenumber(L)) { stop("Number of folds must be a single integer value.") } From 5c44bd9e1dd9d8de5699fce155f71a465d560ae1 Mon Sep 17 00:00:00 2001 From: fweber144 Date: Thu, 26 Oct 2023 13:41:10 +0200 Subject: [PATCH 14/17] `lfo_folds()` doesn't involve PRNG, so seed-related code is removed here. --- R/methods.R | 13 +------------ man/cv-indices.Rd | 2 +- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/R/methods.R b/R/methods.R index 91ef99b39..3727ef692 100644 --- a/R/methods.R +++ b/R/methods.R @@ -2344,20 +2344,9 @@ cv_ids <- function(n, K, out = c("foldwise", "indices"), seed = NA) { #' @rdname cv-indices #' @export -lfo_folds <- function(n, L, seed = NA) { +lfo_folds <- function(n, L) { validate_num_folds_lfo(L = L, n = n) - if (exists(".Random.seed", envir = .GlobalEnv)) { - rng_state_old <- get(".Random.seed", envir = .GlobalEnv) - } - if (!is.na(seed)) { - # Set seed, but ensure the old RNG state is restored on exit: - if (exists(".Random.seed", envir = .GlobalEnv)) { - on.exit(assign(".Random.seed", rng_state_old, envir = .GlobalEnv)) - } - set.seed(seed) - } - ## create fold indices # these indices, unlike for K-fold indicate the observations to use to the # fit the model at the i-th fold diff --git a/man/cv-indices.Rd b/man/cv-indices.Rd index 98a40c7b6..11f4504f7 100644 --- a/man/cv-indices.Rd +++ b/man/cv-indices.Rd @@ -14,7 +14,7 @@ cvfolds(n, K, seed = NA) cv_ids(n, K, out = c("foldwise", "indices"), seed = NA) -lfo_folds(n, L, seed = NA) +lfo_folds(n, L) } \arguments{ \item{n}{Number of observations (in case of a time-series model, this is the From 8900dd05260acf16b0a8f24452521315ce771da5 Mon Sep 17 00:00:00 2001 From: fweber144 Date: Thu, 26 Oct 2023 13:47:18 +0200 Subject: [PATCH 15/17] `lfo_folds()`: Fix a bug. --- R/methods.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/methods.R b/R/methods.R index 3727ef692..ceaae0799 100644 --- a/R/methods.R +++ b/R/methods.R @@ -2352,7 +2352,7 @@ lfo_folds <- function(n, L) { # fit the model at the i-th fold folds <- rep(1, n) for (t in (L + 1) : (n)) { - folds[t] <- t - L + folds[t] <- t - L + 1 } return(folds) } From 5c19f7d360bf0d02f968d97ed19125d44947bc1a Mon Sep 17 00:00:00 2001 From: fweber144 Date: Thu, 26 Oct 2023 13:51:54 +0200 Subject: [PATCH 16/17] Fix typos in internal documentation. --- R/cv_varsel.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/cv_varsel.R b/R/cv_varsel.R index 3fb8f7c56..6ea6c20f8 100644 --- a/R/cv_varsel.R +++ b/R/cv_varsel.R @@ -1720,9 +1720,9 @@ lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters, return(out_list) } -# Re-fit the reference model T-L-1 times (once for each observation from L to +# Re-fit the reference model T-L times (once for each observation from L to # the second-to-last observation; `cvfun` case) or fetch the fits if already -# computed (`cvfits` case). This function will return a list of length T-L-1, +# computed (`cvfits` case). This function will return a list of length T-L, # where each element is a list with elements `refmodel` (output of # init_refmodel()) and `omitted` (vector of indices of those observations which # were left out for the corresponding fold). From c8efad530af7030f3dd73892653c5a941c41f84e Mon Sep 17 00:00:00 2001 From: fweber144 Date: Thu, 30 Nov 2023 15:51:49 +0100 Subject: [PATCH 17/17] Update the LFO functions to their most recent K-fold analogs. --- R/cv_varsel.R | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/R/cv_varsel.R b/R/cv_varsel.R index ec2f819a3..5832c3bc2 100644 --- a/R/cv_varsel.R +++ b/R/cv_varsel.R @@ -1587,7 +1587,7 @@ run_cvfun.refmodel <- function(object, lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters, ndraws_pred, nclusters_pred, refit_prj, penalty, - verbose, opt, L, cvfits, validate_search, + verbose, search_control, L, cvfits, validate_search, search_path_fulldata, search_terms, search_out_rks, parallel, ...) { # Fetch the K reference model fits (or fit them now if not already done) and @@ -1618,8 +1618,9 @@ lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters, } else { verb_txt_mid <- "the search and " } - verb_out(verb_txt_start, verb_txt_mid, "the performance evaluation for ", - "each of the K = ", K, " CV folds separately ...") + verb_out(verb_txt_start, verb_txt_mid, "the performance evaluation with ", + "`refit_prj = ", refit_prj, "` for each of the K = ", K, " CV ", + "folds separately ...") } one_fold <- function(fold, rk, @@ -1630,20 +1631,20 @@ lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters, if (!validate_search) { search_path <- search_path_fulldata } else if (!search_out_rks_was_null) { - search_path <- list(solution_terms = rk) + search_path <- list(predictor_ranking = rk) } else { search_path <- select( refmodel = fold$refmodel, ndraws = ndraws, nclusters = nclusters, method = method, nterms_max = nterms_max, penalty = penalty, - verbose = verbose_search, opt = opt, search_terms = search_terms, - est_runtime = FALSE, ... + verbose = verbose_search, search_control = search_control, + search_terms = search_terms, est_runtime = FALSE, ... ) } # Run the performance evaluation for the submodels along the predictor # ranking: perf_eval_out <- perf_eval( - search_path = search_path, refmodel = fold$refmodel, regul = opt$regul, + search_path = search_path, refmodel = fold$refmodel, refit_prj = refit_prj, ndraws = ndraws_pred, nclusters = nclusters_pred, refmodel_fulldata = refmodel, indices_test = fold$omitted, ... ) @@ -1664,7 +1665,7 @@ lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters, cl_ref = seq_along(fold$refmodel$wdraws_ref) ) - return(nlist(predictor_ranking = search_path[["solution_terms"]], + return(nlist(predictor_ranking = search_path[["predictor_ranking"]], summaries_sub = perf_eval_out[["sub_summaries"]], summaries_ref, clust_used_eval = perf_eval_out[["clust_used"]], nprjdraws_eval = perf_eval_out[["nprjdraws"]])) @@ -1702,13 +1703,14 @@ lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters, .export = c("one_fold", "dot_args"), .noexport = c("list_cv", "search_out_rks") ) %do_projpred% { - do.call(one_fold, c(list(fold = list_cv_k, rk = search_out_rks_k, + do_call(one_fold, c(list(fold = list_cv_k, rk = search_out_rks_k, verbose_search = FALSE), dot_args)) } } verb_out("-----", verbose = verbose) - solution_terms_cv <- do.call(rbind, lapply(res_cv, "[[", "predictor_ranking")) + predictor_ranking_cv <- do.call(rbind, + lapply(res_cv, "[[", "predictor_ranking")) clust_used_eval <- element_unq(res_cv, nm = "clust_used_eval") nprjdraws_eval <- element_unq(res_cv, nm = "nprjdraws_eval") @@ -1719,7 +1721,7 @@ lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters, } else { sub_foldwise <- simplify2array(sub_foldwise, higher = FALSE) if (is.null(dim(sub_foldwise))) { - sub_dim <- dim(solution_terms_cv) + sub_dim <- dim(predictor_ranking_cv) sub_dim[2] <- sub_dim[2] + 1L # +1 is for the empty model dim(sub_foldwise) <- rev(sub_dim) } @@ -1768,7 +1770,7 @@ lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters, if (!validate_search) { out_list <- list() } else { - out_list <- nlist(solution_terms_cv) + out_list <- nlist(predictor_ranking_cv) } out_list <- c(out_list, nlist(summaries = nlist(sub, ref), y_wobs_test, clust_used_eval, @@ -1776,8 +1778,8 @@ lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters, return(out_list) } -# Re-fit the reference model T-L times (once for each observation from L to -# the second-to-last observation; `cvfun` case) or fetch the fits if already +# Refit the reference model T-L times (once for each observation from L to the +# second-to-last observation; `cvfun` case) or fetch the fits if already # computed (`cvfits` case). This function will return a list of length T-L, # where each element is a list with elements `refmodel` (output of # init_refmodel()) and `omitted` (vector of indices of those observations which @@ -1806,6 +1808,7 @@ get_lfo <- function(refmodel, L, cvfits, verbose) { } else { folds <- attr(cvfits, "folds") } + stopifnot(!is.null(folds)) return(lapply(seq_len(K), function(k) { cvfit <- cvfits[[k]] # Add the omitted observation indices for this fold (and the fold index `k`