Title: | Split Robust Least Angle Regression |
---|---|
Description: | Functions to perform split robust least angle regression. The approach first uses the least angle regression algorithm to split the variables into the models of an ensemble and robust estimates of the correlation between predictors. An elastic net estimator is then applied to the selected predictors in each model using the imputed data from the detect deviating cell (DDC) method. |
Authors: | Anthony Christidis [aut, cre], Gabriela Cohen-Freue [aut] |
Maintainer: | Anthony Christidis <[email protected]> |
License: | GPL (>= 2) |
Version: | 1.0.1 |
Built: | 2024-11-03 06:23:06 UTC |
Source: | https://github.com/anthonychristidis/srlars |
coef.srlars
returns the coefficients for a srlars object.
## S3 method for class 'srlars' coef(object, group_index = NULL, ...)
## S3 method for class 'srlars' coef(object, group_index = NULL, ...)
object |
An object of class srlars |
group_index |
Groups included in the ensemble. Default setting includes all the groups. |
... |
Additional arguments for compatibility. |
The coefficients for the srlars object.
Anthony-Alexander Christidis, [email protected]
# Required library library(mvnfast) # Simulation parameters n <- 50 p <- 500 rho.within <- 0.8 rho.between <- 0.2 p.active <- 100 group.size <- 25 snr <- 3 contamination.prop <- 0.2 # Setting the seed set.seed(0) # Block correlation structure sigma.mat <- matrix(0, p, p) sigma.mat[1:p.active, 1:p.active] <- rho.between for(group in 0:(p.active/group.size - 1)) sigma.mat[(group*group.size+1):(group*group.size+group.size), (group*group.size+1):(group*group.size+group.size)] <- rho.within diag(sigma.mat) <- 1 # Simulation of beta vector true.beta <- c(runif(p.active, 0, 5)*(-1)^rbinom(p.active, 1, 0.7), rep(0, p - p.active)) # Setting the SD of the variance sigma <- as.numeric(sqrt(t(true.beta) %*% sigma.mat %*% true.beta)/sqrt(snr)) # Simulation of uncontaminated data x <- mvnfast::rmvn(n, mu = rep(0, p), sigma = sigma.mat) y <- x %*% true.beta + rnorm(n, 0, sigma) # Contamination of data contamination_indices <- 1:floor(n*contamination.prop) k_lev <- 2 k_slo <- 100 x_train <- x y_train <- y beta_cont <- true.beta beta_cont[true.beta!=0] <- beta_cont[true.beta!=0]*(1 + k_slo) beta_cont[true.beta==0] <- k_slo*max(abs(true.beta)) for(cont_id in contamination_indices){ a <- runif(p, min = -1, max = 1) a <- a - as.numeric((1/p)*t(a) %*% rep(1, p)) x_train[cont_id,] <- mvnfast::rmvn(1, rep(0, p), 0.1^2*diag(p)) + k_lev * a / as.numeric(sqrt(t(a) %*% solve(sigma.mat) %*% a)) y_train[cont_id] <- t(x_train[cont_id,]) %*% beta_cont } # Ensemble models ensemble_fit <- srlars(x_train, y_train, n_models = 5, model_saturation = c("fixed", "p-value")[1], alpha = 0.05, model_size = n - 1, robust = TRUE, compute_coef = TRUE, en_alpha = 1/4) # Ensemble coefficients ensemble_coefs <- coef(ensemble_fit, group_index = 1:ensemble_fit$n_models) sens_ensemble <- sum(which((ensemble_coefs[-1]!=0)) <= p.active)/p.active spec_ensemble <- sum(which((ensemble_coefs[-1]!=0)) <= p.active)/sum(ensemble_coefs[-1]!=0) # Simulation of test data m <- 2e3 x_test <- mvnfast::rmvn(m, mu = rep(0, p), sigma = sigma.mat) y_test <- x_test %*% true.beta + rnorm(m, 0, sigma) # Prediction of test samples ensemble_preds <- predict(ensemble_fit, newx = x_test, group_index = 1:ensemble_fit$n_models, dynamic = FALSE) mspe_ensemble <- mean((y_test - ensemble_preds)^2)/sigma^2
# Required library library(mvnfast) # Simulation parameters n <- 50 p <- 500 rho.within <- 0.8 rho.between <- 0.2 p.active <- 100 group.size <- 25 snr <- 3 contamination.prop <- 0.2 # Setting the seed set.seed(0) # Block correlation structure sigma.mat <- matrix(0, p, p) sigma.mat[1:p.active, 1:p.active] <- rho.between for(group in 0:(p.active/group.size - 1)) sigma.mat[(group*group.size+1):(group*group.size+group.size), (group*group.size+1):(group*group.size+group.size)] <- rho.within diag(sigma.mat) <- 1 # Simulation of beta vector true.beta <- c(runif(p.active, 0, 5)*(-1)^rbinom(p.active, 1, 0.7), rep(0, p - p.active)) # Setting the SD of the variance sigma <- as.numeric(sqrt(t(true.beta) %*% sigma.mat %*% true.beta)/sqrt(snr)) # Simulation of uncontaminated data x <- mvnfast::rmvn(n, mu = rep(0, p), sigma = sigma.mat) y <- x %*% true.beta + rnorm(n, 0, sigma) # Contamination of data contamination_indices <- 1:floor(n*contamination.prop) k_lev <- 2 k_slo <- 100 x_train <- x y_train <- y beta_cont <- true.beta beta_cont[true.beta!=0] <- beta_cont[true.beta!=0]*(1 + k_slo) beta_cont[true.beta==0] <- k_slo*max(abs(true.beta)) for(cont_id in contamination_indices){ a <- runif(p, min = -1, max = 1) a <- a - as.numeric((1/p)*t(a) %*% rep(1, p)) x_train[cont_id,] <- mvnfast::rmvn(1, rep(0, p), 0.1^2*diag(p)) + k_lev * a / as.numeric(sqrt(t(a) %*% solve(sigma.mat) %*% a)) y_train[cont_id] <- t(x_train[cont_id,]) %*% beta_cont } # Ensemble models ensemble_fit <- srlars(x_train, y_train, n_models = 5, model_saturation = c("fixed", "p-value")[1], alpha = 0.05, model_size = n - 1, robust = TRUE, compute_coef = TRUE, en_alpha = 1/4) # Ensemble coefficients ensemble_coefs <- coef(ensemble_fit, group_index = 1:ensemble_fit$n_models) sens_ensemble <- sum(which((ensemble_coefs[-1]!=0)) <= p.active)/p.active spec_ensemble <- sum(which((ensemble_coefs[-1]!=0)) <= p.active)/sum(ensemble_coefs[-1]!=0) # Simulation of test data m <- 2e3 x_test <- mvnfast::rmvn(m, mu = rep(0, p), sigma = sigma.mat) y_test <- x_test %*% true.beta + rnorm(m, 0, sigma) # Prediction of test samples ensemble_preds <- predict(ensemble_fit, newx = x_test, group_index = 1:ensemble_fit$n_models, dynamic = FALSE) mspe_ensemble <- mean((y_test - ensemble_preds)^2)/sigma^2
predict.srlars
returns the predictions for a srlars object.
## S3 method for class 'srlars' predict(object, newx, group_index = NULL, dynamic = FALSE, ...)
## S3 method for class 'srlars' predict(object, newx, group_index = NULL, dynamic = FALSE, ...)
object |
An object of class srlars |
newx |
New data for predictions. |
group_index |
Groups included in the ensemble. Default setting includes all the groups. |
dynamic |
Argument to determine whether dynamic predictions are used based on deviating cells. Default is FALSE. |
... |
Additional arguments for compatibility. |
The predictions for the srlars object.
Anthony-Alexander Christidis, [email protected]
# Required library library(mvnfast) # Simulation parameters n <- 50 p <- 500 rho.within <- 0.8 rho.between <- 0.2 p.active <- 100 group.size <- 25 snr <- 3 contamination.prop <- 0.2 # Setting the seed set.seed(0) # Block correlation structure sigma.mat <- matrix(0, p, p) sigma.mat[1:p.active, 1:p.active] <- rho.between for(group in 0:(p.active/group.size - 1)) sigma.mat[(group*group.size+1):(group*group.size+group.size), (group*group.size+1):(group*group.size+group.size)] <- rho.within diag(sigma.mat) <- 1 # Simulation of beta vector true.beta <- c(runif(p.active, 0, 5)*(-1)^rbinom(p.active, 1, 0.7), rep(0, p - p.active)) # Setting the SD of the variance sigma <- as.numeric(sqrt(t(true.beta) %*% sigma.mat %*% true.beta)/sqrt(snr)) # Simulation of uncontaminated data x <- mvnfast::rmvn(n, mu = rep(0, p), sigma = sigma.mat) y <- x %*% true.beta + rnorm(n, 0, sigma) # Contamination of data contamination_indices <- 1:floor(n*contamination.prop) k_lev <- 2 k_slo <- 100 x_train <- x y_train <- y beta_cont <- true.beta beta_cont[true.beta!=0] <- beta_cont[true.beta!=0]*(1 + k_slo) beta_cont[true.beta==0] <- k_slo*max(abs(true.beta)) for(cont_id in contamination_indices){ a <- runif(p, min = -1, max = 1) a <- a - as.numeric((1/p)*t(a) %*% rep(1, p)) x_train[cont_id,] <- mvnfast::rmvn(1, rep(0, p), 0.1^2*diag(p)) + k_lev * a / as.numeric(sqrt(t(a) %*% solve(sigma.mat) %*% a)) y_train[cont_id] <- t(x_train[cont_id,]) %*% beta_cont } # Ensemble models ensemble_fit <- srlars(x_train, y_train, n_models = 5, model_saturation = c("fixed", "p-value")[1], alpha = 0.05, model_size = n - 1, robust = TRUE, compute_coef = TRUE, en_alpha = 1/4) # Ensemble coefficients ensemble_coefs <- coef(ensemble_fit, group_index = 1:ensemble_fit$n_models) sens_ensemble <- sum(which((ensemble_coefs[-1]!=0)) <= p.active)/p.active spec_ensemble <- sum(which((ensemble_coefs[-1]!=0)) <= p.active)/sum(ensemble_coefs[-1]!=0) # Simulation of test data m <- 2e3 x_test <- mvnfast::rmvn(m, mu = rep(0, p), sigma = sigma.mat) y_test <- x_test %*% true.beta + rnorm(m, 0, sigma) # Prediction of test samples ensemble_preds <- predict(ensemble_fit, newx = x_test, group_index = 1:ensemble_fit$n_models, dynamic = FALSE) mspe_ensemble <- mean((y_test - ensemble_preds)^2)/sigma^2
# Required library library(mvnfast) # Simulation parameters n <- 50 p <- 500 rho.within <- 0.8 rho.between <- 0.2 p.active <- 100 group.size <- 25 snr <- 3 contamination.prop <- 0.2 # Setting the seed set.seed(0) # Block correlation structure sigma.mat <- matrix(0, p, p) sigma.mat[1:p.active, 1:p.active] <- rho.between for(group in 0:(p.active/group.size - 1)) sigma.mat[(group*group.size+1):(group*group.size+group.size), (group*group.size+1):(group*group.size+group.size)] <- rho.within diag(sigma.mat) <- 1 # Simulation of beta vector true.beta <- c(runif(p.active, 0, 5)*(-1)^rbinom(p.active, 1, 0.7), rep(0, p - p.active)) # Setting the SD of the variance sigma <- as.numeric(sqrt(t(true.beta) %*% sigma.mat %*% true.beta)/sqrt(snr)) # Simulation of uncontaminated data x <- mvnfast::rmvn(n, mu = rep(0, p), sigma = sigma.mat) y <- x %*% true.beta + rnorm(n, 0, sigma) # Contamination of data contamination_indices <- 1:floor(n*contamination.prop) k_lev <- 2 k_slo <- 100 x_train <- x y_train <- y beta_cont <- true.beta beta_cont[true.beta!=0] <- beta_cont[true.beta!=0]*(1 + k_slo) beta_cont[true.beta==0] <- k_slo*max(abs(true.beta)) for(cont_id in contamination_indices){ a <- runif(p, min = -1, max = 1) a <- a - as.numeric((1/p)*t(a) %*% rep(1, p)) x_train[cont_id,] <- mvnfast::rmvn(1, rep(0, p), 0.1^2*diag(p)) + k_lev * a / as.numeric(sqrt(t(a) %*% solve(sigma.mat) %*% a)) y_train[cont_id] <- t(x_train[cont_id,]) %*% beta_cont } # Ensemble models ensemble_fit <- srlars(x_train, y_train, n_models = 5, model_saturation = c("fixed", "p-value")[1], alpha = 0.05, model_size = n - 1, robust = TRUE, compute_coef = TRUE, en_alpha = 1/4) # Ensemble coefficients ensemble_coefs <- coef(ensemble_fit, group_index = 1:ensemble_fit$n_models) sens_ensemble <- sum(which((ensemble_coefs[-1]!=0)) <= p.active)/p.active spec_ensemble <- sum(which((ensemble_coefs[-1]!=0)) <= p.active)/sum(ensemble_coefs[-1]!=0) # Simulation of test data m <- 2e3 x_test <- mvnfast::rmvn(m, mu = rep(0, p), sigma = sigma.mat) y_test <- x_test %*% true.beta + rnorm(m, 0, sigma) # Prediction of test samples ensemble_preds <- predict(ensemble_fit, newx = x_test, group_index = 1:ensemble_fit$n_models, dynamic = FALSE) mspe_ensemble <- mean((y_test - ensemble_preds)^2)/sigma^2
srlars
performs split robust least angle regression.
srlars( x, y, n_models = 1, model_saturation = c("fixed", "p-value")[1], alpha = 0.05, model_size = NULL, robust = TRUE, compute_coef = FALSE, en_alpha = 1/4 )
srlars( x, y, n_models = 1, model_saturation = c("fixed", "p-value")[1], alpha = 0.05, model_size = NULL, robust = TRUE, compute_coef = FALSE, en_alpha = 1/4 )
x |
Design matrix. |
y |
Response vector. |
n_models |
Number of models into which the variables are split. |
model_saturation |
Criterion to determine if a model is saturated. Must be one of "fixed" (default) or "p-value". |
alpha |
P-value used to determine when the model is saturated |
model_size |
Size of the models in the ensemble. |
robust |
Argument to determine if robust measures of location, scale and correlation are used. Default is TRUE. |
compute_coef |
Argument to determine if coefficients are computed (via the elastic net) for each model. Default is FALSE. |
en_alpha |
Elastic net mixing parmeter for parameters shrinkage. Default is 1/4. |
An object of class srlars
Anthony-Alexander Christidis, [email protected]
# Required library library(mvnfast) # Simulation parameters n <- 50 p <- 500 rho.within <- 0.8 rho.between <- 0.2 p.active <- 100 group.size <- 25 snr <- 3 contamination.prop <- 0.2 # Setting the seed set.seed(0) # Block correlation structure sigma.mat <- matrix(0, p, p) sigma.mat[1:p.active, 1:p.active] <- rho.between for(group in 0:(p.active/group.size - 1)) sigma.mat[(group*group.size+1):(group*group.size+group.size), (group*group.size+1):(group*group.size+group.size)] <- rho.within diag(sigma.mat) <- 1 # Simulation of beta vector true.beta <- c(runif(p.active, 0, 5)*(-1)^rbinom(p.active, 1, 0.7), rep(0, p - p.active)) # Setting the SD of the variance sigma <- as.numeric(sqrt(t(true.beta) %*% sigma.mat %*% true.beta)/sqrt(snr)) # Simulation of uncontaminated data x <- mvnfast::rmvn(n, mu = rep(0, p), sigma = sigma.mat) y <- x %*% true.beta + rnorm(n, 0, sigma) # Contamination of data contamination_indices <- 1:floor(n*contamination.prop) k_lev <- 2 k_slo <- 100 x_train <- x y_train <- y beta_cont <- true.beta beta_cont[true.beta!=0] <- beta_cont[true.beta!=0]*(1 + k_slo) beta_cont[true.beta==0] <- k_slo*max(abs(true.beta)) for(cont_id in contamination_indices){ a <- runif(p, min = -1, max = 1) a <- a - as.numeric((1/p)*t(a) %*% rep(1, p)) x_train[cont_id,] <- mvnfast::rmvn(1, rep(0, p), 0.1^2*diag(p)) + k_lev * a / as.numeric(sqrt(t(a) %*% solve(sigma.mat) %*% a)) y_train[cont_id] <- t(x_train[cont_id,]) %*% beta_cont } # Ensemble models ensemble_fit <- srlars(x_train, y_train, n_models = 5, model_saturation = c("fixed", "p-value")[1], alpha = 0.05, model_size = n - 1, robust = TRUE, compute_coef = TRUE, en_alpha = 1/4) # Ensemble coefficients ensemble_coefs <- coef(ensemble_fit, group_index = 1:ensemble_fit$n_models) sens_ensemble <- sum(which((ensemble_coefs[-1]!=0)) <= p.active)/p.active spec_ensemble <- sum(which((ensemble_coefs[-1]!=0)) <= p.active)/sum(ensemble_coefs[-1]!=0) # Simulation of test data m <- 2e3 x_test <- mvnfast::rmvn(m, mu = rep(0, p), sigma = sigma.mat) y_test <- x_test %*% true.beta + rnorm(m, 0, sigma) # Prediction of test samples ensemble_preds <- predict(ensemble_fit, newx = x_test, group_index = 1:ensemble_fit$n_models, dynamic = FALSE) mspe_ensemble <- mean((y_test - ensemble_preds)^2)/sigma^2
# Required library library(mvnfast) # Simulation parameters n <- 50 p <- 500 rho.within <- 0.8 rho.between <- 0.2 p.active <- 100 group.size <- 25 snr <- 3 contamination.prop <- 0.2 # Setting the seed set.seed(0) # Block correlation structure sigma.mat <- matrix(0, p, p) sigma.mat[1:p.active, 1:p.active] <- rho.between for(group in 0:(p.active/group.size - 1)) sigma.mat[(group*group.size+1):(group*group.size+group.size), (group*group.size+1):(group*group.size+group.size)] <- rho.within diag(sigma.mat) <- 1 # Simulation of beta vector true.beta <- c(runif(p.active, 0, 5)*(-1)^rbinom(p.active, 1, 0.7), rep(0, p - p.active)) # Setting the SD of the variance sigma <- as.numeric(sqrt(t(true.beta) %*% sigma.mat %*% true.beta)/sqrt(snr)) # Simulation of uncontaminated data x <- mvnfast::rmvn(n, mu = rep(0, p), sigma = sigma.mat) y <- x %*% true.beta + rnorm(n, 0, sigma) # Contamination of data contamination_indices <- 1:floor(n*contamination.prop) k_lev <- 2 k_slo <- 100 x_train <- x y_train <- y beta_cont <- true.beta beta_cont[true.beta!=0] <- beta_cont[true.beta!=0]*(1 + k_slo) beta_cont[true.beta==0] <- k_slo*max(abs(true.beta)) for(cont_id in contamination_indices){ a <- runif(p, min = -1, max = 1) a <- a - as.numeric((1/p)*t(a) %*% rep(1, p)) x_train[cont_id,] <- mvnfast::rmvn(1, rep(0, p), 0.1^2*diag(p)) + k_lev * a / as.numeric(sqrt(t(a) %*% solve(sigma.mat) %*% a)) y_train[cont_id] <- t(x_train[cont_id,]) %*% beta_cont } # Ensemble models ensemble_fit <- srlars(x_train, y_train, n_models = 5, model_saturation = c("fixed", "p-value")[1], alpha = 0.05, model_size = n - 1, robust = TRUE, compute_coef = TRUE, en_alpha = 1/4) # Ensemble coefficients ensemble_coefs <- coef(ensemble_fit, group_index = 1:ensemble_fit$n_models) sens_ensemble <- sum(which((ensemble_coefs[-1]!=0)) <= p.active)/p.active spec_ensemble <- sum(which((ensemble_coefs[-1]!=0)) <= p.active)/sum(ensemble_coefs[-1]!=0) # Simulation of test data m <- 2e3 x_test <- mvnfast::rmvn(m, mu = rep(0, p), sigma = sigma.mat) y_test <- x_test %*% true.beta + rnorm(m, 0, sigma) # Prediction of test samples ensemble_preds <- predict(ensemble_fit, newx = x_test, group_index = 1:ensemble_fit$n_models, dynamic = FALSE) mspe_ensemble <- mean((y_test - ensemble_preds)^2)/sigma^2