# R/cv.plasso.R


#' Cross-Validated Lasso and Post-Lasso
#' 
#' @description
#' \code{\link{cv.plasso}} uses the \code{\link[glmnet]{glmnet}} package to estimate the coefficient paths and cross-validates least squares Lasso AND Post-Lasso.
#'
#' @param x Matrix of covariates (number of observations times number of covariates matrix)
#' @param y Vector of outcomes
#' @param w Vector of weights
#' @param kf Number of folds in k-fold cross-validation
#' @param parallel Set as TRUE for parallelized cross-validation. Default is FALSE.
#' @param ... Pass \code{\link[glmnet]{glmnet}} options
#' 
#' @import glmnet
#' @import Matrix
#' @import parallel
#' @import doParallel
#' @import foreach
#' @import iterators
#' @importFrom stats coef predict var
#'
#' @return cv.plasso object (using a list structure) including the base \code{\link[glmnet]{glmnet}} object and cross-validation results (incl. optimal lambda values) for both the Lasso and Post-Lasso models.
#' \item{call}{the call that produced this object}
#' \item{lasso_full}{base \code{\link[glmnet]{glmnet}} object}
#' \item{kf}{number of folds in k-fold cross-validation}
#' \item{cv_MSE_lasso}{cross-validated MSEs of Lasso model (for every iteration of k-fold cross-validation)}
#' \item{cv_MSE_plasso}{cross-validated MSEs of Post-Lasso model (for every iteration of k-fold cross-validation)}
#' \item{mean_MSE_lasso}{averaged cross-validated MSEs of Lasso model}
#' \item{mean_MSE_plasso}{averaged cross-validated MSEs of Post-Lasso model}
#' \item{ind_min_l}{index of MSE optimal lambda value for Lasso model}
#' \item{ind_min_pl}{index of MSE optimal lambda value for Post-Lasso model}
#' \item{lambda_min_l}{MSE optimal lambda value for Lasso model}
#' \item{lambda_min_pl}{MSE optimal lambda value for Post-Lasso model}
#' \item{names_l}{Names of active variables for MSE optimal Lasso model}
#' \item{names_pl}{Names of active variables for MSE optimal Post-Lasso model}
#' \item{coef_min_l}{Coefficients for MSE optimal Lasso model}
#' \item{coef_min_pl}{Coefficients for MSE optimal Post-Lasso model}
#' \item{x}{Input matrix of covariates}
#' \item{y}{Matrix of outcomes}
#' \item{w}{Matrix of weights}
#'
#' @export
#' 
#' @examples
#' # load toeplitz data
#' data(toeplitz)
#' # extract target and features from data
#' y = as.matrix(toeplitz[,1])
#' X = toeplitz[,-1]
#' # fit cv.plasso to the data
#' \donttest{p.cv = plasso::cv.plasso(X,y)}
#' # get basic summary statistics
#' \donttest{print(summary(p.cv, default=FALSE))}
#' # plot cross-validated MSE curves and number of active coefficients
#' \donttest{plot(p.cv, legend_pos="bottomleft")}
#' # get coefficients at MSE optimal lambda value for both Lasso and Post-Lasso model
#' \donttest{coef(p.cv)}
#' # get coefficients at MSE optimal lambda value according to 1-standard-error rule
#' \donttest{coef(p.cv, se_rule=-1)}
#' # predict fitted values along whole lambda sequence 
#' \donttest{pred = predict(p.cv, s="all")}
#' \donttest{head(pred$plasso)}
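#' # inspect the MSE optimal lambda values stored in the returned object
#' # (a small additional sketch building on the fit from above)
#' \donttest{p.cv$lambda_min_l}
#' \donttest{p.cv$lambda_min_pl}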
#'
cv.plasso = function(x,y,
                     w=NULL,
                     kf=10,
                     parallel=FALSE,
                     ...) {

  this.call = match.call()
  # Handle potentially provided sample weights, otherwise create weight vector of ones
  w = handle_weights(w,nrow(x))
  # Create variable names if not provided
  if ( is.null( colnames(x) ) ) colnames(x) = sprintf("var%s",seq_len(ncol(x)))
  
  # Lasso with full estimation sample
  lasso_full = glmnet(x,y,weights=as.vector(w),family="gaussian",...)
  coef_lasso_full = coef(lasso_full)                   # Save coefficients to extract later the ones at the CV minima
  lambda = lasso_full$lambda                           # Save grid to use the same in cross-validation
  
  # Get indicator for CV samples
  split = stats::runif(nrow(x))
  cvgroup = as.numeric(cut(split,stats::quantile(split,probs=seq(0,1,1/kf)),include.lowest=TRUE))  # Groups for k-fold CV
  list = seq_len(kf)                                   # Needed later in the loop to get the appropriate samples
  
  if (!parallel) {
    
    # Initialize matrices for MSE of Lasso and post-lasso for each grid point and CV fold
    cv_MSE_lasso = matrix(nrow = kf,ncol = length(lambda))
    cv_MSE_plasso = matrix(nrow = kf,ncol = length(lambda))
    
    # Start loop for cross-validation of Lasso and Post-Lasso
    for (i in list) {
      CV = CV_core(x,y,w,cvgroup,list,i,lambda,...)
      
      # Extract MSE of Lasso and Post-Lasso
      cv_MSE_lasso[i,] = CV$MSE_lasso
      cv_MSE_plasso[i,] = CV$MSE_plasso
    }
    
  } else {
    
    n_cores = min(parallel::detectCores()-1,kf)
    
    cl = parallel::makeCluster(n_cores, methods=FALSE)
    doParallel::registerDoParallel(cl)
    
    para_res <- foreach::foreach(i = seq_len(kf), .combine="rbind", .inorder=FALSE) %dopar% {
                           
                           ## Core cross-validation step for fold i
                           CV = CV_core(x,y,w,cvgroup,list,i,lambda,...)
                           
                           ## Extract MSE of Lasso and Post-Lasso
                           cv_MSE_lasso = CV$MSE_lasso
                           cv_MSE_post_lasso = CV$MSE_plasso
                           
                           ## Matrices to be returned from cores
                           return(list(as.matrix(cv_MSE_lasso),as.matrix(cv_MSE_post_lasso)))
    }
    parallel::stopCluster(cl)
    
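    # Reassemble the per-fold results: after the cbind, the first kf columns hold the
    # Lasso MSEs and the next kf columns the Post-Lasso MSEs (one column per fold);
    # transposing gives the same fold-by-lambda matrices as in the sequential branch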
    para_res = as.matrix(do.call(cbind,para_res))
    cv_MSE_lasso = t(para_res[,seq_len(kf)])
    cv_MSE_plasso = t(para_res[,(kf+1):(2*kf)])
    
  }
  
  # Calculate mean MSEs over all folds
  cv_MSE_lasso[is.na(cv_MSE_lasso)] = max(cv_MSE_lasso) # can happen if glmnet does not go over the full grid
  cv_MSE_plasso[is.na(cv_MSE_plasso)] = max(cv_MSE_plasso) # and/or the Post-Lasso design does not have full rank
  mean_MSE_lasso = colMeans(cv_MSE_lasso)
  mean_MSE_plasso = colMeans(cv_MSE_plasso)
  
  # Get grid position of minimum MSE
  ind_min_l = which.min(mean_MSE_lasso)
  ind_min_pl = which.min(mean_MSE_plasso)
  # Get variable names at minima
  names_l = names(coef_lasso_full[,ind_min_l])[which(coef_lasso_full[,ind_min_l] != 0)]
  names_pl = names(coef_lasso_full[,ind_min_pl])[which(coef_lasso_full[,ind_min_pl] != 0)]
  
  # Get Lasso coefficients at minimum of Lasso
  coef_min_l = coef_lasso_full[,ind_min_l][which(coef_lasso_full[,ind_min_l] != 0)]
  # Get Lasso coefficients at minimum of Post-Lasso
  coef_min_pl = coef_lasso_full[,ind_min_pl][which(coef_lasso_full[,ind_min_pl] != 0)]
  
  # Return names and coefficients
  output = list("call"=this.call,
                "lasso_full"=lasso_full,"kf"=kf,
                "cv_MSE_lasso"=cv_MSE_lasso,"cv_MSE_plasso"=cv_MSE_plasso,
                "mean_MSE_lasso"=mean_MSE_lasso, "mean_MSE_plasso"=mean_MSE_plasso,
                "ind_min_l"=ind_min_l,"ind_min_pl"=ind_min_pl,
                "lambda_min_l"=lambda[ind_min_l],"lambda_min_pl"=lambda[ind_min_pl],
                "names_l"=names_l,"names_pl"=names_pl,
                "coef_min_l"=coef_min_l,"coef_min_pl"=coef_min_pl,
                "x"=x,"y"=y,"w"=w)
  
  class(output) = "cv.plasso"
  return(output)
}


#' Print cross-validated (Post-) Lasso model
#' 
#' @description
#' Printing main insights from cross-validated (Post-) Lasso model.
#'
#' @param x \code{\link{cv.plasso}} object
#' @param ... Pass generic \code{\link[base]{print}} options
#' @param digits Integer, used for number formatting
#'
#' @return Prints basic statistics for different lambda values of a fitted \code{\link{cv.plasso}} object,
#' i.e. cross-validated MSEs for both Lasso and Post-Lasso model as well as the number of active variables.
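#' 
#' @examples
#' # sketch mirroring the cv.plasso examples: fit on the toeplitz data and print
#' data(toeplitz)
#' y = as.matrix(toeplitz[,1])
#' X = toeplitz[,-1]
#' \donttest{p.cv = plasso::cv.plasso(X,y)}
#' \donttest{print(p.cv)}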
#' 
#' @method print cv.plasso
#'
#' @export
#'
print.cv.plasso = function(x,...,digits=max(3, getOption("digits")-3)) {
  
  plasso = x
  cat("\nCall: ", deparse(plasso$call), "\n\n")
  out = data.frame(Df=plasso$lasso_full$df,
                   `%Dev`=round(plasso$lasso_full$dev.ratio*100, 2),
                   Lambda=signif(plasso$lasso_full$lambda, digits),
                   MSE_lasso=plasso$mean_MSE_lasso,
                   MSE_plasso=plasso$mean_MSE_plasso,
                   check.names=FALSE,row.names=seq_along(plasso$lasso_full$df))
  class(out) = c("anova",class(out))
  print(out)
}


#' Summary of cross-validated (Post-) Lasso model
#' 
#' @description
#' Summary of cross-validated (Post-) Lasso model.
#'
#' @param object \code{\link{cv.plasso}} object
#' @param ... Pass generic \code{\link[base]{summary}} options
#' @param default TRUE for \code{\link[glmnet]{glmnet}}-like summary output, FALSE for more specific summary information
#'
#' @return For specific summary information: summary.cv.plasso object (using list structure) containing optimal
#' lambda values and associated MSEs for both cross-validated Lasso and Post-Lasso model.
#' For default: \code{\link[base:summary]{summaryDefault}} object.
#' 
#' @method summary cv.plasso
#'
#' @export
#'
#' @examples
#' # load toeplitz data
#' data(toeplitz)
#' # extract target and features from data
#' y = as.matrix(toeplitz[,1])
#' X = toeplitz[,-1]
#' # fit cv.plasso to the data
#' \donttest{p.cv = plasso::cv.plasso(X,y)}
#' # get informative summary statistics
#' \donttest{print(summary(p.cv, default=FALSE))}
#' # set default=TRUE for standard summary statistics
#' \donttest{print(summary(p.cv, default=TRUE))}
#' 
summary.cv.plasso = function(object,...,default=FALSE) {
  
  plasso = object
  
  if (default) {
    
    return(summary.default(plasso, ...))
    
  } else {
  
    value = list(
      call = plasso$call,
      mean_MSE_lasso = plasso$mean_MSE_lasso,
      mean_MSE_plasso = plasso$mean_MSE_plasso,
      lasso_full = plasso$lasso_full,
      ind_min_lasso = plasso$ind_min_l,
      ind_min_plasso = plasso$ind_min_pl
    )
    class(value) = "summary.cv.plasso"
    return(value)

  }
}


#' Print summary of (Post-) Lasso model
#'
#' @description
#' Prints summary information of \code{\link{cv.plasso}} object
#'
#' @param x Summary of a \code{\link{cv.plasso}} object (either of class \code{summary.cv.plasso} or \code{\link[base:summary]{summaryDefault}})
#' @param ... Pass generic R \code{\link[base]{print}} options
#' @param digits Integer, used for number formatting
#' 
#' @return Prints information from \code{\link{summary.cv.plasso}} object into console.
#' 
#' @method print summary.cv.plasso
#'
#' @export
#' 
print.summary.cv.plasso = function(x,...,digits=max(3L, getOption("digits") - 3L)) {
  
  if (inherits(x,'summaryDefault')) {
    
    print.summaryDefault(x, digits=digits, ...)
    
  } else if (inherits(x,'summary.cv.plasso')){
    
    cat("\nCall:\n ", paste(deparse(x$call), sep="\n", collapse = "\n"), "\n\n", sep = "")
    
    cat("Lasso:\n Minimum CV MSE Lasso: ",toString(signif(min(x$mean_MSE_lasso,na.rm=TRUE),digits)))
    cat("\n Lambda at minimum: ",toString(signif(x$lasso_full$lambda[x$ind_min_lasso],digits)))
    cat("\n Active variables at minimum: ",names(coef(x$lasso_full)[,x$ind_min_lasso])[which(coef(x$lasso_full)[,x$ind_min_lasso] != 0)])
    
    cat("\nPost-Lasso:\n Minimum CV MSE Post-Lasso: ",toString(signif(min(x$mean_MSE_plasso,na.rm=TRUE),digits)))
    cat("\n Lambda at minimum: ",toString(signif(x$lasso_full$lambda[x$ind_min_plasso],digits)))
    cat("\n Active variables at minimum: ",names(coef(x$lasso_full)[,x$ind_min_plasso])[which(coef(x$lasso_full)[,x$ind_min_plasso] != 0)])
    cat("\n")
    
  }
  
}


#' Predict after cross-validated (Post-) Lasso
#' 
#' @description
#' Prediction for cross-validated (Post-) Lasso.
#'
#' @param object Fitted \code{\link{cv.plasso}} model object
#' @param ... Pass generic \code{\link[stats]{predict}} options
#' @param newx Matrix of new values for x at which predictions are to be made. If no value is supplied, x from fitting procedure is used. This argument is not used for \code{type="coefficients"}.
#' @param type Type of prediction required. \code{"response"} returns fitted values, \code{"coefficients"} returns beta estimates.
#' @param s Determines whether prediction is done for all values of lambda (\code{"all"}) or only for the optimal lambda (\code{"optimal"}) according to the specified standard error rule.
#' @param se_rule If equal to 0 (default), predictions are made at the cross-validated MSE minimum. Negative values move towards smaller
#' models, positive values towards larger models (e.g. \code{se_rule=-1} gives the standard 1SE rule).
#' This argument is not used for \code{s="all"}.
#' 
#' @return List object containing either fitted values or coefficients for both
#' the Lasso and Post-Lasso models respectively.
#' \item{lasso}{Matrix with Lasso predictions or coefficients}
#' \item{plasso}{Matrix with Post-Lasso predictions or coefficients}
#'
#' @method predict cv.plasso
#'
#' @export
#' 
#' @examples
#' 
#' # load toeplitz data
#' data(toeplitz)
#' # extract target and features from data
#' y = as.matrix(toeplitz[,1])
#' X = toeplitz[,-1]
#' # fit cv.plasso to the data
#' \donttest{p.cv = plasso::cv.plasso(X,y)}
#' # predict fitted values along whole lambda sequence 
#' \donttest{pred = predict(p.cv, s="all")}
#' \donttest{head(pred$plasso)}
#' # predict fitted values for optimal lambda value (according to cross-validation) 
#' \donttest{pred_optimal = predict(p.cv, s="optimal")}
#' \donttest{head(pred_optimal$plasso)}
#' # predict fitted values for new feature set X
#' \donttest{X_new = head(X, 10)}
#' \donttest{pred_new = predict(p.cv, newx=X_new, s="optimal")}
#' \donttest{pred_new$plasso}
#' # get estimated coefficients along whole lambda sequence
#' \donttest{coefs = predict(p.cv, type="coefficients", s="all")}
#' \donttest{head(coefs$plasso)}
#' # get estimated coefficients for optimal lambda value according to 1-standard-error rule
#' \donttest{predict(p.cv, type="coefficients", s="optimal", se_rule=-1)}
#'
predict.cv.plasso = function(object,
                             ...,
                             newx=NULL,
                             type=c("response","coefficients"),
                             s=c("optimal","all"),
                             se_rule=0) {
  
  plasso = object
  # Check if type and s is valid
  type = match.arg(type)
  s = match.arg(s)
  
  x = plasso$x
  if ( is.null(newx) ) newx = x
  if ( is.null( colnames(x) ) ) colnames(x) = sprintf("var%s",seq_len(ncol(x)))
  colnames(newx) = colnames(x)
  x = add_intercept(x)
  newx = add_intercept(newx)
  
  y = plasso$y
  w = handle_weights(plasso$w,nrow(x))
  lambda_names = names(plasso$lasso_full$a0)
  
  if (s == "optimal") {
    
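    # Standard error of the mean CV MSE at each lambda (sqrt of the fold-to-fold variance
    # divided by the number of folds); used with se_rule to shift the chosen lambda away
    # from the MSE minimum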
    oneSE_lasso = sqrt(apply(plasso$cv_MSE_lasso,2,var)/plasso$kf)
    oneSE_plasso = sqrt(apply(plasso$cv_MSE_plasso,2,var)/plasso$kf)
    oneSE_lasso[is.na(oneSE_lasso)] = 0
    oneSE_plasso[is.na(oneSE_plasso)] = 0
    ind_Xse_l = find_Xse_ind(plasso$mean_MSE_lasso,plasso$ind_min_l,oneSE_lasso,se_rule)
    ind_Xse_pl = find_Xse_ind(plasso$mean_MSE_plasso,plasso$ind_min_pl,oneSE_plasso,se_rule)
    
    coef_lasso = as.matrix(coef(plasso$lasso_full)[,ind_Xse_l])
    colnames(coef_lasso) = paste0("optimal(",se_rule,")")
    
    nm_act = names(coef(plasso$lasso_full)[,ind_Xse_pl])[which(coef(plasso$lasso_full)[,ind_Xse_pl] != 0)]
    coef_plasso = as.matrix(fit_betas(x,y,w,nm_act,coef(plasso$lasso_full)[,ind_Xse_pl]))
    colnames(coef_plasso) = paste0("optimal(",se_rule,")")
    
    if (type == "coefficients") {
      
      return(list(
        "lasso"=as(coef_lasso,"dgCMatrix"),
        "plasso"=as(coef_plasso,"dgCMatrix")
        )
      )
      
    } else if (type == "response"){
      
      # Fitted values for lasso
      fit_lasso = as.matrix(newx %*% coef_lasso)
      colnames(fit_lasso) = paste0("optimal(",se_rule,")")
      
      # Fitted values for post lasso
      fit_plasso = as.matrix(newx %*% coef_plasso)
      colnames(fit_plasso) = paste0("optimal(",se_rule,")")
      
      return(list("lasso"=fit_lasso,"plasso"=fit_plasso))
      
    }
    
  } else if (s == "all"){
    
    l = length(plasso$lasso_full$lambda)
    
    coef_lasso = as.matrix(coef(plasso$lasso_full))
    coef_plasso = matrix(NA,nrow=ncol(x),ncol=l,dimnames=list(colnames(x),colnames(coef_lasso)))
    
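    # For each lambda, refit OLS on the Lasso active set to obtain the Post-Lasso coefficients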
    for (i in seq_len(l)) {
  
      nm_act = names(coef_lasso[,i])[which(coef_lasso[,i] != 0)]
      coef_plasso[,i] = fit_betas(x,y,w,nm_act,coef_lasso[,i])
    }
    
    if (type == "coefficients") {
      
      return(list(
        "lasso"=as(coef_lasso,"dgCMatrix"),
        "plasso"=as(coef_plasso,"dgCMatrix")
      )
      )
      
    } else if (type == "response"){
      
      fit_lasso = matrix(NA,nrow=nrow(newx),ncol=l,dimnames=list(NULL,colnames(coef_lasso)))
      fit_plasso = matrix(NA,nrow=nrow(newx),ncol=l,dimnames=list(NULL,colnames(coef_lasso)))
      
      for (i in seq_len(l)) {
        
        fit_lasso[,i] = newx %*% coef_lasso[,i]
        fit_plasso[,i] = newx %*% coef_plasso[,i]
      }
      
      return(list("lasso"=fit_lasso,"plasso"=fit_plasso))
      
    }
  }
}


#' Extract coefficients from a \code{\link{cv.plasso}} object
#' 
#' @description
#' Extract coefficients for both Lasso and Post-Lasso from a \code{\link{cv.plasso}} object.
#' 
#' @param object \code{\link{cv.plasso}} object
#' @param ... Pass generic \code{\link[stats]{coef}} options
#' @param s Determines whether coefficients are extracted for all values of lambda ("all") or only for the optimal lambda ("optimal") according to the specified standard error rule.
#' @param se_rule If equal to 0 (default), coefficients are taken at the cross-validated MSE minimum. Negative values move towards smaller
#' models, positive values towards larger models (e.g. \code{se_rule=-1} gives the standard 1SE rule).
#' This argument is not used for \code{s="all"}.
#' 
#' @return List object containing coefficients for both the Lasso and Post-Lasso
#' models respectively.
#' \item{lasso}{Sparse \code{\link[Matrix:dgCMatrix-class]{dgCMatrix}} with Lasso coefficients}
#' \item{plasso}{Sparse \code{\link[Matrix:dgCMatrix-class]{dgCMatrix}} with Post-Lasso coefficients}
#'
#' @method coef cv.plasso
#'
#' @export 
#' 
#' @examples
#' 
#' # load toeplitz data
#' data(toeplitz)
#' # extract target and features from data
#' y = as.matrix(toeplitz[,1])
#' X = toeplitz[,-1]
#' # fit cv.plasso to the data
#' \donttest{p.cv = plasso::cv.plasso(X,y)}
#' # get estimated coefficients along whole lambda sequence
#' \donttest{coefs = coef(p.cv, s="all")}
#' \donttest{head(coefs$plasso)}
#' # get estimated coefficients for optimal lambda value according to 1-standard-error rule
#' \donttest{coef(p.cv, s="optimal", se_rule=-1)}
#'
coef.cv.plasso = function(object,...,s=c("optimal","all"),se_rule=0){
  return(predict(object,...,s=s,se_rule=se_rule,type="coefficients"))
}


#' Plot of cross-validation curves
#' 
#' @description
#' Plot of cross-validation curves.
#' 
#' @param x \code{\link{cv.plasso}} object
#' @param ... Pass generic \code{\link[base]{plot}} options
#' @param legend_pos Legend position. Only considered for the joint plot (\code{lasso=FALSE}).
#' @param legend_size Font size of legend
#' @param lasso If set to TRUE, only the cross-validation curve for the Lasso model is plotted. Default is FALSE.
#' 
#' @return Plots the cross-validation curves for both Lasso and Post-Lasso models (incl. upper and lower 1-standard-error curves)
#' for a fitted \code{\link{cv.plasso}} object.
#' 
#' @method plot cv.plasso
#'
#' @export
#' 
#' @examples
#' # load toeplitz data
#' data(toeplitz)
#' # extract target and features from data
#' y = as.matrix(toeplitz[,1])
#' X = toeplitz[,-1]
#' # fit cv.plasso to the data
#' \donttest{p.cv = plasso::cv.plasso(X,y)}
#' # plot cross-validated MSE curves and number of active coefficients
#' \donttest{plot(p.cv, legend_pos="bottomleft")}
#'
plot.cv.plasso = function(x,...,
                          legend_pos=c("bottomright","bottom","bottomleft","left","topleft","top","topright","right","center"),
                          legend_size=0.5,
                          lasso=FALSE) {
  
  plasso = x
  legend_pos = match.arg(legend_pos)
  
  if (!lasso) {
  
    # Standard error of folds
    oneSE_lasso = sqrt(apply(plasso$cv_MSE_lasso,2,var)/plasso$kf)
    oneSE_plasso = sqrt(apply(plasso$cv_MSE_plasso,2,var)/plasso$kf)
    oneSE_lasso[is.na(oneSE_lasso)] = 0
    oneSE_plasso[is.na(oneSE_plasso)] = 0
    
    # Calculate 1SE bands
    lasso_1se_up = plasso$mean_MSE_lasso+oneSE_lasso
    lasso_1se_low = plasso$mean_MSE_lasso-oneSE_lasso
    plasso_1se_up = plasso$mean_MSE_plasso+oneSE_plasso
    plasso_1se_low = plasso$mean_MSE_plasso-oneSE_plasso
    
    # Get ranges for the graph
    xrange = range(log(plasso$lasso_full$lambda))
    yrange = c(max(-1.7e+308,min(lasso_1se_low,plasso_1se_low)),
               min(1.7e+308,max(lasso_1se_up,plasso_1se_up)))
    
    # Plot mean lines
    ylab = "Mean-squared Error"
    xlab = expression(Log(lambda))
    graphics::plot(xrange,yrange,type="n",xlab=xlab,ylab=ylab)
    graphics::lines(log(plasso$lasso_full$lambda),plasso$mean_MSE_lasso,lwd=1.5,col="blue")
    graphics::lines(log(plasso$lasso_full$lambda),plasso$mean_MSE_plasso,lwd=1.5,col="red")
    
    # Plot upper and lower 1SE lines
    graphics::lines(log(plasso$lasso_full$lambda),lasso_1se_up,lty=2,lwd=1,col="blue")
    graphics::lines(log(plasso$lasso_full$lambda),lasso_1se_low,lty=2,lwd=1,col="blue")
    graphics::lines(log(plasso$lasso_full$lambda),plasso_1se_up,lty=2,lwd=1,col="red")
    graphics::lines(log(plasso$lasso_full$lambda),plasso_1se_low,lty=2,lwd=1,col="red")
    
    # Show location of minima
    graphics::abline(v=log(plasso$lambda_min_l),lty = 1,col="blue")
    graphics::abline(v=log(plasso$lambda_min_pl),lty = 1,col="red")
    
    # Print legend
    graphics::legend(legend_pos, c("Lasso MSE","Lasso MSE+-1SE","Post-Lasso MSE","Post-Lasso MSE+-1SE","# active coeff."),lty=c(1,2,1,2,1),
                     col=c('blue','blue','red','red','forestgreen'),ncol=1,bty ="n",cex=legend_size)
    
    oldpar = graphics::par(no.readonly = TRUE)
    on.exit(graphics::par(oldpar))
    
    # Overlay the number of active coefficients on the existing plot (secondary axis)
    graphics::par(new=TRUE)
    graphics::plot(log(plasso$lasso_full$lambda),plasso$lasso_full$df,axes=FALSE,xlab=NA,ylab=NA,cex=1.2,type="l",col="forestgreen",lwd=1.5)
    graphics::axis(side=4)
    graphics::mtext(side=4, line=3, "# active coefficients")
    
  } else if (lasso) {
    
    plot(plasso$lasso_full,...)
    
  }
}


#' Core part for (Post-) Lasso cross-validation
#' 
#' @description
#' \code{\link{CV_core}} contains the core parts of the cross-validation for Lasso and Post-Lasso.
#' 
#' @param x Covariate matrix to be used in cross-validation
#' @param y Vector of outcomes
#' @param w Vector of weights
#' @param cvgroup Vector assigning each observation to one of the k folds
#' @param list Vector 1:k of fold indices
#' @param i Index of the fold used as prediction sample
#' @param lambda Series of lambda values (grid taken from the full-sample Lasso)
#' @param ... Pass \code{\link[glmnet]{glmnet}} options
#'
#' @return MSE_lasso / MSE_plasso: mean squared errors for each lambda.
#'
#' @keywords internal
#'
CV_core = function(x,y,w,cvgroup,list,i,lambda,...) {
  
  # Get estimation and prediction sample for this specific fold
  x_est_cv = x[cvgroup %in% list[-i],]
  y_est_cv = y[cvgroup %in% list[-i]]
  w_est_cv = w[cvgroup %in% list[-i],]
  x_pred_cv = x[cvgroup == list[i],]
  y_pred_cv = y[cvgroup == list[i]]
  w_pred_cv = w[cvgroup == list[i],]
  
  # Normalize the weights to N
  w_est_cv = norm_w_to_n(as.matrix(w_est_cv))
  w_pred_cv = norm_w_to_n(as.matrix(w_pred_cv))
  
  # Estimate Lasso for this fold using the grid of the full sample
  lasso_cv = glmnet::glmnet(x_est_cv, y_est_cv,lambda = lambda,weights=as.vector(w_est_cv),
                    family="gaussian",...)
  coef_lasso_cv = coef(lasso_cv)                                       # Save coefficients at each grid point
  
  # Predicted values with lasso coefficients for prediction sample at each grid
  fit_lasso = predict(lasso_cv,x_pred_cv)
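  # glmnet may stop before reaching the end of the lambda grid; pad the missing
  # columns with NA (these are later replaced by the maximum MSE in cv.plasso)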
  if (ncol(fit_lasso) != length(lambda)) {
    fit_lasso = cbind(fit_lasso,matrix(NA,nrow(fit_lasso),(length(lambda)-ncol(fit_lasso))))
  }
  
  ## Now calculate predicted values for post Lasso at each grid
  # Initialize first matrix for fitted values
  fit_plasso = matrix(NA,nrow = nrow(fit_lasso),ncol = ncol(fit_lasso))
  
  # Figure out which coefficients are active at each Lasso grid
  act_coef = (coef_lasso_cv != 0)         # boolean matrix
  
  ## Calculate the full cross-products X'X and X'y once and select only the relevant entries in the loop below (much faster)
  # First, figure out variables that were active at least once and get only these
  act_once = apply(act_coef,1,any)
  nm_all_act_coef = rownames(act_coef)[act_once]
  if (length(nm_all_act_coef) == 1) {
    warning("No variables selected in one CV fold, even for the lowest lambda; consider revising the penalty term grid")
    fit_plasso = fit_lasso
  } else {
    ## Create the covariate matrix to "manually" calculate the fitted values (much faster than the built-in lm command)
    if (sum(abs(coef_lasso_cv[1,])) == 0) {   # Indicates that no intercept was used
      x_all_act = x_est_cv[,nm_all_act_coef]
    }
    else if (sum(abs(coef_lasso_cv[1,])) != 0) {    # Indicates that intercept was used
      x_all_act = add_intercept(x_est_cv[,nm_all_act_coef[2:length(nm_all_act_coef)],drop=F])
      colnames(x_all_act)[1] = "(Intercept)"
      # add intercept also to prediction sample
      x_pred_cv = add_intercept(x_pred_cv[,nm_all_act_coef[2:length(nm_all_act_coef)],drop=F])
      colnames(x_pred_cv)[1] = "(Intercept)"
    }
    else {
      stop("Something strange happens with the intercepts at the Lasso path")
    }
    # Apply square-root weights so that the cross-products below form the weighted normal equations
    x_w = apply(x_all_act,2,`*`,sqrt(w_est_cv))
    y_w = y_est_cv * sqrt(w_est_cv)
    
    # Get X'X
    XtX_all = crossprod(x_w)
    # Get X'y
    Xty_all = crossprod(x_w,y_w)
    
    # Initialize vector to save chosen variable names to figure out whether new variables were added from the last to the next grid
    nm_act_coef_prev = "nobody should call a variable like this"
    
    ## Loop over the grid of Lambdas to get the Post-Lasso predictions
    for (j in seq_len(length(lambda))) {
      # Get names of active variables at this grid
      nm_act_coef = rownames(act_coef)[act_coef[,j]]
      if (identical(nm_act_coef,character(0)) == TRUE) {
        # print("No variable selected at this grid => no prediction")
        next
      }
      if (identical(nm_act_coef,nm_act_coef_prev) == TRUE & j > 1) {
        # print("Same variables selected as in previous grid => Post-Lasso predictions remain unchanged")
        fit_plasso[,j] = fit_plasso[,(j-1)]
        next
      }
      else {     # Get prediction covariate matrix for that grid
        x_ols_pred = as.matrix(x_pred_cv[,nm_act_coef])
      }
      
      # Get OLS Post-Lasso predictions for this grid point
      fit = fitted_values_cv(XtX_all,Xty_all,x_ols_pred,nm_act_coef)
      if (is.null(fit) & j == 1) {
        fit_plasso[,j] = rep(mean(y),nrow(fit_plasso))
        next
      }
      if (is.null(fit) & j > 1) {
        # cat("\n X'X not invertible at grid",toString(j),": Use last feasible coefficients")
        fit_plasso[,j] = fit_plasso[,(j-1)]
        next
      } else{
        fit_plasso[,j] = fit
      }
      
      # Update current active covariates for the check whether it changes in the next grid
      nm_act_coef_prev = nm_act_coef
    }       # end loop over grids
    
  } # end if at least one var selected
  
  # Matrix with "real" outcome values for each grid
  y_rep = matrix(rep(y_pred_cv,length(lambda)),nrow=nrow(fit_lasso),ncol=ncol(fit_lasso))
  
  # Get MSE
  SE_lasso = (y_rep - fit_lasso)^2
  SE_plasso = (y_rep - fit_plasso)^2
  if (!is.null(w)) {                                     # Weight errors by "sampling weights"
    SE_lasso = apply(SE_lasso,2,"*",w_pred_cv)
    SE_plasso = apply(SE_plasso,2,"*",w_pred_cv)
  }
  MSE_lasso = apply(SE_lasso,2,mean)
  MSE_plasso = apply(SE_plasso,2,mean)
  
  return(list("MSE_lasso" = MSE_lasso,"MSE_plasso" = MSE_plasso))
}
