R/internal.R
In plsmselect: Linear and Smooth Predictor Modelling with Penalisation and Variable Selection

Documented in find_family formula_setup lasso_gam_loop meandist nzeros

## SECONDARY INTERNAL FUNCTIONS

#' Internal Function
#'
#' Undocumented function. Do not use directly
#'
#' @param fam Family in character form
find_family = function(fam)
{
  lasso_family = fam
  if(fam == "cox") gam_family = mgcv::cox.ph()
  else gam_family = eval(parse(text = paste(fam, "()", sep = "")))
  return(list(gam_family = gam_family, lasso_family = lasso_family))
}

#' Internal Function
#'
#' Undocumented function. Do not use directly
#'
#' @inheritParams gamlassoChecks
#' @param formula A formula to be parsed
#' @param interactions logical. Should interactions be included.
#'
#' @importFrom dplyr "%>%"
formula_setup = function(formula = NULL, response.name = NULL, linear.name = NULL,
                         smooth.name = NULL, family = NULL, smooth.penalty = NULL,
                         num.knots = NULL, offset.name = NULL, interactions = F)
{
  if(is.null(formula))
  {
    if(length(response.name)==1)
      r.formula = response.name
    else
    {
      if(family=="binomial")
        r.formula = paste("cbind(", paste(response.name, collapse = ","), ")", sep = "")
      else
        stop("Multiple responses allowed only for binomial models.")
    }

    if(is.null(linear.name)) {
      l.formula = NULL
    } else {
      l.formula = paste(linear.name, collapse = "+")
    }

    if(is.null(smooth.name)) {
      s.formula = NULL
    }
    else
    {
      if(smooth.penalty == 1)
        s.formula = paste(paste("s(",smooth.name,",k=",num.knots,",bs='ts')",sep=""),collapse="+")
      else
        s.formula = paste(paste("s(",smooth.name,",k=",num.knots,",bs='tp')",sep=""),collapse="+")
    }

    if(is.null(offset.name)) {
      o.formula = NULL
    } else {
      o.formula = paste("+offset(", offset.name, ")", sep = "")
    }

    if(interactions)
    {
      ll.int.formula = expand.grid(linear.name, linear.name) %>%
        apply(1, function(x) paste(x, collapse = ":")) %>%
        paste(collapse = "+")

      l.formula = paste(l.formula, ll.int.formula, collapse = "+")

      ## Number of knots might be a problem if diff for separate smooth.terms

      if(smooth.penalty==1)
      {
        s.formula = paste(paste("ti(",smooth.name,",k=",num.knots,",bs='ts')",
                                sep=""),collapse="+")
        ls.int.formula = expand.grid(smooth.name, linear.name) %>%
          apply(1, function(x) paste0("ti(",x[1],",by=",x[2],
                                      ",k=",num.knots,",ordered=T,bs='ts')")) %>%
          paste(collapse = "+")

        ss.int.formula = expand.grid(smooth.name, smooth.name) %>%
          apply(1, function(x) paste0("ti(",x[1],",",x[2],
                                      ",k=",num.knots,",bs='ts')")) %>%
          paste(collapse = "+")
      }
      else
      {
        s.formula = paste(paste("ti(",smooth.name,",k=",num.knots,",bs='tp')",
                                sep=""),collapse="+")
        ls.int.formula = expand.grid(linear.name, smooth.name) %>%
          apply(1, function(x) paste0("ti(",x[1],",by=",x[2],
                                      ",k=",num.knots,",ordered=T,bs='tp')")) %>%
          paste(collapse = "+")

        ss.int.formula = expand.grid(smooth.name, smooth.name) %>%
          apply(1, function(x) paste0("ti(",x[1],",",x[2],
                                      ",k=",num.knots,",bs='tp')")) %>%
          paste(collapse = "+")
      }

      s.formula = paste(s.formula, ls.int.formula, ss.int.formula, collapse = "+")
    }
  }
  else
  {
    fm = deparse(formula) %>% paste(collapse = "") %>%
      (function(x) {gsub("[[:space:]]", "", x)}) %>% strsplit("~") %>% unlist()
    # The anonymous function is only defined to circumvent the note that we get
    # for using . and which is not allowed in CRAN

    r.formula = fm[1]

    if(grepl("X+", fm[2])) {
      l.formula = "X"
    } else {
      l.formula = NULL
    }

    #fm1 = strsplit(fm[2], "X+", fixed = T) %>% unlist() %>% paste(collapse = "")
    fm1 = gsub("X+", "", fm[2], fixed = T)
    if(is.na(fm1) || nchar(fm1) == 0) {
      s.formula = NULL
    } else {
      s.formula = fm1
    }

    if(is.null(offset.name)) {
      o.formula = NULL
    } else {
      o.formula = paste("+offset(", offset.name, ")", sep = "")
    }
  }

  formulae = list(response = r.formula, linear = l.formula,
                  smooth = s.formula, offset = o.formula)
  return(formulae)
}

detect_one_two = function(response) {

  if(grepl("cbind", response)) {
    y = gsub("cbind(", "", response, fixed = T)
    y = gsub(")", "", y, fixed = T)
    y = strsplit(y, ",")
    y = unlist(y)
  } else {
    y = response
  }
  return(y)
}


lasso_response = function(response.name, family, data, train.weights)
{
  y = NULL
  if(family == "binomial" && length(response.name) == 2)
  {
    y = data %>% dplyr::select(response.name[2:1]) %>% as.matrix()
  }
  else if(family == "cox")
  {
    response = data %>% dplyr::pull(response.name)
    y = survival::Surv(response, train.weights)
  }
  else
  {
    y = data %>% dplyr::pull(response.name)
  }
  return(y)
}


#' Internal Function
#'
#' Undocumented function. Do not use directly
#'
#' @param x,y Vectors of the same length
meandist = function(x,y)
{
  d1 = sum((x-y)^2)
  d1 = sqrt(d1/length(x))
  return(as.numeric(d1))
}

#' Internal Function
#'
#' Undocumented function. Do not use directly
#'
#' @param x,y Vectors of the same length
nzeros = function(x,y=NULL)
{
  z1 = sum(x==0)
  z2 = NULL
  if(!is.null(y)) z2 = sum((x==0) & (y==0))
  return(c(z1,z2))
}

#' Internal Function
#'
#' Undocumented function. Do not use directly
#'
#' @param data The data with value for all the linear and smooth predictors
#' @param families List of two families as returned by \code{find_family}
#' @param formulae List of formulae as returned by \code{formula_setup}
#' @param weights Vector with values of the weights variable if it exists.
#'   \code{NULL} otherwise.
#' @inheritParams gamlassoChecks
#'
#' @importFrom dplyr one_of "%>%"
#' @importFrom stats as.formula model.matrix cor
lasso_gam_loop = function(data, response.name, families, formulae,
                          num.iter, tolerance, offset.name, weights, seed)
{
  off = train.offset = rep(0, nrow(data))
  if(!is.null(offset.name)) train.offset = data %>% dplyr::pull(offset.name)
  off = off + train.offset
  set.seed(seed)

  ## INITIAL GAM STEP
  eqn = as.formula(paste(formulae$response, "~", formulae$smooth))
  gam.model = mgcv::gam(eqn, family = families$gam_family, data = data,
                        weights = weights, offset = off)

  gam.pred = mgcv::predict.gam(gam.model, newdata = data) %>% as.numeric()
  gam.coefs = gam.model$coefficients %>% as.numeric()

  # gam prediction does not include offset if not specified in formula
  # We could just have gam.pred as the new offset but we would need to
  # change the formula in gam which leads to problems while predicting
  off = gam.pred + train.offset

  ## INITIAL LASSO STEP
  #xmat = model.matrix(as.formula(paste("~", formulae$linear)), data = data)[,-1]
  xmat = as.formula(paste("~", formulae$linear)) %>% model.matrix(data = data) %>%
    (function(x) {x[,-1]}) %>% as.matrix()
  # The anonymous function is only defined to circumvent the note that we get
  # for using . and which is not allowed in CRAN

  y = lasso_response(response.name, families$lasso_family, data, weights)

  ## For cox fits specifying intercept to be T or F or NULL all give warnings
  ## Although the output with intercept=F is the same as specifying nothing
  if(families$lasso_family == "cox")
  {
    cv.lasso.model = glmnet::cv.glmnet(x=xmat, y=y, offset=off,
                                       family=families$lasso_family, maxit=1000)
    lasso.model = glmnet::glmnet(x=xmat, y=y, offset=off,
                                 family=families$lasso_family, maxit=1000)
  }
  else
  {
    cv.lasso.model = glmnet::cv.glmnet(x=xmat, y=y, offset=off, intercept=F,
                                       family=families$lasso_family, maxit=1000)
    lasso.model = glmnet::glmnet(x=xmat, y=y, offset=off, intercept=F,
                                 family=families$lasso_family, maxit=1000)
  }


  lasso.pred = stats::predict(cv.lasso.model, newx=xmat, s="lambda.min",
                              newoffset=0) %>% as.numeric()
  lasso.coefs = stats::coef(cv.lasso.model, s="lambda.min") %>% as.numeric()

  # glmnet prediction include offset
  # but suppressed by newoffset=0
  off = lasso.pred + train.offset

  convergence = FALSE

  convmat = matrix(0, nrow = 6, ncol = 1)
  convmat[5,1] = nzeros(lasso.coefs)

  for(i in 1:num.iter)
  {
    set.seed(seed)

    ## GAM STEP
    gam.model = mgcv::gam(eqn, family = families$gam_family, data = data,
                          weights = weights, offset = off)

    new.gam.pred = mgcv::predict.gam(gam.model, newdata = data) %>% as.numeric()
    new.gam.coefs = gam.model$coefficients %>% as.numeric()

    off = new.gam.pred + train.offset

    ## LASSO STEP
    ## For cox fits specifying intercept to be T or F or NULL all give warnings
    ## Although the output with intercept=F is the same as specifying nothing
    if(families$lasso_family == "cox")
    {
      cv.lasso.model = glmnet::cv.glmnet(x=xmat, y=y, offset=off,
                                         family=families$lasso_family, maxit=1000)
      lasso.model = glmnet::glmnet(x=xmat, y=y, offset=off,
                                   family=families$lasso_family, maxit=1000)
    }
    else
    {
      cv.lasso.model = glmnet::cv.glmnet(x=xmat, y=y, offset=off, intercept=F,
                                         family=families$lasso_family, maxit=1000)
      lasso.model = glmnet::glmnet(x=xmat, y=y, offset=off, intercept=F,
                                   family=families$lasso_family, maxit=1000)
    }

    new.lasso.pred = stats::predict(cv.lasso.model, newx=xmat,
                                    s="lambda.min", newoffset=0) %>% as.numeric()
    new.lasso.coefs = stats::coef(cv.lasso.model, s="lambda.min") %>% as.numeric()

    off = new.lasso.pred + train.offset

    ## CHECK CONVERGENCE
    if(meandist(lasso.pred, new.lasso.pred) < tolerance && meandist(gam.pred, new.gam.pred) < tolerance)
    {
      convergence = TRUE
      break
    }
    else
    {
      convnew = c(meandist(lasso.coefs, new.lasso.coefs),
                  cor(lasso.coefs, new.lasso.coefs),
                  meandist(gam.coefs, new.gam.coefs),
                  cor(gam.coefs, new.gam.coefs),
                  nzeros(lasso.coefs, new.lasso.coefs))
      convmat = cbind(convmat, convnew)

      lasso.pred = new.lasso.pred
      gam.pred = new.gam.pred
    }

  }

  return(list(lasso = list(cv.model = cv.lasso.model, model = lasso.model),
              gam = gam.model, convergence = convergence, convmat = convmat))
}



## Old gam and lasso loop where lasso was fitted first and then the gam
## Issue was that lasso fitted intercepts on top of intercept from gam
## We got rid of it by using intercept=F and reversing the order so we
## always start with some fitted intercept. If we didn't reverse we'd
## need another extra step just to fit a first intercept.
# lasso_gam_loop = function(data, response.name, families, formulae,
#                           num.iter, tolerance, offset.name, status, seed)
# {
#   off = train.offset = rep(0, nrow(data))
#   if(!is.null(offset.name)) train.offset = data %>% dplyr::pull(offset.name)
#   off = off + train.offset
#   set.seed(seed)
#
#   ## INITIAL LASSO STEP
#
#   xmat = model.matrix(as.formula(paste("~", formulae$linear)), data = data)[,-1]
#   response = data %>% dplyr::pull(response.name)
#   if(families$lasso_family != "cox")
#     y = response
#   else
#     y = survival::Surv(response, status)
#
#   cv.lasso.model = glmnet::cv.glmnet(x=xmat, y=y, offset=off, family=families$lasso_family, maxit=1000)
#   lasso.model = glmnet::glmnet(x=xmat, y=y, offset=off, family=families$lasso_family, maxit=1000)
#
#   lasso.pred = as.numeric(glmnet::predict.cv.glmnet(cv.lasso.model, newx=xmat, s="lambda.min", newoffset=0))
#   lasso.coefs = as.numeric(glmnet::coef.cv.glmnet(cv.lasso.model, s="lambda.min"))
#
#   # glmnet prediction include offset
#   off = lasso.pred + train.offset
#
#   ## INITIAL GAM STEP
#
#   eqn = as.formula(paste(response.name, "~", formulae$smooth))
#   gam.model = mgcv::gam(eqn, family = families$gam_family, data = data,
#                         weights = status, offset = off)
#
#   gam.pred = as.numeric(mgcv::predict.gam(gam.model, newdata = data))
#   gam.coefs = as.numeric(gam.model$coefficients)
#
#   # gam prediction does not include offset if not specified in formula
#   # We could just have gam.pred as the new offset but we would need to
#   # change the formula in gam which leads to problems while predicting
#   off = gam.pred + train.offset
#
#   convergence = FALSE
#
#   convmat = matrix(0, nrow = 6, ncol = 1)
#   convmat[5,1] = nzeros(lasso.coefs)
#
#   # lasso_dist = lasso_corr = vector()
#   # gam_dist = gam_corr = vector()
#   # lasso_zeros = matrix(nrow = 2, ncol = 0)
#   # lasso_zeros[1,1] = nzeros(lasso.coefs)
#
#   for(i in 1:num.iter)
#   {
#     set.seed(seed)
#     ## LASSO STEP
#
#     cv.lasso.model = glmnet::cv.glmnet(x=xmat, y=y, offset=off, family=families$lasso_family, maxit=1000)
#     lasso.model = glmnet::glmnet(x=xmat, y=y, offset=off, family=families$lasso_family, maxit=1000)
#
#     new.lasso.pred = as.numeric(glmnet::predict.cv.glmnet(cv.lasso.model, newx=xmat, s="lambda.min", newoffset=0))
#     new.lasso.coefs = as.numeric(glmnet::coef.cv.glmnet(cv.lasso.model, s="lambda.min"))
#
#     off = new.lasso.pred + train.offset
#
#     ## GAM STEP
#
#     gam.model = mgcv::gam(eqn, family = families$gam_family, data = data,
#                           weights = status, offset = off)
#
#     new.gam.pred = as.numeric(mgcv::predict.gam(gam.model, newdata = data))
#     new.gam.coefs = as.numeric(gam.model$coefficients)
#
#     off = new.gam.pred + train.offset
#
#     if(meandist(lasso.pred, new.lasso.pred) < tolerance && meandist(gam.pred, new.gam.pred) < tolerance)
#     {
#       convergence = TRUE
#       break
#     }
#     else
#     {
#       convnew = c(meandist(lasso.coefs, new.lasso.coefs),
#                   cor(lasso.coefs, new.lasso.coefs),
#                   meandist(gam.coefs, new.gam.coefs),
#                   cor(gam.coefs, new.gam.coefs),
#                   nzeros(lasso.coefs, new.lasso.coefs))
#       convmat = cbind(convmat, convnew)
#
#       lasso.pred = new.lasso.pred
#       gam.pred = new.gam.pred
#     }
#
#   }
#
#   return(list(lasso = list(cv.model = cv.lasso.model, model = lasso.model),
#               gam = gam.model, convergence = convergence, convmat = convmat))
# }

Any scripts or data that you put into this service are public.

plsmselect documentation built on Dec. 1, 2019, 1:11 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

plsmselect
Linear and Smooth Predictor Modelling with Penalisation and Variable Selection

R/internal.R
In plsmselect: Linear and Smooth Predictor Modelling with Penalisation and Variable Selection

Defines functions find_family formula_setup detect_one_two lasso_response meandist nzeros lasso_gam_loop

Documented in find_family formula_setup lasso_gam_loop meandist nzeros

Try the plsmselect package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

plsmselect Linear and Smooth Predictor Modelling with Penalisation and Variable Selection

R/internal.R In plsmselect: Linear and Smooth Predictor Modelling with Penalisation and Variable Selection

Defines functions find_family formula_setup detect_one_two lasso_response meandist nzeros lasso_gam_loop

Documented in find_family formula_setup lasso_gam_loop meandist nzeros

Try the plsmselect package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

plsmselect
Linear and Smooth Predictor Modelling with Penalisation and Variable Selection

R/internal.R
In plsmselect: Linear and Smooth Predictor Modelling with Penalisation and Variable Selection