R/msqrobsum__bugfix.R

Defines functions msqrobsum

### original code from this commit: https://github.com/statOmics/MSqRobSum/commit/2875dcde1f89578685ed0a3316d6f66fa510b732
### here adapted to get optimal multiprocessing for vastly reduced computation times  &  compatibility with latest dplyr

### FRANK: removed all documentation/examples/"additional functions we don't need" and kept minimal set of imports
#' @import dplyr
#' @import tidyr
#' @importFrom limma squeezeVar
#' @importFrom lme4 refit getME findbars lmerControl
#' @importFrom purrr map imap map_lgl map_chr
### FRANK: add 2 imports below
#' @importFrom foreach %dopar%
#' @importFrom parallel clusterExport
#' @importFrom tidyselect any_of
### FRANK: below code is adapted only on lines/sections tagged with my name
msqrobsum <- function(
  data, formulas, group_vars = 'protein', contrasts = NULL
  , mode = c('msqrobsum','msqrob', 'sum')
  , robust_lmer_iter = 'auto'
  , squeeze_variance = TRUE
  , p_adjust_method = c("BH",p.adjust.methods)
  , keep_model = FALSE
  , rlm_args = list(maxit = 20L)
  , lmer_args = list(control = lme4::lmerControl(calc.derivs = FALSE)) ### FRANK: add lme4
  , parallel_args = list(strategy = 'multisession')
  ## EXPERIMENTAL OPTIONS
  , type_df = 'traceHat'
  ## , type_df = c('traceHat','conservative')
  , squeeze_covariate = FALSE
  , fit_fun = do_mm
){
  # rlang::exec(plan, !!!parallel_args) ### FRANK: remove this line, updated multiprocessing below
  mode = match.arg(mode)

  if (robust_lmer_iter == 'auto')
    robust_lmer_iter <- ifelse(mode == 'msqrob', 20L, 1L)
  type_df = match.arg(type_df)
  p_adjust_method = match.arg(p_adjust_method)
  if (missing(formulas) & mode == 'sum') formulas = ''
  if ((mode != 'sum') & length(unlist(purrr::map(formulas,lme4::findbars))) == 0) ### FRANK: add lme4 and purrr
    stop('msqrobsum only works with formulas with at least 1 random effect specified')
  if (is(data,'MSnSet')) data <- MSnSet2df(data)
  if (is(contrasts, 'character')){
    if (!all(c(formulas) %>% purrr::map_lgl(~{contrasts %in% purrr::map_chr(lme4::findbars(.),all.vars)}))) ### FRANK: add lme4 and purrr
      stop('Contrasts can only be constructed from variable name when the variable is specified as random effect')
    contrasts <- make_simple_contrast(data, contrasts)}
  group_vars <- purrr::map_chr(group_vars, ~match.arg(.,names(data))) ### FRANK: add purrr
  model_vars <- c(formulas) %>% purrr::map(all.vars) %>% unlist %>% unique ### FRANK: add purrr


  ### FRANK: begin multiprocessing fix
  ### begin original code
  # df <- data %>% select(group_vars,model_vars, 'expression', 'feature','sample') %>%
  #   group_by_at(group_vars) %>% nest %>%
  #   mutate(mm = future_lapply(data, fit_fun, robust_lmer_iter = robust_lmer_iter
  #                             , rlm_args = rlm_args, lmer_args =lmer_args
  #                             , contrasts = contrasts, keep_model = keep_model
  #                             , formulas = formulas, mode = mode)) %>%
  #   unnest(mm) %>% ungroup
  ### end original code
  fit_fun = match.fun(fit_fun)
  # prep data
  df <- data %>% select(tidyselect::any_of(c(group_vars, model_vars)), 'expression', 'feature','sample') %>% group_by_at(group_vars) %>% nest
  ## actual code change, from future package to foreach. Internal version number for updates to this codebase; v2.2
  if(!exists("cl")) stop("first, setup a cluster for multiprocessing")
  ## functions from global/parent environment
  parallel::clusterExport(cl, varlist = c("do_mm", "do_lmerfit", "summarizeRobust", "getVcovBetaBUnscaled", "getBetaB", "calculate_df", "getDf", "get_contrasts"), envir = parent.env(environment())) # .GlobalEnv)
  # arguments from local environment
  parallel::clusterExport(cl, varlist = c("robust_lmer_iter", "rlm_args", "lmer_args", "contrasts", "keep_model", "formulas", "mode"), envir = environment()) # environment(fun = NULL)
  df$mm = foreach::foreach(d = df$data, .packages = c("MASS", "lme4", "purrr", "dplyr", "tibble")) %dopar% {
    fit_fun(d, robust_lmer_iter = robust_lmer_iter
            , rlm_args = rlm_args, lmer_args =lmer_args
            , contrasts = contrasts, keep_model = keep_model
            , formulas = formulas, mode = mode)
  }
  # debug; x = do_mm(df$data[[1]], formulas, contrasts, mode, robust_lmer_iter, rlm_args, lmer_args, keep_model = T)
  ### FRANK: end multithreading fix

  df = df %>% unnest(cols = mm) %>% ungroup() ### FRANK; add ungroup

  if (mode == 'sum') return(df)
  ## Return also failed ones afterward
  rows_fail = apply(df, 1, function(x) any(lengths(x) == 0 | is.na(x)) ) ### FRANK; fix fail group filter @ dplyr 1.0 compatability
  df_failed <- df[rows_fail,] ### FRANK
  df <- df[!rows_fail,] ### FRANK
  # df_failed <- filter(df, is.na(df))### FRANK
  # df <- filter(df, !is.na(df))### FRANK
  if(!nrow(df)) {warning("No models could be fitted"); return(df_prot_failed)}

  ### FRANK: identify issues with lme4::lmer() when all proteins/models failed to help users diagnose R installation issues. Related to MS-DAP GitHub issue #36
  if(!"sigma" %in% colnames(df)) { # if all fit_fun() calls throw errors, the 'mm' column is NULL/empty and unnest() does not yield a sigma column
    assert_lme4_functional() # util function, new in MS-DAP 1.1.3
    stop("msqrob model estimation failed for all proteins") # halt even if we didn't find lme4::lmer() issues because below code will fail when 'sigma' column is missing
  }

  ## Squeeze variance
  df <- mutate(df, sigma_post = sigma, df_prior = 0, sigma_prior = 0)
  if(squeeze_variance) {
    ## no shrinkage if df < 1 because greatly influences emp. bay.
    ## TODO check only works when at least 3 protein
    id = df$df >= 1L
    sq = limma::squeezeVar(df$sigma[id]^2, df$df[id]) ### FRANK: add limma
    ## experimental: use intercept of model as covariate to
    ## squeeze variance to correct for  mean variance
    if(squeeze_covariate) sq = limma::squeezeVar(df$sigma[id]^2, df$df[id], covariate = df$intercept[id]) ### FRANK: add limma
    df[id,] = mutate(df[id,],
                     sigma_prior = sqrt(sq$var.prior), sigma_post = sqrt(sq$var.post), df_prior = sq$df.prior)} ### FRANK
  if(type_df == "conservative"){
    ## Calculate df on protein level, assumption is that there is only one protein value/run,
    ## TODO remove conservative option? Now broken, fix vars = colnames(...)
    df <- mutate(df, df_prior = 0
                 , df = map2_dbl(data, model,~calculate_df(.x,.y, vars = colnames(pData(msnset)))))}
  df <- mutate(df, df_post = df + df_prior)

  ## Calculate q and p values for contrasts (if contrasts are found)
  df <- df %>% select(group_vars ,df_post,sigma_post, contrasts) %>% unnest(contrasts) %>% ### FRANK: specify the unnest
    # df <- df %>% select(group_vars ,df_post,sigma_post, contrasts) %>% unnest %>% ### FRANK: orig line
    ## skip if there are no contrasts
    {if(nrow(.)) {
      mutate(., se = sigma_contrast * sigma_post
             , t = logFC/se
             , pvalue = pt(-abs(t), df_post) * 2) %>%
        select(-sigma_post, - df_post) %>%
        group_by(contrast) %>%
        mutate(qvalue = p.adjust(pvalue, method = p_adjust_method)) %>%
        group_by_at(group_vars) %>% group_nest(.key="contrasts") ### FRANK: adjust to dplyr updates
      # group_by_at(group_vars) %>% nest(contrasts = - group_vars) ### FRANK: orig code
    } else .} %>%
    select(group_vars, contrasts) %>%
    left_join(select(df, -contrasts),., by = group_vars)

  ## mutate(df_failed, contrasts = list(tibble())) %>% bind_rows(df,.)
  ## give empty tibble instead of NULL when no contrast (nicer to work with posthoc)
  # bind_rows(df,df_failed) %>% ### FRANK
  bind_rows(df, df_failed %>% select(-contrasts)) %>% ### FRANK
      mutate(contrasts = purrr::map(contrasts, ~{if (is(.x,'tbl')) .x else tibble()})) ### FRANK: add purrr
}

MSnSet2df <- function(msnset){
  ## Converts Msnset to a tidy dataframe
  ## Always creates feature and vector column so these shouldn't be defined by user.
  ## convenient for downstream analysis steps.
  if (!requireNamespace("Biobase", quietly = TRUE)) {
    stop("Package \"Biobase\" needed for this function to work. Please install it.",
         call. = FALSE)
  }
  if(any(c("sample", "feature", "expression") %in% c(colnames(fData(msnset)),colnames(pData(msnset))))){
    stop("Column names in the \"fData\" or \"pData\" slot of the \"msnset\" object cannot be named
         \"sample\", \"feature\" or \"expression\". Please rename.")
  }


  dt <- as_tibble(Biobase::exprs(msnset), rownames="feature") %>% gather(sample, expression, - feature, na.rm=TRUE) ### FRANK: update deprecated dplyr code, use as_tibble to convert matrix while maintaining rownames
  # dt <- as.data.frame(Biobase::exprs(msnset)) %>% mutate(feature = rownames(.)) %>% gather(sample, expression, - feature, na.rm=TRUE) ### FRANK: orig line
  dt <- fData(msnset) %>% mutate(feature = (rownames(.))) %>% left_join(dt,. , by = 'feature')
  dt <- pData(msnset) %>% mutate(sample = rownames(.)) %>% left_join(dt,. , by = 'sample')
  return(dt) ### FRANK: as_data_frame is a deprecated function. downstream code uses only tibbles, we return dt for now. if data.frame is desired, use as.data.frame(dt)
  # as_data_frame(dt) ### FRANK: orig line
}

#' @import tibble
do_mm <- function(d, formulas, contrasts, mode ='msqrobsum', robust_lmer_iter = 1L
                  , rlm_args = list(), lmer_args = list(), keep_model = TRUE){
  out <- list()
  if (mode != 'msqrob') {
    d <- mutate(d,expression = rlang::exec(summarizeRobust, expression, feature, sample, !!!rlm_args)) %>%
      dplyr::select(-feature) %>% dplyr::distinct() # FRANK: bugfix for R 4.2
    out$data_summarized <- list(d)
  }
  if (mode == 'sum') return(new_tibble(out ,nrow = 1L))
  for (form in c(formulas)) {
    ## TODO do lm fit if there are no random effects (all output inside loop, do while loop)
    model <- try(do_lmerfit(d, form,  robust_lmer_iter,lmer_args), silent = TRUE)
    if (is(model,"lmerMod")) break #else message(paste0('Cannot fit ', format(form)))
  }
  if (keep_model) out$model <- list(model)
  if (!is(model,"lmerMod")) return(new_tibble(out ,nrow = 1L))
  out$formula <- as.character(list(attributes(model@frame)$formula))
  out$df <- getDf(model)
  out$sigma <- sigma(model)
  out$contrasts <- list(get_contrasts(model,contrasts))
  out$intercept <- model@beta[1]
  new_tibble(out , nrow = 1L)
}

#' @importFrom MASS psi.huber
do_lmerfit <- function(df, form, robust_lmer_iter, args_lmer = list()){
  tol <- 1e-6
  fit <- rlang::exec(lme4::lmer, form, data = df, !!!args_lmer)
  ##Initialize SSE
  sseOld <- fit@devcomp$cmp['pwrss']
  while (robust_lmer_iter > 0){
    robust_lmer_iter= robust_lmer_iter-1
    res <- resid(fit)
    fit@frame$`(weights)` <- psi.huber(res/(mad(res,0)))
    fit <- lme4::refit(fit) ### FRANK: add lme4
    sse <- fit@devcomp$cmp['pwrss']
    if(abs(sseOld-sse)/sseOld <= tol) break
    sseOld <- sse
  }
  fit
}

#' @importFrom MASS rlm
summarizeRobust <- function(expression, feature, sample, ...) {
  ## Assumes that intensities mx are already log-transformed
  ## characters are faster to construct and work with then factors
  feature <- as.character(feature)
  ##If there is only one 1 peptide for all samples return expression of that peptide
  if (length(unique(feature)) == 1L) return(expression)
  sample <- as.character(sample)
  ## modelmatrix breaks on factors with 1 level so make vector of ones (will be intercept)
  if (length(unique(sample)) == 1L) sample <- rep(1,length(sample))

  ## sum contrast on peptide level so sample effect will be mean over all peptides instead of reference level
  X <- model.matrix(~ -1 + sample + feature,contrasts.arg = list(feature = 'contr.sum'))
  ## MASS::rlm breaks on singular values.
  ## check with base lm if singular values are present.
  ## if so, these coefficients will be zero, remove this column from model matrix
  ## rinse and repeat on reduced modelmatrix untill no singular values are present
  repeat {
    fit <- .lm.fit(X,expression)
    id <- fit$coefficients != 0
    X <- X[ , id, drop =FALSE]
    if (!any(!id)) break
  }
  ## Last step is always rlm
  fit <- rlm(X, expression, ...)
  ## Only return the estimated effects effects as summarised values
  ## sampleid = seq_along(unique(sample))
  ## return(X[,sampleid,drop = FALSE] %*% fit$coefficients[sampleid])
  fit$coefficients[paste0('sample',sample)]
}

#' @import tibble
get_contrasts <- function(model, contrasts){
  ## TODO allow for contrasts from fixed effects
  ## tricky because reference level can change
  betaB <- getBetaB(model)
  vcov <- getVcovBetaBUnscaled(model)
  coefficients <- names(betaB)
  id <- coefficients %in% rownames(contrasts)
  if (!any(id)) return(new_tibble(list(), nrow = 0))
  coefficients <- coefficients[id]
  vcov <- vcov[id,id]
  betaB <- betaB[id]

  ## check for which contrasts I have data
  missing_levels <- !(rownames(contrasts) %in% coefficients)
  id <- !apply(contrasts,2,function(x){any(x[missing_levels] != 0)})
  contrasts <- contrasts[coefficients, id, drop = FALSE]
  ## If no contrasts could be found, terminate
  if (is.null(colnames(contrasts))) return(new_tibble(list(), nrow = 0))
  new_tibble(list(contrast = colnames(contrasts)
                  , logFC = logFC <- (t(contrasts)%*%betaB)[,1]
                  , sigma_contrast = sqrt(diag(t(contrasts)%*%vcov%*%contrasts))
  ),nrow = sum(id))
}

getBetaB <- function(model) {
  betaB <- c(as.vector(lme4::getME(model,"beta")),as.vector(lme4::getME(model,"b"))) ### FRANK: add lme4
  names(betaB) <- c(colnames(model@pp$X), unlist(purrr::imap(model@flist,~{paste0(.y,levels(.x))}))) ### FRANK: add purrr
  betaB
}

getVcovBetaBUnscaled <- function(model){
  X <- lme4::getME(model,"X") ### FRANK: add lme4
  Z <- lme4::getME(model,"Z") ### FRANK: add lme4
  vcovInv <- Matrix::crossprod(cbind2(X,Z))
  Ginv <- Matrix::solve(Matrix::tcrossprod(lme4::getME(model,"Lambda")) + ### FRANK: add lme4
                          Matrix::Diagonal(ncol(Z),1e-18))
  i <- -seq_len(ncol(X))
  vcovInv[i,i] <- vcovInv[i,i]+Ginv
  as.matrix(Matrix::solve(vcovInv))
}

calculate_df <- function(df, model, vars){
  ## Get all the variables in the formula that are not defined in vars
  form <- attributes(model@frame)$formula
  vars_formula <- all.vars(form)
  vars_drop <- vars_formula[!vars_formula %in% vars]
  ## Sum of number of columns -1 of Zt mtrix of each random effect that does not involve a variable in vars_drop
  mq <- lme4::getME(model,'q_i') ### FRANK: add lme4
  id <- !purrr::map_lgl(names(mq),~{any(stringr::str_detect(.x,vars_drop))}) ### FRANK: add purrr
  p <- sum(mq[id]) - sum(id)
  ## Sum of fixed effect parameters that do not involve a variable in vars_drop
  mx <- lme4::getME(model,'X') ### FRANK: add lme4
  id <- !purrr::map_lgl(colnames(mx),~{any(stringr::str_detect(.x,vars_drop))}) ### FRANK: add purrr
  p <- p + sum(id)

  ## n is number of sample because 1 protein defined per sample
  n <- n_distinct(df$sample)
  n-p
}

getDf <- function(object){
  w <- object@frame$"(weights)"
  if (is.null(w)) w <- 1
  sigma <- sigma(object)
  sum((resid(object)* sqrt(w))^2)/sigma^2
}

make_simple_contrast <- function(data, contrast_var){
  c <- pull(data,contrast_var) %>% unique %>% paste0(contrast_var, .) %>% sort %>% as.factor
  comp <- combn(c,2,simplify = FALSE)
  condIds <- purrr::map(comp, ~which(c %in% .x)) ### FRANK: add purrr
  L <- rep(0,nlevels(c))
  L <- sapply(comp,function(x){L[x]=c(-1,1);L})
  rownames(L) <- levels(c)
  colnames(L) <- purrr::map_chr(comp, ~paste(rev(.x),collapse = '-')) ### FRANK: add purrr
  L
}
ftwkoopmans/msdap documentation built on March 5, 2025, 12:15 a.m.