R/correlation_utils.R

Defines functions cohenf2_formula_mixed_model cohenf2_formula_fixed_model format_pvalue general_linear_model_without_interaction general_linear_model_correlation multiple_compare_groupe dataPrep

dataPrep <- function(data_src, roi_start, roi_end, sep='\t')
{
  ### This function is to do the required data preprocessing from the targeted tsv file 
  data_src <- read.table(data_src, sep = sep, header = T, dec = ',')
  
  ### checkout dimension
  data_src_dim <- dim(data_src)
  cat("The source data tsv dimension is: ")
  print(data_src_dim)
  cat('\n')
  
  ### make sure the numerical data_src to be numerical  
  data_src[, setdiff(colnames(data_src), c("participant_id", "session_id", "group"))] <- apply(data_src[, setdiff(colnames(data_src), c("participant_id", "session_id", "group"))], 2, function(x)as.numeric(as.character(x))) # 2 means columns to run 
  
  ## Make sure all the categorical factor to be character
  data_src[, c("participant_id", 'session_id', "group")] <- apply(data_src[, c("participant_id", 'session_id', "group")], 2, as.character)
  ###############
  #########Make sure there is no NA in the data
  #################
  NA_res <- apply(data_src, 2, function(x) sum(is.na(x)))# 0 OK
  cat("Note, important, check out the number of every column, make sure it is 0: ")
  print(NA_res)
  cat('\n')
  
  nom_ROIs_volume <- setdiff(colnames(data_src), c("participant_id", 'session_id', "group", "age", "sex", "disease_duration", "cpz", "age_disease_onset", "panss_pos", "panss_neg", "education"))
  
  return(list(data_src, nom_ROIs_volume))
}

###########################
#####  Group comparison
############################
multiple_compare_groupe <- function(data, var_group = 'group', vars = setdiff(colnames(data_analyse), c('group', 'participant_id', "session_id")), factorVars = c('icv'), test = 'wilcox', arrondi = 2, indicateur = 'median', format_pvalue = T, correct = 'bonferroni'){
  #  ##### When you wanna use this to compare two groups, you can just use this
  # do.call constructs and exectues a function call.
  #mclapply is a parallelized version of lapply mclapply(x, FUN...)
  # if the var in vars is in factorVars, and if it is sex, gointo first if, else gointo else, and if not in factorVars, gointo second else
  # for sex, is to find the correlation between the var and the groups, if they are independent 
  
  ###### Choose between non-parametric(wilcox) and parametric(t-test):
  ###Note: One principled response to this is "safety first": with no way to reliably verify the normality assumption in a small sample, stick to non-parametric methods
  ######
  groupe <- sort(unique(as.character(data[, var_group])))
  
  table <- do.call('rbind', mclapply(vars, function(var){
    if(var %in% factorVars){
      #### This is to consider just sex 
      if(length(unique(data[, var])) == 2){
        ### This is for sex
        res <- do.call('cbind', lapply(groupe, function(gr){
          data.frame(groupe = paste(sum(subset(data, get(var_group) == gr)[,var] == sort(unique(data[, var]))[1]), ' (', 
                                    format_arrondi(100*sum(subset(data, get(var_group) == gr)[,var] == sort(unique(data[, var]))[1])/
                                                     sum(!is.na(subset(data, get(var_group) == gr)[,var]))), '%)', sep= ''))
        }))
        colnames(res) <- groupe
        return(cbind(data.frame(var = paste(var, sort(unique(data[, var]))[1], sep=' = ')), 
                     res,
                     data.frame(pvalue = chisq.test(data[, var_group], data[, var])$p.value)))
      }else{
        ### This is to do the real multiple comparison for the diff ROIs
        res <- do.call('rbind', lapply(sort(unique(data[, var])), function(mod){
          cbind(data.frame(var = paste(var, mod, sep=' = ')), 
                do.call('cbind', lapply(groupe, function(gr){
                  data.frame(groupe = paste(sum(subset(data, get(var_group) == gr)[,var] == mod), ' (', 
                                            format_arrondi(100*sum(subset(data, get(var_group) == gr)[,var] == mod)/
                                                             sum(!is.na(subset(data, get(var_group) == gr)[,var]))), '%)', sep= ''))
                })))
        }))
        colnames(res) <- c('var', groupe)
        return(cbind(res,
                     data.frame(pvalue = c(chisq.test(data[, var_group], data[, var])$p.value, rep(NA, length(unique(data[, var]))-1)))))
      }
    }else{
      res <- do.call('cbind', lapply(groupe, function(gr){
        data.frame(groupe = ifelse(indicateur == 'median', 
                                   paste(format_arrondi(median(subset(data, get(var_group) == gr)[,var])), 
                                         ' [', format_arrondi(quantile(subset(data, get(var_group) == gr)[,var], 0.25)), ',',
                                         format_arrondi(quantile(subset(data, get(var_group) == gr)[,var], 0.75)), ']', sep=''), 
                                   paste(format_arrondi(mean(subset(data, get(var_group) == gr)[,var])), 
                                         ' (', format_arrondi(sd(subset(data, get(var_group) == gr)[,var])), ')', sep='')))
      }))
      colnames(res) <- groupe
      
      return(cbind(data.frame(var = var), 
                   res, 
                   pvalue = ifelse(length(groupe) == 2, 
                                   ifelse(test == 'wilcox',
                                          wilcox.test(data[,var] ~ data[,var_group])$p.value, 
                                          t.test(data[,var] ~ data[,var_group], var.equal = ifelse(var.test(data[,var] ~ data[,var_group])$p.value<0.05, F, T))$p.value), 
                                   ifelse(test == 'wilcox', 
                                          kruskal.test(data[,var] ~ factor(data[,var_group]))$p.value, 
                                          anova(lm(data[,var] ~ factor(data[,var_group])))[["Pr(>F)"]][1]))))
    }
  }, mc.cores = 3))
  
  ########## To get the group info
  res <- do.call('cbind', lapply(groupe, function(gr){
    data.frame(groupe = paste(nrow(subset(data, get(var_group) == gr)), ' (', 
                              format_arrondi(100*nrow(subset(data, get(var_group) == gr))/sum(!is.na(data[, var_group]))), '%)', sep= ''))
  }))
  colnames(res) <- groupe
  ############
  ########## concatenate the res and table
  table <- rbind(cbind(data.frame(var = 'N'), 
                       res, 
                       data.frame(pvalue = NA)), 
                 table)
  
  table$pvalue_multiple <- p.adjust(as.numeric(as.character(table$pvalue)), correct, sum(!is.na(table$pvalue)))
  if(format_pvalue) table[, c('pvalue', 'pvalue_multiple')] <- apply(table[, c('pvalue', 'pvalue_multiple')] , 2, function(x) format_pvalue(pvalue = as.numeric(as.character(x))))
  
  return(table)
}

##########################################################################
############## General linear model for correlation
##########################################################################
general_linear_model_correlation <- function(data, predictor, model_independent_var, ROIs, model = 'fixe', arrondi = 10, correction = 'BH'){
  if(model == 'fixe'){
    results_tableau <- do.call('rbind', lapply(ROIs, function(var){
      model_complet <- lm(as.formula(paste(var, paste(model_independent_var, collapse = ' + '), sep = ' ~ ')), data = data)
      
      results <- cbind(data.frame(var), 
                       do.call('cbind', lapply(predictor, function(e){
                         res <- data.frame(slope_estimate = summary(model_complet)$coefficients[4, 1],
                                           slope_std = summary(model_complet)$coefficients[4, 2],
                                           pvalue = summary(model_complet)$coefficients[4, 4],
                                           pvalue_mult = NA,
                                           pearson_r = cor(data.matrix(data[,c(e)]), data.matrix(data[,c(var)]),  method = "pearson"),
                                           residual_normality = ols_test_correlation(model_complet))
                         colnames(res) <- paste(e, colnames(res), sep='_')
                         return(res)
                       })))
    }))
  }else{
    ##### mixed Model TODO
  }
  
  #### multiple correction for the pvalue
  for(col in colnames(results_tableau)[which(regexpr('_mult', colnames(results_tableau)) != (-1))]){
    results_tableau[, col] <- p.adjust(results_tableau[, gsub('_mult', '', col), ], 'BH', nrow(results_tableau))
  }
  ############ format the result tabel
  results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] <- apply(results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] , 2, function(x) format_pvalue(x))
  
  return(results_tableau)
}

##########################################################################
############## General linear model without interaction between age and group
##########################################################################
general_linear_model_without_interaction <- function(data, effets, ROIs, model = 'fixe', arrondi = 10, correction = 'BH'){
  if(model == 'fixe'){
    results_tableau <- do.call('rbind', lapply(ROIs, function(var){
      model_complet <- lm(as.formula(paste(var, paste(effets, collapse = ' + '), sep = ' ~ ')), data = data)
      
      results <- cbind(data.frame(var = gsub('_', '.', gsub('.cortex', '', gsub('_volume', '', gsub('.TIV', '', gsub('Left', 'L', gsub('Right', 'R',  var))))))), 
                       do.call('cbind', lapply(effets, function(e){
                         ### This here is to define the full model and reduced model, here, really just consider the interaction's effect when creating the reduced model
                         model_low <- lm(as.formula(paste(var, paste(setdiff(effets, effets[which(regexpr(e, effets, fixed = T) != (-1))]), collapse = ' + '), sep = ' ~ ')), data = data)
                         coef <- format_arrondi(coef(model_complet)[which(regexpr(e, names(coef(model_complet))) != (-1))], arrondi)
                         
                         res <- data.frame(data.frame(t(coef)),
                                           f2cohen = format_arrondi(cohenf2_formula_fixed_model(model_complet, model_low), 2), 
                                           pvalue =  anova(model_complet, model_low, test='LRT')[["Pr(>Chi)"]][2], 
                                           # pvalueWALDconsist = waldtest(model_low, model_complet, vcov = vcov_consist)$"Pr(>F)"[2],
                                           pvalue_mult = NA)
                         colnames(res) <- paste(e, colnames(res), sep='_')
                         return(res)
                       })))
    }))
  }else{
    ##### mixed Model 
    results_tableau <- do.call('rbind', lapply(ROIs, function(var){
      model_complet <- lmer(as.formula(paste(var, ' ~ ', paste(effets, collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
      
      cbind(data.frame(var = gsub('_', '.', gsub('.cortex', '', gsub('_volume', '', gsub('.TIV', '', gsub('Left', 'L', gsub('Right', 'R',  var))))))), 
            do.call('cbind', lapply(effets, function(e){
              ### This here is to define the full model and reduced model, here, really just consider the interaction's effect when creating the reduced model
              if(any(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1)) & regexpr(':', e) == (-1)) model_complet <- lmer(as.formula(paste(var, ' ~ ', paste(setdiff(effets, effets[which(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1))]), collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
              model_low <- lmer(as.formula(paste(var, ' ~ ', paste(setdiff(effets, effets[which(regexpr(e, effets, fixed = T) != (-1))]), collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
              if(regexpr(':', e) != (-1)){ coef <- format_arrondi(fixef(model_complet)[which(regexpr(e, names(fixef(model_complet))) != (-1))], arrondi)}else{coef <- format_arrondi(fixef(model_complet)[which(regexpr(e, names(fixef(model_complet))) != (-1) & regexpr(':', names(fixef(model_complet))) == (-1))], arrondi)}
              res <- data.frame(data.frame(t(coef)),
                                f2cohen = format_arrondi(cohenf2_formula_mixed_model(model_complet, model_low), 2),
                                pvalue =  anova(model_complet, model_low, test='LRT')[["Pr(>Chisq)"]][2], 
                                pvalue_mult = NA)
              colnames(res) <- paste(e, colnames(res), sep='_')
              return(res)
            })))
    }))
  }
  
  #### multiple correction for the pvalue
  for(col in colnames(results_tableau)[which(regexpr('_mult', colnames(results_tableau)) != (-1))]){
    results_tableau[, col] <- p.adjust(results_tableau[, gsub('_mult', '', col), ], 'BH', nrow(results_tableau))
  }
  ############ format the result tabel
  results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] <- apply(results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] , 2, function(x) format_pvalue(x))
  
  return(results_tableau)
}

# Create a function to format the pvalue for the following result
format_pvalue <- function(pvalue, n.arrondi = 3) gsub(' ', '', ifelse(pvalue < 0.05, paste(format(ifelse(round(pvalue, n.arrondi) == 0, '<0.001', round(pvalue, n.arrondi)), nsmall=n.arrondi), '*', sep=''),  format(round(pvalue, n.arrondi), nsmall=n.arrondi)))

cohenf2_formula_fixed_model <- function(model_complet, model_emboite){
  # According to Cohen???s (1988) guidelines, f2f2??? 0.02, f2f2??? 0.15, and f2f2 ??? 0.35 represent small, medium, and large effect sizes, respectively.
  return( (summary(model_complet)$r.squared-summary(model_emboite)$r.squared)/(1-summary(model_complet)$r.squared))
}

cohenf2_formula_mixed_model <- function(model_complet, model_emboite){
  # According to Cohen???s (1988) guidelines, f2f2??? 0.02, f2f2??? 0.15, and f2f2 ??? 0.35 represent small, medium, and large effect sizes, respectively.
  return((r.squaredGLMM(model_complet)[1]-r.squaredGLMM(model_emboite)[1])/(1- r.squaredGLMM(model_complet)[1]))
}
anbai106/NeuroStatisticR documentation built on May 27, 2020, 2:20 a.m.