NeuroStatisticR:

Documented in data_prep qqplot_func

data_prep <- function(input_tsv, roi_start, roi_end, sep='\t')
{
  #' A function to process the tsv data from T1 modality
  #' @param input_tsv the tsv file containing all the ROI measurement. The file should strictly contian these columns: "participant_id	session_id	group	age_bl	prevdemals_family_code	sex", then following each column for each ROI
  #' @param roi_start the first ROI name
  #' @param roi_end the last ROI name
  #' @param sep the separator for the tsv file, by default is '\t'
  #' 
  #' @return a list of the preprocessed data from the tsv files
  #' @export
  #' @examples
  #' WARNING: the first 7 columns should be 'participant_id	session_id	group	age_bl	prevdemals_family_code	sex
  input_tsv <- read.table(input_tsv, sep = sep, header = T, dec = ',')
  
  
  #### Check out the two files contens are consitent 
  ses_info <- table(input_tsv$session_id)
  cat("The session info comparison: ", ses_info, '\n')
  print(ses_info)
  cat('\n')
  group_info <- table(input_tsv$group)
  cat("The group info comparison: ")
  print(group_info)
  cat('\n')
  age_info <- summary(as.numeric(as.character(input_tsv$age_bl)))
  cat("The age_bl info: ")
  print(age_info)
  cat('\n')
  summary(as.numeric(as.character(input_tsv$expected_years_aoo_bl)))
  sex_info <- table(input_tsv$sex)
  cat("The sex info: ")
  print(sex_info)
  cat('\n')
  
  ### checkout dimension
  input_tsv_dim <- dim(input_tsv)
  cat("The source data tsv dimension is: ")
  print(input_tsv_dim)
  cat('\n')
  
  ## simplifier les noms de participant_id
  input_tsv$participant_id <- gsub('sub-PREVDEMALS', '', input_tsv$participant_id)
  length(unique( input_tsv$participant_id))
  length(unique(substr(input_tsv$participant_id, 6, 9)))
  input_tsv$participant_id <- substr(input_tsv$participant_id, 6, 9)
  
  ### make sure the numerical input_tsv to be numerical  
  input_tsv[, setdiff(colnames(input_tsv), c("participant_id", 'prevdemals_family_code', "session_id", "group", "sex"))] <- apply(input_tsv[, setdiff(colnames(input_tsv), c("participant_id", 'prevdemals_family_code', "session_id", "group", "sex"))], 2, function(x)as.numeric(as.character(x))) # 2 means columns to run 
  
  ## Suppression des colonnes non utiles
  data_analyse <- input_tsv[, c(which(colnames(input_tsv) %in% c('participant_id', 'prevdemals_family_code', 'group', 'age_bl', 'sex')), match(roi_start,  colnames(input_tsv)):match(roi_end, colnames(input_tsv)))]
  
  ## Make sure all the categorical factor to be character
  data_analyse[, c("participant_id", 'prevdemals_family_code', "group", "sex")] <- apply(data_analyse[, c("participant_id", 'prevdemals_family_code', "group", "sex")], 2, as.character)
  ###############
  #########Make sure there is no NA in the data
  #################
  NA_res <- apply(data_analyse, 2, function(x) sum(is.na(x)))# 0 OK
  cat("Note, important, check out the number of every column, make sure it is 0: ")
  print(NA_res)
  cat('\n')
  
  nom_ROIs_volume <- setdiff(colnames(data_analyse), c("participant_id", 'prevdemals_family_code', "group", "age_bl", "sex"))
  
  ### extract the MEAN from the nom_ROIs_volume
  nom_ROIs_volume <- nom_ROIs_volume[which(regexpr('_Mean', nom_ROIs_volume, fixed = T) != (-1))]
  
  ###Redefine the data_analyses
  data_analyse <- data_analyse[, c(c("participant_id", 'prevdemals_family_code', "group", "age_bl", "sex"), nom_ROIs_volume)]
  
  ###Compute the mean and std of the CN, PS, PT.
  data_analyse_CN <- data_analyse[which(data_analyse$group %in% c('CN')), ]
  data_analyse_PS <- data_analyse[which(data_analyse$group %in% c('PS')), ]
  data_analyse_PT <- data_analyse[which(data_analyse$group %in% c('PT')), ]
  
  data_analyse_CN_age_mean <- mean(data_analyse_CN[, c("age_bl")])
  data_analyse_CN_age_std <- sd(data_analyse_CN[, c("age_bl")])
  cat("The mean and stand deviation of CN: ")
  print(data_analyse_CN_age_mean)
  print(data_analyse_CN_age_std)
  cat('\n')
  sex_info_CN <- table(data_analyse_CN$sex)
  cat("The sex info for CN: ")
  print(sex_info_CN)
  cat('\n')
  
  
  data_analyse_PS_age_mean <- mean(data_analyse_PS[, c("age_bl")])
  data_analyse_PS_age_std <- sd(data_analyse_PS[, c("age_bl")])
  cat("The mean and stand deviation of PS: ")
  print(data_analyse_PS_age_mean)
  print(data_analyse_PS_age_std)
  cat('\n')
  sex_info_PS <- table(data_analyse_PS$sex)
  cat("The sex info for PS: ")
  print(sex_info_PS)
  cat('\n')
  
  data_analyse_PT_age_mean <- mean(data_analyse_PT[, c("age_bl")])
  data_analyse_PT_age_std <- sd(data_analyse_PT[, c("age_bl")])
  cat("The mean and stand deviation of PT: ")
  print(data_analyse_PT_age_mean)
  print(data_analyse_PT_age_std)
  cat('\n')
  sex_info_PT <- table(data_analyse_PT$sex)
  cat("The sex info for PT: ")
  print(sex_info_PT)
  cat('\n')
  
  return(list(data_analyse, nom_ROIs_volume))
}
##########
## QQPLOT
qqplot_data <- function (vec) # argument: vector of numbers
{
  # following four lines from base R's qqline()
  y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y)/diff(x)
  int <- y[1L] - slope * x[1L]
  return(data.frame(resids = vec, 
                    slope = slope, 
                    int = int))
}

qqplot_func <- function(data_validation, ROIs){
  return(do.call('rbind', lapply(ROIs, function(var) 
    cbind(data.frame(var = gsub('.TIV', '', var)), 
          qqplot_data(data_validation$res[which(data_validation$var == var)])))))
}

##### FUNC to do group comparison
# Create a function to round the numerical data
format_arrondi <- function(x, arrondi = 2)format(round(x, arrondi), nsmall=arrondi, scientific=TRUE)

# Create a function to format the pvalue for the following result
format_pvalue <- function(pvalue, n.arrondi = 3) gsub(' ', '', ifelse(pvalue < 0.05, paste(format(ifelse(round(pvalue, n.arrondi) == 0, '<0.001', round(pvalue, n.arrondi)), nsmall=n.arrondi), '*', sep=''),  format(round(pvalue, n.arrondi), nsmall=n.arrondi)))

## studentized residuals for lmer
StudentResid = function(fit){
  #### This is a function to calculate the studentized residuals for mixed-effect linear model
  res = residuals(fit)
  H = hatvalues(fit)
  sigma = summary(fit)$sigm
  sres = sapply(1:length(res), function(i) res[[i]]/(sigma*sqrt(1-H[[i]])))
  return(sres)
}

# this is a function to calculate the cohen's f value for fixed model
cohenD_fixed_model <- function(model_complet, model_emboite){
  return( (summary(model_complet)$r.squared-summary(model_emboite)$r.squared)/(1-summary(model_complet)$r.squared))
}
## this is a function to calculate the cohen's f value for mixed model
# According to Cohen’s (1988) guidelines, f2f2≥ 0.02, f2f2≥ 0.15, and f2f2 ≥ 0.35 represent small, medium, and large effect sizes, respectively.
# this is a function to calculate the cohen's f value for mixed model
cohenD_mixed_model <- function(model_complet, model_emboite){
  cohenf = (r.squaredGLMM(model_complet)[2]-r.squaredGLMM(model_emboite)[2])/r.squaredGLMM(model_complet)[2]
  return(cohenf[1])
}

analysisPCA <- function(groupPalette, data_analyse, nom_ROIs_volume)
{
  #### This is a function to run PCA for group factor
  data_PCA <- data_analyse[, setdiff(colnames(data_analyse), c('age_bl', 'sex'))]
  data_PCA$group <- factor(data_PCA$group) # encode a vector as a factor
  res.pca_vol_normalise <- PCA(data_PCA[, c('group', nom_ROIs_volume)], scale.unit = TRUE, ncp = length(nom_ROIs_volume), quali.sup = 1, graph = F) 
  plot.PCA(res.pca_vol_normalise, 
           habillage = 1, 
           ellipse = coord.ellipse(cbind.data.frame(data_analyse$group,res.pca_vol_normalise$ind$coord), bary=T), 
           cex = 2, label = 'none', col.hab = groupPalette)
  plot.PCA(res.pca_vol_normalise, axes=c(1, 2), choix="var", cex = 0.7)
  return(res.pca_vol_normalise)
}

##################
###################
# Linear model function
#####################
######################
general_linear_model_with_interaction <- function(data, effets, ROIs, model = 'fixe', arrondi = 10, correction = 'BH'){
  if(model == 'fixe'){
    results_tableau <- do.call('rbind', lapply(ROIs, function(var){
      model_complet <- lm(as.formula(paste(var, paste(effets, collapse = ' + '), sep = ' ~ ')), data = data)
      
      results <- cbind(data.frame(var = gsub('_', '.', gsub('.cortex', '', gsub('_volume', '', gsub('.TIV', '', gsub('Left', 'L', gsub('Right', 'R',  var))))))), 
                       do.call('cbind', lapply(effets, function(e){
                         ### This here is to define the full model and reduced model, here, really just consider the interaction's effect when creating the reduced model
                         if(any(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1)) & regexpr(':', e) == (-1)) 
                           model_complet <- lm(as.formula(paste(var, paste(setdiff(effets, effets[which(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1))]), collapse = ' + '), sep = ' ~ ')), data = data)
                         model_low <- lm(as.formula(paste(var, paste(setdiff(effets, effets[which(regexpr(e, effets, fixed = T) != (-1))]), collapse = ' + '), sep = ' ~ ')), data = data)
                         if(regexpr(':', e) != (-1)){ coef <- format_arrondi(coef(model_complet)[which(regexpr(e, names(coef(model_complet))) != (-1))], arrondi)}else{coef <- format_arrondi(coef(model_complet)[which(regexpr(e, names(coef(model_complet))) != (-1) & regexpr(':', names(coef(model_complet))) == (-1))], arrondi)}
                         
                         res <- data.frame(data.frame(t(coef)),
                                           f2cohen = format_arrondi(cohenD_fixed_model(model_complet, model_low), 2), 
                                           pvalue =  anova(model_complet, model_low, test='LRT')[["Pr(>Chi)"]][2], 
                                           # pvalueWALDconsist = waldtest(model_low, model_complet, vcov = vcov_consist)$"Pr(>F)"[2],
                                           pvalue_mult = NA)
                         colnames(res) <- paste(e, colnames(res), sep='_')
                         return(res)
                       })))
    }))
  }else{
    ##### mixed Model 
    results_tableau <- do.call('rbind', lapply(ROIs, function(var){
      model_complet <- lmer(as.formula(paste(var, ' ~ ', paste(effets, collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
      
      cbind(data.frame(var = gsub('_', '.', gsub('.cortex', '', gsub('_volume', '', gsub('.TIV', '', gsub('Left', 'L', gsub('Right', 'R',  var))))))), 
            do.call('cbind', lapply(effets, function(e){
              ### This here is to define the full model and reduced model, here, really just consider the interaction's effect when creating the reduced model
              if(any(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1)) & regexpr(':', e) == (-1)) model_complet <- lmer(as.formula(paste(var, ' ~ ', paste(setdiff(effets, effets[which(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1))]), collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
              model_low <- lmer(as.formula(paste(var, ' ~ ', paste(setdiff(effets, effets[which(regexpr(e, effets, fixed = T) != (-1))]), collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
              if(regexpr(':', e) != (-1)){ coef <- format_arrondi(fixef(model_complet)[which(regexpr(e, names(fixef(model_complet))) != (-1))], arrondi)}else{coef <- format_arrondi(fixef(model_complet)[which(regexpr(e, names(fixef(model_complet))) != (-1) & regexpr(':', names(fixef(model_complet))) == (-1))], arrondi)}
              res <- data.frame(data.frame(t(coef)),
                                pvalue =  anova(model_complet, model_low, test='LRT')[["Pr(>Chisq)"]][2], 
                                pvalue_mult = NA)
              colnames(res) <- paste(e, colnames(res), sep='_')
              return(res)
            })))
    }))
  }
  
  #### multiple correction for the pvalue
  for(col in colnames(results_tableau)[which(regexpr('_mult', colnames(results_tableau)) != (-1))]){
    results_tableau[, col] <- p.adjust(results_tableau[, gsub('_mult', '', col), ], 'BH', nrow(results_tableau))
  }
  ############ format the result tabel
  results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] <- apply(results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] , 2, function(x) format_pvalue(x))
  
  return(results_tableau)
}

##########################################################################
############## General linear model without interaction between age and group
##########################################################################
general_linear_model_without_interaction <- function(data, effets, ROIs, model = 'fixe', arrondi = 10, correction = 'BH'){
  if(model == 'fixe'){
    results_tableau <- do.call('rbind', lapply(ROIs, function(var){
      model_complet <- lm(as.formula(paste(var, paste(effets, collapse = ' + '), sep = ' ~ ')), data = data)
      
      results <- cbind(data.frame(var = gsub('_', '.', gsub('.cortex', '', gsub('_volume', '', gsub('.TIV', '', gsub('Left', 'L', gsub('Right', 'R',  var))))))), 
                       do.call('cbind', lapply(effets, function(e){
                         ### This here is to define the full model and reduced model, here, really just consider the interaction's effect when creating the reduced model
                         model_low <- lm(as.formula(paste(var, paste(setdiff(effets, effets[which(regexpr(e, effets, fixed = T) != (-1))]), collapse = ' + '), sep = ' ~ ')), data = data)
                         coef <- format_arrondi(coef(model_complet)[which(regexpr(e, names(coef(model_complet))) != (-1))], arrondi)
                         
                         res <- data.frame(data.frame(t(coef)),
                                           f2cohen = format_arrondi(cohenD_fixed_model(model_complet, model_low), 2), 
                                           pvalue =  anova(model_complet, model_low, test='LRT')[["Pr(>Chi)"]][2], 
                                           # pvalueWALDconsist = waldtest(model_low, model_complet, vcov = vcov_consist)$"Pr(>F)"[2],
                                           pvalue_mult = NA)
                         colnames(res) <- paste(e, colnames(res), sep='_')
                         return(res)
                       })))
    }))
  }else{
    ##### mixed Model 
    results_tableau <- do.call('rbind', lapply(ROIs, function(var){
      model_complet <- lmer(as.formula(paste(var, ' ~ ', paste(effets, collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
      
      cbind(data.frame(var = gsub('_', '.', gsub('.cortex', '', gsub('_volume', '', gsub('.TIV', '', gsub('Left', 'L', gsub('Right', 'R',  var))))))), 
            do.call('cbind', lapply(effets, function(e){
              ### This here is to define the full model and reduced model, here, really just consider the interaction's effect when creating the reduced model
              ### This is actually for model construction with interaction.
              ## this sentense is to detect the interaction, if e is not interaction, use the complete model in the beginning
              if(any(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1)) & regexpr(':', e) == (-1))
                model_complet <- lmer(as.formula(paste(var, ' ~ ', paste(setdiff(effets, effets[which(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1))]), collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
              ## This is the reduced_model without the var
              model_low <- lmer(as.formula(paste(var, ' ~ ', paste(setdiff(effets, effets[which(regexpr(e, effets, fixed = T) != (-1))]), collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
              ### This is for interaction
              if(regexpr(':', e) != (-1)){ coef <- format_arrondi(fixef(model_complet)[which(regexpr(e, names(fixef(model_complet))) != (-1))], arrondi)}
              ## This is for without interaction
              else{coef <- format_arrondi(fixef(model_complet)[which(regexpr(e, names(fixef(model_complet))) != (-1) & regexpr(':', names(fixef(model_complet))) == (-1))], arrondi)}
              res <- data.frame(data.frame(t(coef)),
                                f2cohen = format_arrondi(cohenD_mixed_model(model_complet, model_low), 2), ### cohen's f2
                                cohenD = as.numeric(coef) / sum(resid(model_complet)^2),
                                pvalue =  anova(model_complet, model_low, test='LRT')[["Pr(>Chisq)"]][2], # Performs a Likelihood Ratio Test between two nested IRT models.
                                pvalue_mult = NA)
              colnames(res) <- paste(e, colnames(res), sep='_')
              return(res)
            })))
    }))
  }
  
  #### multiple correction for the pvalue
  for(col in colnames(results_tableau)[which(regexpr('_mult', colnames(results_tableau)) != (-1))]){
    results_tableau[, col] <- p.adjust(results_tableau[, gsub('_mult', '', col), ], 'BH', nrow(results_tableau))
  }
  ############ format the result tabel
  results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] <- apply(results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] , 2, function(x) format_pvalue(x))
  
  return(results_tableau)
}

######################
########### Post Hoc analysis function with interaction
########################

postHOC_GR_ageGR <- function(data, ROIs, model = 'fixe', pvalue_GR, pvalue_ageGR, correction = 'BH', arrondi = 4){
  
  results <- do.call('rbind', lapply(ROIs, function(var){
    if(model == 'fixe'){
      if(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1)){
        model_complet <- lm(as.formula(paste(var, ' - 1 + group + age_bl + sex + group:age_bl', sep = ' ~ ')), data = data)
      }else{
        model_complet <- lm(as.formula(paste(var, ' - 1 + group + age_bl + sex', sep = ' ~ ')), data = data)
      }
    }else{
      #### fixed effect 
      if(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1)){
        model_complet <- lmer(as.formula(paste(var, ' - 1 + group + age_bl + sex + group:age_bl  + (1|prevdemals_family_code)', sep = ' ~ ')), data = data)
      }else{
        model_complet <- lmer(as.formula(paste(var, ' - 1 + group + age_bl + sex + (1|prevdemals_family_code)', sep = ' ~ ')), data = data)
      }
    }
    
    # if(regexpr('\\*', pvalue_GR[which(ROIs == var)]) != (-1) | regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1)){
    return(data.frame(var = var, 
                      GR_PS_CN_coeff = ifelse(regexpr('\\*', pvalue_GR[which(ROIs == var)]) != (-1), 
                                              ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                                                     format_arrondi(summary(glht(model_complet, linfct = matrix(c(-1, 1, 0, 0, 0, 0, 0), 1)))$test$coeff[1], arrondi), 
                                                     format_arrondi(summary(glht(model_complet, linfct = matrix(c(-1, 1, 0, 0, 0), 1)))$test$coeff[1], arrondi)),
                                              NA), 
                      GR_PS_CN_pvalue = ifelse(regexpr('\\*', pvalue_GR[which(ROIs == var)]) != (-1), 
                                               ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                                                      format_arrondi(summary(glht(model_complet, linfct = matrix(c(-1, 1, 0, 0, 0, 0, 0), 1)))$test$pvalues[1], arrondi), 
                                                      format_arrondi(summary(glht(model_complet, linfct = matrix(c(-1, 1, 0, 0, 0), 1)))$test$pvalues[1], arrondi)),
                                               NA), 
                      GR_PS_CN_pvalue_corr = NA, 
                      # GR_CN_PT = ifelse(regexpr('\\*', pvalue_GR[which(ROIs == var)]) != (-1), 
                      #                   summary(glht(model_complet, linfct = matrix(c(-1, 0, 1, 0, 0, 0, 0), 1)))$test$pvalues[1],
                      #                   NA), 
                      GR_PT_PS_coeff = ifelse(regexpr('\\*', pvalue_GR[which(ROIs == var)]) != (-1), 
                                              ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                                                     format_arrondi(summary(glht(model_complet, linfct = matrix(c(0, -1, 1, 0, 0, 0, 0), 1)))$test$coeff[1], arrondi), 
                                                     format_arrondi(summary(glht(model_complet, linfct = matrix(c(0, -1, 1, 0, 0), 1)))$test$coeff[1], arrondi)),
                                              NA),
                      GR_PT_PS_pvalue = ifelse(regexpr('\\*', pvalue_GR[which(ROIs == var)]) != (-1), 
                                               ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                                                      format_arrondi(summary(glht(model_complet, linfct = matrix(c(0, -1, 1, 0, 0, 0, 0), 1)))$test$pvalues[1], arrondi), 
                                                      format_arrondi(summary(glht(model_complet, linfct = matrix(c(0, -1, 1, 0, 0), 1)))$test$pvalues[1], arrondi)),
                                               NA),
                      GR_PT_PS_pvalue_corr = NA, 
                      ageGR_PS_CN_coeff = ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                                                 summary(glht(model_complet, linfct = matrix(c(0, 0, 0, -1, 0, 1, 0), 1)))$test$coeff[1], 
                                                 NA), 
                      ageGR_PS_CN_pvalue = ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                                                  format_arrondi(summary(glht(model_complet, linfct = matrix(c(0, 0, 0, -1, 0, 1, 0), 1)))$test$pvalues[1], arrondi), 
                                                  NA), 
                      ageGR_PS_CN_pvalue_corr = NA, 
                      # ageGR_CN_PT = ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                      #                      summary(glht(model_complet, linfct = matrix(c(0, 0, 0, -1, 0, 0, 1), 1)))$test$pvalues[1], 
                      #                      NA), 
                      ageGR_PT_PS_coef = ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                                                format_arrondi(summary(glht(model_complet, linfct = matrix(c(0, 0, 0, 0, 0, -1, 1), 1)))$test$coeff[1], arrondi), 
                                                NA),
                      ageGR_PT_PS_pvalue = ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                                                  summary(glht(model_complet, linfct = matrix(c(0, 0, 0, 0, 0, -1, 1), 1)))$test$pvalues[1], 
                                                  NA), 
                      ageGR_PT_PS_pvalue_corr = NA))
    # }
  }))
  
  results[, c('GR_PS_CN_pvalue', 'GR_PT_PS_pvalue')] <- t(apply(results[, c('GR_PS_CN_pvalue', 'GR_PT_PS_pvalue')], 1, function(x) p.adjust(x, 'holm', 2)))
  results[, c('ageGR_PS_CN_coeff', 'ageGR_PT_PS_pvalue')] <- t(apply(results[, c('ageGR_PS_CN_coeff', 'ageGR_PT_PS_pvalue')], 1, function(x) p.adjust(x, 'holm', 2)))
  results[, c('GR_PS_CN_pvalue_corr', 'GR_PT_PS_pvalue_corr', 'ageGR_PS_CN_pvalue_corr', 'ageGR_PT_PS_pvalue_corr')] <- apply(results[, c('GR_PS_CN_pvalue', 'GR_PT_PS_pvalue', 'ageGR_PS_CN_pvalue', 'ageGR_PT_PS_pvalue')], 2, function(x) p.adjust(as.numeric(x), correction, sum(!is.na(x))))
  
  results[, which(regexpr('pvalue', colnames(results)) != (-1))] <- apply( results[, which(regexpr('pvalue', colnames(results)) != (-1))], 2, function(x) format_pvalue(as.numeric(x)))
  
  
  return(results)
}

###############################
########### model validation
###################################
data_validation <- function(data, ROIs, model = 'fixe'){
  if(model == 'fixe'){
    return(do.call('rbind', mclapply(ROIs, function(var){
      model <- lm(as.formula(paste(var, ' ~ age_bl + sex + group + age_bl*group', sep = '')), data = data)
      return(data.frame(var = var, 
                        id = data$participant_id,
                        y = data[,var], 
                        y_hat = fitted(model), 
                        res = residuals(model), 
                        res_studentise = studres(model), 
                        levier = hatvalues(model)) ## In statistics and in particular in regression analysis, leverage is a measure of how far away the independent variable values of an observation are from those of the other observations.
      )
    }, mc.cores = 3)))
  }else{
    return(do.call('rbind', mclapply(ROIs, function(var){
      model <- lmer(as.formula(paste(var, ' ~ age_bl + sex + group + age_bl*group + (1|prevdemals_family_code)', sep = '')), data = data)
      return(data.frame(var = var, 
                        id = data$participant_id,
                        y = data[,var], 
                        y_hat = fitted(model), 
                        res = residuals(model), 
                        levier = hatvalues(model))
      )
    }, mc.cores = 3)))
  }
}

################
###### Data validation table creation
###############
random_effect_inspect <- function(data, ROIs, arrondi = 3){
  return(do.call('rbind', mclapply(ROIs, function(var){
    model <- lmer(as.formula(paste(var, ' ~ age_bl + sex + group + age_bl*group + (1|prevdemals_family_code)', sep = '')), data = data)
    return(cbind(data.frame(var = var), 
                 setNames(data.frame(t(data.frame(res = format_arrondi(ranef(model)$prevdemals_family_code[, 1], arrondi)))), rownames(ranef(model)$prevdemals_family_code)), 
                 data.frame(var_alea = format_arrondi(attr(VarCorr(model)$prevdemals_family_code, "stddev")[1], arrondi), ### Get the stand deviatin of the random effect
                            var_res = format_arrondi(summary(model)$sigma, arrondi), 
                            prop_var_tot = format_arrondi(attr(VarCorr(model)$prevdemals_family_code, "stddev")[1]/(summary(model)$sigma+attr(VarCorr(model)$prevdemals_family_code, "stddev")[1]), arrondi))))
  }, mc.cores = 3)))
}

#################
############ heterogeneity check function for fixed effect linear model
#############################
inspect_heterogeneite_fixed <- function(data, ROIs){
  return(do.call('rbind', mclapply(ROIs, function(var){
    model <- lm(as.formula(paste(var, ' ~ age_bl + sex + group + age_bl*group', sep = '')), data = data)
    return(data.frame(var = gsub('.TIV', '', gsub('Right', 'R', gsub('Left', 'L', var))), 
                      bp = format_pvalue(bptest(model)$p.value),
                      white = format_pvalue(bptest(model, ~ age_bl*sex + I(age_bl^2) + age_bl*group*sex, data = data_analyse)$p.value))
    )
  }, mc.cores = 3)))
}

###########################
#####  Group comparison
############################

multiple_compare_groupe <- function(data, var_group = 'group', vars = setdiff(colnames(data_analyse), c('group', 'participant_id')), factorVars = c('sex'), test = 'wilcox', arrondi = 2, indicateur = 'median', format_pvalue = T, correct = 'bonferroni'){
  #  ##### When you wanna use this to compare two groups, you can just use this
  # do.call constructs and exectues a function call.
  #mclapply is a parallelized version of lapply mclapply(x, FUN...)
  # if the var in vars is in factorVars, and if it is sex, gointo first if, else gointo else, and if not in factorVars, gointo second else
  # for sex, is to find the correlation between the var and the groups, if they are independent 
  
  ###### Choose between non-parametric(wilcox) and parametric(t-test):
  ###Note: One principled response to this is "safety first": with no way to reliably verify the normality assumption in a small sample, stick to non-parametric methods
  ######
  groupe <- sort(unique(as.character(data[, var_group])))
  
  table <- do.call('rbind', mclapply(vars, function(var){
    if(var %in% factorVars){
      #### This is to consider just sex 
      if(length(unique(data[, var])) == 2){
        ### This is for sex
        res <- do.call('cbind', lapply(groupe, function(gr){
          data.frame(groupe = paste(sum(subset(data, get(var_group) == gr)[,var] == sort(unique(data[, var]))[1]), ' (', 
                                    format_arrondi(100*sum(subset(data, get(var_group) == gr)[,var] == sort(unique(data[, var]))[1])/
                                                       sum(!is.na(subset(data, get(var_group) == gr)[,var]))), '%)', sep= ''))
        }))
        colnames(res) <- groupe
        return(cbind(data.frame(var = paste(var, sort(unique(data[, var]))[1], sep=' = ')), 
                     res,
                     data.frame(pvalue = chisq.test(data[, var_group], data[, var])$p.value)))
      }else{
        ### This is to do the real multiple comparison for the diff ROIs
        res <- do.call('rbind', lapply(sort(unique(data[, var])), function(mod){
          cbind(data.frame(var = paste(var, mod, sep=' = ')), 
                do.call('cbind', lapply(groupe, function(gr){
                  data.frame(groupe = paste(sum(subset(data, get(var_group) == gr)[,var] == mod), ' (', 
                                            format_arrondi(100*sum(subset(data, get(var_group) == gr)[,var] == mod)/
                                                               sum(!is.na(subset(data, get(var_group) == gr)[,var]))), '%)', sep= ''))
                })))
        }))
        colnames(res) <- c('var', groupe)
        return(cbind(res,
                     data.frame(pvalue = c(chisq.test(data[, var_group], data[, var])$p.value, rep(NA, length(unique(data[, var]))-1)))))
      }
    }else{
      res <- do.call('cbind', lapply(groupe, function(gr){
        data.frame(groupe = ifelse(indicateur == 'median', 
                                   paste(format_arrondi(median(subset(data, get(var_group) == gr)[,var])), 
                                         ' [', format_arrondi(quantile(subset(data, get(var_group) == gr)[,var], 0.25)), ',',
                                         format_arrondi(quantile(subset(data, get(var_group) == gr)[,var], 0.75)), ']', sep=''), 
                                   paste(format_arrondi(mean(subset(data, get(var_group) == gr)[,var])), 
                                         ' (', format_arrondi(sd(subset(data, get(var_group) == gr)[,var])), ')', sep='')))
      }))
      colnames(res) <- groupe
      
      return(cbind(data.frame(var = var), 
                   res, 
                   pvalue = ifelse(length(groupe) == 2, 
                                   ifelse(test == 'wilcox',
                                          wilcox.test(data[,var] ~ data[,var_group])$p.value, 
                                          t.test(data[,var] ~ data[,var_group], var.equal = ifelse(var.test(data[,var] ~ data[,var_group])$p.value<0.05, F, T))$p.value), 
                                   ifelse(test == 'wilcox', 
                                          kruskal.test(data[,var] ~ factor(data[,var_group]))$p.value, 
                                          anova(lm(data[,var] ~ factor(data[,var_group])))[["Pr(>F)"]][1]))))
    }
  }, mc.cores = 3))
  
  ########## To get the group info
  res <- do.call('cbind', lapply(groupe, function(gr){
    data.frame(groupe = paste(nrow(subset(data, get(var_group) == gr)), ' (', 
                              format_arrondi(100*nrow(subset(data, get(var_group) == gr))/sum(!is.na(data[, var_group]))), '%)', sep= ''))
  }))
  colnames(res) <- groupe
  ############
  ########## concatenate the res and table
  table <- rbind(cbind(data.frame(var = 'N'), 
                       res, 
                       data.frame(pvalue = NA)), 
                 table)
  
  table$pvalue_multiple <- p.adjust(as.numeric(as.character(table$pvalue)), correct, sum(!is.na(table$pvalue)))
  if(format_pvalue) table[, c('pvalue', 'pvalue_multiple')] <- apply(table[, c('pvalue', 'pvalue_multiple')] , 2, function(x) format_pvalue(pvalue = as.numeric(as.character(x))))
  
  return(table)
}


#######################################################################
####### This is to plot the regression plot of ROI
#######################################################################

plot_regreesion_scatter <- function(data, ROI, subtitle, xlabel="", ylabel="", legend_postion="right",  y_axis_limits=NULL, y_label_format=NULL){ 
  
  plot_data <- data[, c('participant_id', 'group', 'sex', "age_bl", ROI)]
  nom_ROIs_volume_TIV_sig_newname = c('participant_id', 'group', 'sex', "age_bl", subtitle)
  colnames(plot_data) <- nom_ROIs_volume_TIV_sig_newname
  ggplot_img <- ggplot(melt(plot_data, id.vars = c('participant_id', 'group', 'sex', "age_bl")),
                       aes(x = age_bl, y = value, color=group)) + geom_point(size=3) +
    geom_smooth(data = melt(plot_data, id.vars = c('participant_id', 'group', 'sex', "age_bl")),
                aes(x = age_bl, y = value, color=group), method = "lm", se = FALSE)  + facet_wrap(~variable, scales="free") + scale_colour_manual(labels = c("C9-", "C9+"), values=c("blue", "deeppink4")) + theme_bw() +
    theme(axis.line.x = element_line(colour = "black", size=1.5),
          axis.line.y = element_line(color="black", size=1.5),
          axis.text=element_text(size=20, colour="black", face="bold"),
          axis.title=element_text(size=20, colour="black", face="bold"),
          axis.ticks = element_line(colour = "black", size=1.5, linetype = 'longdash'),
          axis.ticks.length=unit(0.3,"cm"),
          axis.ticks.x = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank(), 
          strip.background = element_blank(),
          legend.position = legend_postion,
          legend.title = element_text(size=20, colour="black", face="bold"),
          legend.text = element_text(size=20, colour="black", face="bold"),
          strip.text = element_text(size=24, colour="black", face="bold", vjust=3)) + scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(labels = y_label_format, expand = c(0, 0)) + labs(x = xlabel, y = ylabel, color = "Group", face="bold") + coord_cartesian(xlim=c(20, 81), ylim=y_axis_limits)
  
  
  return(ggplot_img)
}

# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot_ggplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)
  
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  
  numPlots = length(plots)
  
  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  
  if (numPlots==1) {
    print(plots[[1]])
    
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    
    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}