R/T1_utilities_function.R

Defines functions multiplot_ggplot plot_regreesion_scatter multiple_compare_groupe inspect_heterogeneite_fixed random_effect_inspect data_validation postHOC_GR_ageGR general_linear_model_without_interaction general_linear_model_with_interaction analysisPCA cohenD_fixed_model StudentResid format_pvalue format_arrondi qqplot_func data_prep

Documented in data_prep qqplot_func

data_prep <- function(input_tsv, roi_start, roi_end, sep='\t', decimal='.')
{
  #' A function to process the tsv data from T1 modality
  #' @param input_tsv the tsv file containing all the ROI measurement. The file should strictly contian these columns: "participant_id	session_id	group	age_bl	prevdemals_family_code	sex", then following each column for each ROI
  #' @param roi_start the first ROI name
  #' @param roi_end the last ROI name
  #' @param sep the separator for the tsv file, by default is '\t'
  #' @param decimal how to define the decimal in the tsv file, by default is "."
  #' 
  #' @return a list of the preprocessed data from the tsv files
  #' WARNING: the first 7 columns should be 'participant_id	session_id	group	age_bl	prevdemals_family_code	sex	EstimatedTotalIntraCranialVol'
  input_tsv <- read.table(input_tsv, sep = sep, header = T, dec = decimal)
  
  info_data <- input_tsv[, c("participant_id", "session_id", "group", "age_bl", "prevdemals_family_code", "sex", "EstimatedTotalIntraCranialVol")]
  
  #### Check out the two files contens are consitent
  ses_info <- table(info_data$session_id)# 75 M0 OK
  cat("The session info comparison: ", ses_info, '\n')
  print(ses_info)
  cat('\n')
  group_info <- table(info_data$group)
  cat("The group info comparison: ")
  print(group_info)
  cat('\n')
  age_info <- summary(as.numeric(as.character(info_data$age_bl)))# 0 OK
  cat("The age_bl info : ")
  print(age_info)
  cat('\n')
  table(info_data$sex)#43 F, 32 M OK
  
  ### checkout dimension
  input_tsv_dim <- dim(input_tsv)
  cat("The source data csv dimension is: ")
  print(input_tsv_dim)
  cat('\n')
  
  ## simplifier les noms de participant_id
  input_tsv$participant_id <- gsub('sub-PREVDEMALS', '', input_tsv$participant_id)
  length(unique(input_tsv$participant_id))# 75
  length(unique(substr(input_tsv$participant_id, 6, 9)))# ok 75
  input_tsv$participant_id <- substr(input_tsv$participant_id, 6, 9)
  
  ### make sure the numerical input_tsv to be numerical
  input_tsv[, setdiff(colnames(input_tsv), c("participant_id", "session_id", "group", "prevdemals_family_code", "sex"))] <- apply(input_tsv[, setdiff(colnames(input_tsv), c("participant_id", "session_id", "group", "prevdemals_family_code", "sex"))], 2, function(x)as.numeric(as.character(x))) # 2 means columns to run
  
  ## Calculate the mean TIV of the CN + PS
  data_analyse_CN_PS <- input_tsv[which(input_tsv$group %in% c('CN', 'PS')), ]
  mean_TIV <- mean(data_analyse_CN_PS$EstimatedTotalIntraCranialVol)
  
  ## Suppression des colonnes non utiles
  data_analyse <- input_tsv[, c(which(colnames(input_tsv) %in% c('participant_id', 'group', 'age_bl', "prevdemals_family_code", 'sex', 'EstimatedTotalIntraCranialVol')), match(roi_start,  colnames(input_tsv)):match(roi_end, colnames(input_tsv)))]
  
  ## Make sure all the categorical factor to be character
  data_analyse[, c("participant_id", "prevdemals_family_code", "group", "sex")] <- apply(data_analyse[, c("participant_id", "prevdemals_family_code", "group", "sex")], 2, as.character)
  ### Make sure there is no NA in the data
  apply(data_analyse, 2, function(x) sum(is.na(x)))# 0 OK
  
  ### Calculate the TIV for each roi
  nom_ROIs_volume <- setdiff(colnames(data_analyse), c("participant_id", "group", "age_bl", "prevdemals_family_code", "sex", "EstimatedTotalIntraCranialVol"))
  nom_ROIs_volume_TIV <- paste(nom_ROIs_volume, 'TIV', sep = '.')
  ### Append the TIV with the model variables togethe, here, the normalized TIV * mean_TIV of all the subjects
  data_TIV <- apply(data_analyse[, nom_ROIs_volume], 2, function(x) mean_TIV*x/data_analyse$EstimatedTotalIntraCranialVol)
  nom_ROIs_volume_TIV <- gsub('_', '.', nom_ROIs_volume)
  colnames(data_TIV) <- nom_ROIs_volume_TIV# add the columns 2 the dataframe
  data_analyse <- cbind(data_analyse[1:6], data_TIV)# combine these two dataframes
  
  ###Compute the mean and std of the CN, PS, PT.
  data_analyse_CN <- data_analyse[which(data_analyse$group %in% c('CN')), ]
  data_analyse_PS <- data_analyse[which(data_analyse$group %in% c('PS')), ]
  data_analyse_PT <- data_analyse[which(data_analyse$group %in% c('PT')), ]
  
  data_analyse_CN_age_mean <- mean(data_analyse_CN[, c("age_bl")])
  data_analyse_CN_age_std <- sd(data_analyse_CN[, c("age_bl")])
  cat("The mean and stand deviation of CN: ")
  print(data_analyse_CN_age_mean)
  print(data_analyse_CN_age_std)
  cat('\n')
  sex_info_CN <- table(data_analyse_CN$sex)
  cat("The sex info for CN: ")
  print(sex_info_CN)
  cat('\n')
  
  
  data_analyse_PS_age_mean <- mean(data_analyse_PS[, c("age_bl")])
  data_analyse_PS_age_std <- sd(data_analyse_PS[, c("age_bl")])
  cat("The mean and stand deviation of PS: ")
  print(data_analyse_PS_age_mean)
  print(data_analyse_PS_age_std)
  cat('\n')
  sex_info_PS <- table(data_analyse_PS$sex)
  cat("The sex info for PS: ")
  print(sex_info_PS)
  cat('\n')
  
  data_analyse_PT_age_mean <- mean(data_analyse_PT[, c("age_bl")])
  data_analyse_PT_age_std <- sd(data_analyse_PT[, c("age_bl")])
  cat("The mean and stand deviation of PT: ")
  print(data_analyse_PT_age_mean)
  print(data_analyse_PT_age_std)
  cat('\n')
  sex_info_PT <- table(data_analyse_PT$sex)
  cat("The sex info for PT: ")
  print(sex_info_PT)
  cat('\n')
  
  # chi2 test for sex
  cat("The chi square test for sex: ")
  mydata = cbind(table(data_analyse_CN[, c("sex")]),table(data_analyse_PS[, c("sex")]))
  print(chisq.test(mydata)$p.value)
  pvalue_sex = chisq.test(mydata)$p.value
  cat('\n')
  
  # Mann-whitney test for age
  cat("The Mann-Whitney test for age: ")
  print(wilcox.test(data_analyse_CN[, c("age_bl")], data_analyse_PS[, c("age_bl")], exact=FALSE)$p.value)
  pvalue_age = wilcox.test(data_analyse_CN[, c("age_bl")], data_analyse_PS[, c("age_bl")], exact=FALSE)$p.value
  cat('\n')
  
  return(list(data_analyse, nom_ROIs_volume, nom_ROIs_volume_TIV, pvalue_sex, pvalue_age, data_analyse_PS_age_mean, data_analyse_PS_age_std, data_analyse_CN_age_mean, data_analyse_CN_age_std))
}



qqplot_data <- function (vec) # argument: vector of numbers
{
  # following four lines from base R's qqline()
  y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y)/diff(x)
  int <- y[1L] - slope * x[1L]
  return(data.frame(resids = vec, 
                    slope = slope, 
                    int = int))
}

qqplot_func <- function(data_validation, ROIs){
  #' A function for QQ plot
  #'
  #' This function allows you to express your love of cats.
  #' input_tsv: str, path to tsv
  #' roi_start: str, firt ROI
  #' roi_end: str, last ROI
  #' sep: the delimiter of the tsv
  #' decimal: the decimal in the tsv, . or ,
  return(do.call('rbind', lapply(ROIs, function(var) 
    cbind(data.frame(var = gsub('.TIV', '', var)), 
          qqplot_data(data_validation$res[which(data_validation$var == var)])))))
}

##### FUNC to do group comparison
# Create a function to round the numerical data
format_arrondi <- function(x, arrondi = 2)format(round(x, arrondi), nsmall=arrondi)

# Create a function to format the pvalue for the following result
format_pvalue <- function(pvalue, n.arrondi = 3) gsub(' ', '', ifelse(pvalue < 0.05, paste(format(ifelse(round(pvalue, n.arrondi) == 0, '<0.001', round(pvalue, n.arrondi)), nsmall=n.arrondi), '*', sep=''),  format(round(pvalue, n.arrondi), nsmall=n.arrondi)))

## studentized residuals for lmer
StudentResid = function(fit){
  #### This is a function to calculate the studentized residuals for mixed-effect linear model
  res = residuals(fit)
  H = hatvalues(fit)
  sigma = summary(fit)$sigm
  sres = sapply(1:length(res), function(i) res[[i]]/(sigma*sqrt(1-H[[i]])))
  return(sres)
}

cohenD_fixed_model <- function(model_complet, model_emboite){
  return( (summary(model_complet)$r.squared-summary(model_emboite)$r.squared)/(1-summary(model_complet)$r.squared))
}

analysisPCA <- function(groupPalette, data_analyse, nom_ROIs_volume)
{
  #### This is a function to run PCA for group factor
  data_PCA <- data_analyse[, setdiff(colnames(data_analyse), c('age_bl', 'sex'))]
  data_PCA$group <- factor(data_PCA$group) # encode a vector as a factor
  res.pca_vol_normalise <- PCA(data_PCA[, c('group', nom_ROIs_volume)], scale.unit = TRUE, ncp = length(nom_ROIs_volume), quali.sup = 1, graph = F)
  # this is the individual factor map for all the participants
  # explanation for Confidence ellipses: The variance of the underlying population relates to the confidence ellipse. High variance will mean that the data are all over the place, so the mean is not well estimated, so the confidence ellipse will be larger than if the variance were smaller.
  # plot.PCA(res.pca_vol_normalise,
  #          habillage = 1,
  #          ellipse = coord.ellipse(cbind.data.frame(data_analyse$group,res.pca_vol_normalise$ind$coord), bary=T),
  #          cex = 2, label = 'none', col.hab = groupPalette)
  # The squared cosine shows the importance of a component for a given observation.
  fviz_pca_ind(res.pca_vol_normalise, col.ind="cos2") +
    scale_color_gradient2(low="white", mid="blue",
                          high="red", midpoint=0.50) + theme_minimal()
  #plot.PCA(res.pca_vol_normalise, axes=c(1, 2), choix="var", cex = 0.7)
  # this is the variable factor maps for cos2, higher value, give red color which means high correlation with the first two PCs
  # The squared cosine shows the importance of a component for a given observation.
  fviz_pca_var(res.pca_vol_normalise, col.var="cos2") +
    scale_color_gradient2(low="white", mid="blue",
                          high="red", midpoint=0.5) + theme_minimal()
  # this is to plot the scree plot of the data
  fviz_screeplot(res.pca_vol_normalise, ncp=10)
  return(res.pca_vol_normalise)
}
##################
###################
# Linear model function
#####################
######################
general_linear_model_with_interaction <- function(data, effets, ROIs, model = 'fixe', arrondi = 4, correction = 'BH'){
  if(model == 'fixe'){
    results_tableau <- do.call('rbind', lapply(ROIs, function(var){
      model_complet <- lm(as.formula(paste(var, paste(effets, collapse = ' + '), sep = ' ~ ')), data = data)
      
      results <- cbind(data.frame(var = gsub('_', '.', gsub('.cortex', '', gsub('_volume', '', gsub('.TIV', '', gsub('Left', 'L', gsub('Right', 'R',  var))))))), 
                       do.call('cbind', lapply(effets, function(e){
                         ### This here is to define the full model and reduced model, here, really just consider the interaction's effect when creating the reduced model
                         if(any(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1)) & regexpr(':', e) == (-1)) 
                           model_complet <- lm(as.formula(paste(var, paste(setdiff(effets, effets[which(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1))]), collapse = ' + '), sep = ' ~ ')), data = data)
                         model_low <- lm(as.formula(paste(var, paste(setdiff(effets, effets[which(regexpr(e, effets, fixed = T) != (-1))]), collapse = ' + '), sep = ' ~ ')), data = data)
                         if(regexpr(':', e) != (-1)){ coef <- format_arrondi(coef(model_complet)[which(regexpr(e, names(coef(model_complet))) != (-1))], arrondi)}else{coef <- format_arrondi(coef(model_complet)[which(regexpr(e, names(coef(model_complet))) != (-1) & regexpr(':', names(coef(model_complet))) == (-1))], arrondi)}
                         
                         res <- data.frame(data.frame(t(coef)),
                                           f2cohen = format_arrondi(cohenD_fixed_model(model_complet, model_low), 2), 
                                           pvalue =  anova(model_complet, model_low, test='LRT')[["Pr(>Chi)"]][2], 
                                           # pvalueWALDconsist = waldtest(model_low, model_complet, vcov = vcov_consist)$"Pr(>F)"[2],
                                           pvalue_mult = NA)
                         colnames(res) <- paste(e, colnames(res), sep='_')
                         return(res)
                       })))
    }))
  }else{
    ##### mixed Model 
    results_tableau <- do.call('rbind', lapply(ROIs, function(var){
      model_complet <- lmer(as.formula(paste(var, ' ~ ', paste(effets, collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
      
      cbind(data.frame(var = gsub('_', '.', gsub('.cortex', '', gsub('_volume', '', gsub('.TIV', '', gsub('Left', 'L', gsub('Right', 'R',  var))))))), 
            do.call('cbind', lapply(effets, function(e){
              ### This here is to define the full model and reduced model, here, really just consider the interaction's effect when creating the reduced model
              if(any(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1)) & regexpr(':', e) == (-1)) model_complet <- lmer(as.formula(paste(var, ' ~ ', paste(setdiff(effets, effets[which(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1))]), collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
              model_low <- lmer(as.formula(paste(var, ' ~ ', paste(setdiff(effets, effets[which(regexpr(e, effets, fixed = T) != (-1))]), collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
              if(regexpr(':', e) != (-1)){ coef <- format_arrondi(fixef(model_complet)[which(regexpr(e, names(fixef(model_complet))) != (-1))], 4)}else{coef <- format_arrondi(fixef(model_complet)[which(regexpr(e, names(fixef(model_complet))) != (-1) & regexpr(':', names(fixef(model_complet))) == (-1))], 4)}
              res <- data.frame(data.frame(t(coef)),
                                pvalue =  anova(model_complet, model_low, test='LRT')[["Pr(>Chisq)"]][2], 
                                pvalue_mult = NA)
              colnames(res) <- paste(e, colnames(res), sep='_')
              return(res)
            })))
    }))
  }
  
  #### multiple correction for the pvalue
  for(col in colnames(results_tableau)[which(regexpr('_mult', colnames(results_tableau)) != (-1))]){
    results_tableau[, col] <- p.adjust(results_tableau[, gsub('_mult', '', col), ], 'BH', nrow(results_tableau))
  }
  ############ format the result tabel
  results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] <- apply(results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] , 2, function(x) format_pvalue(x))
  
  return(results_tableau)
}

##########################################################################
############## General linear model without interaction between age and group
##########################################################################
general_linear_model_without_interaction <- function(data, effets, ROIs, model = 'fixe', arrondi = 6, correction = 'BH'){
  if(model == 'fixe'){
    results_tableau <- do.call('rbind', lapply(ROIs, function(var){
      model_complet <- lm(as.formula(paste(var, paste(effets, collapse = ' + '), sep = ' ~ ')), data = data)
      
      results <- cbind(data.frame(var = gsub('_', '.', gsub('.cortex', '', gsub('_volume', '', gsub('.TIV', '', gsub('Left', 'L', gsub('Right', 'R',  var))))))), 
                       do.call('cbind', lapply(effets, function(e){
                         ### This here is to define the full model and reduced model, here, really just consider the interaction's effect when creating the reduced model
                         model_low <- lm(as.formula(paste(var, paste(setdiff(effets, effets[which(regexpr(e, effets, fixed = T) != (-1))]), collapse = ' + '), sep = ' ~ ')), data = data)
                         coef <- format_arrondi(coef(model_complet)[which(regexpr(e, names(coef(model_complet))) != (-1))], arrondi)
                         
                         res <- data.frame(data.frame(t(coef)),
                                           f2cohen = format_arrondi(cohenD_fixed_model(model_complet, model_low), 2), 
                                           pvalue =  anova(model_complet, model_low, test='LRT')[["Pr(>Chi)"]][2], 
                                           # pvalueWALDconsist = waldtest(model_low, model_complet, vcov = vcov_consist)$"Pr(>F)"[2],
                                           pvalue_mult = NA)
                         colnames(res) <- paste(e, colnames(res), sep='_')
                         return(res)
                       })))
    }))
  }else{
    ##### mixed Model 
    results_tableau <- do.call('rbind', lapply(ROIs, function(var){
      model_complet <- lmer(as.formula(paste(var, ' ~ ', paste(effets, collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
      
      cbind(data.frame(var = gsub('_', '.', gsub('.cortex', '', gsub('_volume', '', gsub('.TIV', '', gsub('Left', 'L', gsub('Right', 'R',  var))))))), 
            do.call('cbind', lapply(effets, function(e){
              ### This here is to define the full model and reduced model, here, really just consider the interaction's effect when creating the reduced model
              if(any(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1)) & regexpr(':', e) == (-1)) model_complet <- lmer(as.formula(paste(var, ' ~ ', paste(setdiff(effets, effets[which(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1))]), collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
              model_low <- lmer(as.formula(paste(var, ' ~ ', paste(setdiff(effets, effets[which(regexpr(e, effets, fixed = T) != (-1))]), collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
              if(regexpr(':', e) != (-1)){ coef <- format_arrondi(fixef(model_complet)[which(regexpr(e, names(fixef(model_complet))) != (-1))], 4)}else{coef <- format_arrondi(fixef(model_complet)[which(regexpr(e, names(fixef(model_complet))) != (-1) & regexpr(':', names(fixef(model_complet))) == (-1))], 4)}
              res <- data.frame(data.frame(t(coef)),
                                pvalue =  anova(model_complet, model_low, test='LRT')[["Pr(>Chisq)"]][2], 
                                pvalue_mult = NA)
              colnames(res) <- paste(e, colnames(res), sep='_')
              return(res)
            })))
    }))
  }
  
  #### multiple correction for the pvalue
  for(col in colnames(results_tableau)[which(regexpr('_mult', colnames(results_tableau)) != (-1))]){
    results_tableau[, col] <- p.adjust(results_tableau[, gsub('_mult', '', col), ], 'BH', nrow(results_tableau))
  }
  ############ format the result tabel
  results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] <- apply(results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] , 2, function(x) format_pvalue(x))
  
  return(results_tableau)
}


######################
########### Post Hoc analysis function with interaction
########################

postHOC_GR_ageGR <- function(data, ROIs, model = 'fixe', pvalue_GR, pvalue_ageGR, correction = 'BH', arrondi = 4){
  
  results <- do.call('rbind', lapply(ROIs, function(var){
    if(model == 'fixe'){
      if(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1)){
        model_complet <- lm(as.formula(paste(var, ' - 1 + group + age_bl + sex + group:age_bl', sep = ' ~ ')), data = data)
      }else{
        model_complet <- lm(as.formula(paste(var, ' - 1 + group + age_bl + sex', sep = ' ~ ')), data = data)
      }
    }else{
      #### fixed effect 
      if(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1)){
        model_complet <- lmer(as.formula(paste(var, ' - 1 + group + age_bl + sex + group:age_bl  + (1|prevdemals_family_code)', sep = ' ~ ')), data = data)
      }else{
        model_complet <- lmer(as.formula(paste(var, ' - 1 + group + age_bl + sex + (1|prevdemals_family_code)', sep = ' ~ ')), data = data)
      }
    }
    
    # if(regexpr('\\*', pvalue_GR[which(ROIs == var)]) != (-1) | regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1)){
    return(data.frame(var = var, 
                      GR_PS_CN_coeff = ifelse(regexpr('\\*', pvalue_GR[which(ROIs == var)]) != (-1), 
                                              ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                                                     format_arrondi(summary(glht(model_complet, linfct = matrix(c(-1, 1, 0, 0, 0, 0, 0), 1)))$test$coeff[1], arrondi), 
                                                     format_arrondi(summary(glht(model_complet, linfct = matrix(c(-1, 1, 0, 0, 0), 1)))$test$coeff[1], arrondi)),
                                              NA), 
                      GR_PS_CN_pvalue = ifelse(regexpr('\\*', pvalue_GR[which(ROIs == var)]) != (-1), 
                                               ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                                                      format_arrondi(summary(glht(model_complet, linfct = matrix(c(-1, 1, 0, 0, 0, 0, 0), 1)))$test$pvalues[1], arrondi), 
                                                      format_arrondi(summary(glht(model_complet, linfct = matrix(c(-1, 1, 0, 0, 0), 1)))$test$pvalues[1], arrondi)),
                                               NA), 
                      GR_PS_CN_pvalue_corr = NA, 
                      # GR_CN_PT = ifelse(regexpr('\\*', pvalue_GR[which(ROIs == var)]) != (-1), 
                      #                   summary(glht(model_complet, linfct = matrix(c(-1, 0, 1, 0, 0, 0, 0), 1)))$test$pvalues[1],
                      #                   NA), 
                      GR_PT_PS_coeff = ifelse(regexpr('\\*', pvalue_GR[which(ROIs == var)]) != (-1), 
                                              ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                                                     format_arrondi(summary(glht(model_complet, linfct = matrix(c(0, -1, 1, 0, 0, 0, 0), 1)))$test$coeff[1], arrondi), 
                                                     format_arrondi(summary(glht(model_complet, linfct = matrix(c(0, -1, 1, 0, 0), 1)))$test$coeff[1], arrondi)),
                                              NA),
                      GR_PT_PS_pvalue = ifelse(regexpr('\\*', pvalue_GR[which(ROIs == var)]) != (-1), 
                                               ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                                                      format_arrondi(summary(glht(model_complet, linfct = matrix(c(0, -1, 1, 0, 0, 0, 0), 1)))$test$pvalues[1], arrondi), 
                                                      format_arrondi(summary(glht(model_complet, linfct = matrix(c(0, -1, 1, 0, 0), 1)))$test$pvalues[1], arrondi)),
                                               NA),
                      GR_PT_PS_pvalue_corr = NA, 
                      ageGR_PS_CN_coeff = ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                                                 summary(glht(model_complet, linfct = matrix(c(0, 0, 0, -1, 0, 1, 0), 1)))$test$coeff[1], 
                                                 NA), 
                      ageGR_PS_CN_pvalue = ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                                                  format_arrondi(summary(glht(model_complet, linfct = matrix(c(0, 0, 0, -1, 0, 1, 0), 1)))$test$pvalues[1], arrondi), 
                                                  NA), 
                      ageGR_PS_CN_pvalue_corr = NA, 
                      # ageGR_CN_PT = ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                      #                      summary(glht(model_complet, linfct = matrix(c(0, 0, 0, -1, 0, 0, 1), 1)))$test$pvalues[1], 
                      #                      NA), 
                      ageGR_PT_PS_coef = ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                                                format_arrondi(summary(glht(model_complet, linfct = matrix(c(0, 0, 0, 0, 0, -1, 1), 1)))$test$coeff[1], arrondi), 
                                                NA),
                      ageGR_PT_PS_pvalue = ifelse(regexpr('\\*', pvalue_ageGR[which(ROIs == var)]) != (-1), 
                                                  summary(glht(model_complet, linfct = matrix(c(0, 0, 0, 0, 0, -1, 1), 1)))$test$pvalues[1], 
                                                  NA), 
                      ageGR_PT_PS_pvalue_corr = NA))
    # }
  }))
  
  results[, c('GR_PS_CN_pvalue', 'GR_PT_PS_pvalue')] <- t(apply(results[, c('GR_PS_CN_pvalue', 'GR_PT_PS_pvalue')], 1, function(x) p.adjust(x, 'holm', 2)))
  results[, c('ageGR_PS_CN_coeff', 'ageGR_PT_PS_pvalue')] <- t(apply(results[, c('ageGR_PS_CN_coeff', 'ageGR_PT_PS_pvalue')], 1, function(x) p.adjust(x, 'holm', 2)))
  results[, c('GR_PS_CN_pvalue_corr', 'GR_PT_PS_pvalue_corr', 'ageGR_PS_CN_pvalue_corr', 'ageGR_PT_PS_pvalue_corr')] <- apply(results[, c('GR_PS_CN_pvalue', 'GR_PT_PS_pvalue', 'ageGR_PS_CN_pvalue', 'ageGR_PT_PS_pvalue')], 2, function(x) p.adjust(as.numeric(x), correction, sum(!is.na(x))))
  
  results[, which(regexpr('pvalue', colnames(results)) != (-1))] <- apply( results[, which(regexpr('pvalue', colnames(results)) != (-1))], 2, function(x) format_pvalue(as.numeric(x)))
  
  
  return(results)
}

###############################
########### model validation
###################################
data_validation <- function(data, ROIs, model = 'fixe'){
  if(model == 'fixe'){
    return(do.call('rbind', mclapply(ROIs, function(var){
      model <- lm(as.formula(paste(var, ' ~ age_bl + sex + group + age_bl*group', sep = '')), data = data)
      return(data.frame(var = var, 
                        id = data$participant_id,
                        y = data[,var], 
                        y_hat = fitted(model), 
                        res = residuals(model), 
                        res_studentise = studres(model), 
                        levier = hatvalues(model)) ## In statistics and in particular in regression analysis, leverage is a measure of how far away the independent variable values of an observation are from those of the other observations.
      )
    }, mc.cores = 3)))
  }else{
    return(do.call('rbind', mclapply(ROIs, function(var){
      model <- lmer(as.formula(paste(var, ' ~ age_bl + sex + group + age_bl*group + (1|prevdemals_family_code)', sep = '')), data = data)
      return(data.frame(var = var, 
                        id = data$participant_id,
                        y = data[,var], 
                        y_hat = fitted(model), 
                        res = residuals(model), 
                        levier = hatvalues(model))
      )
    }, mc.cores = 3)))
  }
}

################
###### Data validation table creation
###############
random_effect_inspect <- function(data, ROIs, arrondi = 3){
  return(do.call('rbind', mclapply(ROIs, function(var){
    model <- lmer(as.formula(paste(var, ' ~ age_bl + sex + group + age_bl*group + (1|prevdemals_family_code)', sep = '')), data = data)
    return(cbind(data.frame(var = var), 
                 setNames(data.frame(t(data.frame(res = format_arrondi(ranef(model)$prevdemals_family_code[, 1], arrondi)))), rownames(ranef(model)$prevdemals_family_code)), 
                 data.frame(var_alea = format_arrondi(attr(VarCorr(model)$prevdemals_family_code, "stddev")[1], arrondi), ### Get the stand deviatin of the random effect
                            var_res = format_arrondi(summary(model)$sigma, arrondi), 
                            prop_var_tot = format_arrondi(attr(VarCorr(model)$prevdemals_family_code, "stddev")[1]/(summary(model)$sigma+attr(VarCorr(model)$prevdemals_family_code, "stddev")[1]), arrondi))))
  }, mc.cores = 3)))
}

#################
############ heterogeneity check function for fixed effect linear model
#############################
inspect_heterogeneite_fixed <- function(data, ROIs){
  return(do.call('rbind', mclapply(ROIs, function(var){
    model <- lm(as.formula(paste(var, ' ~ age_bl + sex + group + age_bl*group', sep = '')), data = data)
    return(data.frame(var = gsub('.TIV', '', gsub('Right', 'R', gsub('Left', 'L', var))), 
                      bp = format_pvalue(bptest(model)$p.value),
                      white = format_pvalue(bptest(model, ~ age_bl*sex + I(age_bl^2) + age_bl*group*sex, data = data_analyse)$p.value))
    )
  }, mc.cores = 3)))
}

###########################
#####  Group comparison
############################

multiple_compare_groupe <- function(data, var_group = 'group', vars = setdiff(colnames(data_analyse), c('group', 'participant_id')), factorVars = c('sex'), test = 'wilcox', arrondi = 2, indicateur = 'median', format_pvalue = T, correct = 'bonferroni'){
  #  ##### When you wanna use this to compare two groups, you can just use this
  # do.call constructs and exectues a function call.
  #mclapply is a parallelized version of lapply mclapply(x, FUN...)
  # if the var in vars is in factorVars, and if it is sex, gointo first if, else gointo else, and if not in factorVars, gointo second else
  # for sex and prevdemals_family_code, is to find the correlation between the var and the groups, if they are independent 
  
  ###### Choose between non-parametric(wilcox(permutation test)) and parametric(t-test):
  ###Note: One principled response to this is "safety first": with no way to reliably verify the normality assumption in a small sample, stick to non-parametric methods
  ######
  groupe <- sort(unique(as.character(data[, var_group])))
  
  table <- do.call('rbind', mclapply(vars, function(var){
    if(var %in% factorVars){
      if(length(unique(data[, var])) == 2){
        res <- do.call('cbind', lapply(groupe, function(gr){
          data.frame(groupe = paste(sum(subset(data, get(var_group) == gr)[,var] == sort(unique(data[, var]))[1]), ' (', 
                                    format_arrondi(100*sum(subset(data, get(var_group) == gr)[,var] == sort(unique(data[, var]))[1])/
                                                     sum(!is.na(subset(data, get(var_group) == gr)[,var]))), '%)', sep= ''))
        }))
        colnames(res) <- groupe
        return(cbind(data.frame(var = paste(var, sort(unique(data[, var]))[1], sep=' = ')), 
                     res,
                     data.frame(pvalue = chisq.test(data[, var_group], data[, var])$p.value)))
      }else{
        res <- do.call('rbind', lapply(sort(unique(data[, var])), function(mod){
          cbind(data.frame(var = paste(var, mod, sep=' = ')), 
                do.call('cbind', lapply(groupe, function(gr){
                  data.frame(groupe = paste(sum(subset(data, get(var_group) == gr)[,var] == mod), ' (', 
                                            format_arrondi(100*sum(subset(data, get(var_group) == gr)[,var] == mod)/
                                                             sum(!is.na(subset(data, get(var_group) == gr)[,var]))), '%)', sep= ''))
                })))
        }))
        colnames(res) <- c('var', groupe)
        return(cbind(res,
                     data.frame(pvalue = c(chisq.test(data[, var_group], data[, var])$p.value, rep(NA, length(unique(data[, var]))-1)))))
      }
    }else{
      res <- do.call('cbind', lapply(groupe, function(gr){
        data.frame(groupe = ifelse(indicateur == 'median', 
                                   paste(format_arrondi(median(subset(data, get(var_group) == gr)[,var])), 
                                         ' [', format_arrondi(quantile(subset(data, get(var_group) == gr)[,var], 0.25)), ',',
                                         format_arrondi(quantile(subset(data, get(var_group) == gr)[,var], 0.75)), ']', sep=''), 
                                   paste(format_arrondi(mean(subset(data, get(var_group) == gr)[,var])), 
                                         ' (', format_arrondi(sd(subset(data, get(var_group) == gr)[,var])), ')', sep='')))
      }))
      colnames(res) <- groupe
      
      return(cbind(data.frame(var = var), 
                   res, 
                   pvalue = ifelse(length(groupe) == 2, 
                                   ifelse(test == 'wilcox',
                                          wilcox.test(data[,var] ~ data[,var_group])$p.value, 
                                          t.test(data[,var] ~ data[,var_group], var.equal = ifelse(var.test(data[,var] ~ data[,var_group])$p.value<0.05, F, T))$p.value), 
                                   ifelse(test == 'wilcox', 
                                          kruskal.test(data[,var] ~ factor(data[,var_group]))$p.value, 
                                          anova(lm(data[,var] ~ factor(data[,var_group])))[["Pr(>F)"]][1]))))
    }
  }, mc.cores = 3))
  
  res <- do.call('cbind', lapply(groupe, function(gr){
    data.frame(groupe = paste(nrow(subset(data, get(var_group) == gr)), ' (', 
                              format_arrondi(100*nrow(subset(data, get(var_group) == gr))/sum(!is.na(data[, var_group]))), '%)', sep= ''))
  }))
  colnames(res) <- groupe
  table <- rbind(cbind(data.frame(var = 'N'), 
                       res, 
                       data.frame(pvalue = NA)), 
                 table)
  
  table$pvalue_multiple <- p.adjust(as.numeric(as.character(table$pvalue)), correct, sum(!is.na(table$pvalue)))
  if(format_pvalue) table[, c('pvalue', 'pvalue_multiple')] <- apply(table[, c('pvalue', 'pvalue_multiple')] , 2, function(x) format_pvalue(pvalue = as.numeric(as.character(x))))
  
  return(table)
}

#######################################################################
####### This is to plot the regression plot of ROI
#######################################################################

plot_regreesion_scatter <- function(data, ROI, subtitle, xlabel="", ylabel="", legend_postion="right", y_axis_limits=NULL, y_label_format=NULL){

  plot_data <- data[, c('participant_id', 'group', 'sex', "age_bl", ROI)]
  nom_ROIs_volume_TIV_sig_newname = c('participant_id', 'group', 'sex', "age_bl", subtitle)
  colnames(plot_data) <- nom_ROIs_volume_TIV_sig_newname
  ggplot_img <- ggplot(melt(plot_data, id.vars = c('participant_id', 'group', 'sex', "age_bl")),
                                    aes(x = age_bl, y = value, color=group)) + geom_point(size=3) +
    geom_smooth(data = melt(plot_data, id.vars = c('participant_id', 'group', 'sex', "age_bl")),
                aes(x = age_bl, y = value, color=group), method = "lm", se = FALSE)  + facet_wrap(~variable, scales="free") + scale_colour_manual(labels = c("C9-", "C9+"), values=c("blue", "deeppink4")) + theme_bw() +
    theme(axis.line.x = element_line(colour = "black", size=1.5),
          axis.line.y = element_line(color="black", size=1.5),
          axis.text=element_text(size=20, colour="black", face="bold"),
          axis.title=element_text(size=20, colour="black", face="bold"),
          axis.ticks = element_line(colour = "black", size=1.5, linetype = 'longdash'),
          axis.ticks.length=unit(0.3,"cm"),
          axis.ticks.x = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank(),
          strip.background = element_blank(),
          legend.position = legend_postion,
          legend.title = element_text(size=20, colour="black", face="bold"),
          legend.text = element_text(size=20, colour="black", face="bold"),
          strip.text = element_text(size=24, colour="black", face="bold", vjust=3)) + scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(labels = y_label_format, expand = c(0, 0)) + labs(x = xlabel, y = ylabel, color = "Group", face="bold") + coord_cartesian(xlim=c(20, 81), ylim=y_axis_limits)


  return(ggplot_img)
}


# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot_ggplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)
  
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  
  numPlots = length(plots)
  
  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  
  if (numPlots==1) {
    print(plots[[1]])
    
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    
    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
anbai106/NeuroStatisticR documentation built on May 27, 2020, 2:20 a.m.