NeuroStatisticR:

Documented in data_prep

data_prep <- function(input_tsv, roi_start, roi_end, sep='\t')
{
  ### This function is to do the required data preprocessing from the targeted tsv file for dti statistics in prevdemals.
  ## Param: 
  # input_tsv: str, the path to the tsv file, the tsv file's first 8 columns should be 'participant_id', 'session_id', 'prevdemals_family_code', 'group', 'age_bl', 'sex', and later columns should be the ROIs
  # roi_start
  #NOTE, should be careful with the seperator of the tsv file and xlsx file, also the decimal in the tsv file.
  input_tsv <- read.table(input_tsv, sep = sep, header = T, dec = ',')
  
  
  #### Check out the two files contens are consitent 
  ses_info <- table(input_tsv$session_id)
  cat("The session info comparison: ", ses_info, '\n')
  print(ses_info)
  cat('\n')
  group_info <- table(input_tsv$group)
  cat("The group info comparison: ")
  print(group_info)
  cat('\n')
  age_info <- summary(as.numeric(as.character(input_tsv$age_bl)))
  cat("The age_bl info: ")
  print(age_info)
  cat('\n')
  summary(as.numeric(as.character(input_tsv$expected_years_aoo_bl)))
  sex_info <- table(input_tsv$sex)
  cat("The sex info: ")
  print(sex_info)
  cat('\n')
  
  ### checkout dimension
  input_tsv_dim <- dim(input_tsv)
  cat("The source data tsv dimension is: ")
  print(input_tsv_dim)
  cat('\n')
  
  ## simplifier les noms de participant_id
  input_tsv$participant_id <- gsub('sub-PREVDEMALS', '', input_tsv$participant_id)
  length(unique( input_tsv$participant_id))
  length(unique(substr(input_tsv$participant_id, 6, 9)))
  input_tsv$participant_id <- substr(input_tsv$participant_id, 6, 9)
  
  ### make sure the numerical input_tsv to be numerical  
  input_tsv[, setdiff(colnames(input_tsv), c("participant_id", 'prevdemals_family_code', "session_id", "group", "sex"))] <- apply(input_tsv[, setdiff(colnames(input_tsv), c("participant_id", 'prevdemals_family_code', "session_id", "group", "sex"))], 2, function(x)as.numeric(as.character(x))) # 2 means columns to run 
  
  ## Suppression des colonnes non utiles
  data_analyse <- input_tsv[, c(which(colnames(input_tsv) %in% c('participant_id', 'prevdemals_family_code', 'group', 'age_bl', 'sex')), match(roi_start,  colnames(input_tsv)):match(roi_end, colnames(input_tsv)))]
  
  ## Make sure all the categorical factor to be character
  data_analyse[, c("participant_id", 'prevdemals_family_code', "group", "sex")] <- apply(data_analyse[, c("participant_id", 'prevdemals_family_code', "group", "sex")], 2, as.character)
  ###############
  #########Make sure there is no NA in the data
  #################
  NA_res <- apply(data_analyse, 2, function(x) sum(is.na(x)))# 0 OK
  cat("Note, important, check out the number of every column, make sure it is 0: ")
  print(NA_res)
  cat('\n')
  
  nom_ROIs_volume <- setdiff(colnames(data_analyse), c("participant_id", 'prevdemals_family_code', "group", "age_bl", "sex"))
  
  ###Redefine the data_analyses
  data_analyse <- data_analyse[, c(c("participant_id", 'prevdemals_family_code', "group", "age_bl", "sex"), nom_ROIs_volume)]

  ###Compute the mean and std of the CN, PS, PT.
  data_analyse_CN <- data_analyse[which(data_analyse$group %in% c('CN')), ]
  data_analyse_PS <- data_analyse[which(data_analyse$group %in% c('PS')), ]
  data_analyse_PT <- data_analyse[which(data_analyse$group %in% c('PT')), ]
  
  data_analyse_CN_age_mean <- mean(data_analyse_CN[, c("age_bl")])
  data_analyse_CN_age_std <- sd(data_analyse_CN[, c("age_bl")])
  cat("The mean and stand deviation of CN: ")
  print(data_analyse_CN_age_mean)
  print(data_analyse_CN_age_std)
  cat('\n')
  sex_info_CN <- table(data_analyse_CN$sex)
  cat("The sex info for CN: ")
  print(sex_info_CN)
  cat('\n')
  
  
  data_analyse_PS_age_mean <- mean(data_analyse_PS[, c("age_bl")])
  data_analyse_PS_age_std <- sd(data_analyse_PS[, c("age_bl")])
  cat("The mean and stand deviation of PS: ")
  print(data_analyse_PS_age_mean)
  print(data_analyse_PS_age_std)
  cat('\n')
  sex_info_PS <- table(data_analyse_PS$sex)
  cat("The sex info for PS: ")
  print(sex_info_PS)
  cat('\n')
  
  data_analyse_PT_age_mean <- mean(data_analyse_PT[, c("age_bl")])
  data_analyse_PT_age_std <- sd(data_analyse_PT[, c("age_bl")])
  cat("The mean and stand deviation of PT: ")
  print(data_analyse_PT_age_mean)
  print(data_analyse_PT_age_std)
  cat('\n')
  sex_info_PT <- table(data_analyse_PT$sex)
  cat("The sex info for PT: ")
  print(sex_info_PT)
  cat('\n')
  
  return(list(data_analyse, nom_ROIs_volume))
}


general_linear_model_without_interaction_for_es <- function(data, effets, ROIs, model = 'fixe', arrondi = 10, correction = 'BH'){
  if(model == 'fixe'){
    results_tableau <- do.call('rbind', lapply(ROIs, function(var){
      model_complet <- lm(as.formula(paste(var, paste(effets, collapse = ' + '), sep = ' ~ ')), data = data)
      
      results <- cbind(data.frame(var = gsub('_', '.', gsub('.cortex', '', gsub('_volume', '', gsub('.TIV', '', gsub('Left', 'L', gsub('Right', 'R',  var))))))), 
                       do.call('cbind', lapply(effets, function(e){
                         ### This here is to define the full model and reduced model, here, really just consider the interaction's effect when creating the reduced model
                         model_low <- lm(as.formula(paste(var, paste(setdiff(effets, effets[which(regexpr(e, effets, fixed = T) != (-1))]), collapse = ' + '), sep = ' ~ ')), data = data)
                         coef <- format_arrondi(coef(model_complet)[which(regexpr(e, names(coef(model_complet))) != (-1))], arrondi)
                         
                         res <- data.frame(data.frame(t(coef)),
                                           f2cohen = format_arrondi(cohenD_fixed_model(model_complet, model_low), 2), 
                                           pvalue =  anova(model_complet, model_low, test='LRT')[["Pr(>Chi)"]][2], 
                                           # pvalueWALDconsist = waldtest(model_low, model_complet, vcov = vcov_consist)$"Pr(>F)"[2],
                                           pvalue_mult = NA)
                         colnames(res) <- paste(e, colnames(res), sep='_')
                         return(res)
                       })))
    }))
  }else{
    ##### mixed Model 
    results_tableau <- do.call('rbind', lapply(ROIs, function(var){
      model_complet <- lmer(as.formula(paste(var, ' ~ ', paste(effets, collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
      
      cbind(data.frame(var = gsub('_', '.', gsub('.cortex', '', gsub('_volume', '', gsub('.TIV', '', gsub('Left', 'L', gsub('Right', 'R',  var))))))), 
            do.call('cbind', lapply(effets, function(e){
              ### This here is to define the full model and reduced model, here, really just consider the interaction's effect when creating the reduced model
              ### This is actually for model construction with interaction.
              if(any(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1)) & regexpr(':', e) == (-1))
                model_complet <- lmer(as.formula(paste(var, ' ~ ', paste(setdiff(effets, effets[which(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1))]), collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
              ## This is the reduced_model without the var
              model_low <- lmer(as.formula(paste(var, ' ~ ', paste(setdiff(effets, effets[which(regexpr(e, effets, fixed = T) != (-1))]), collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
              ### This is for interaction
              if(regexpr(':', e) != (-1)){ coef <- format_arrondi(fixef(model_complet)[which(regexpr(e, names(fixef(model_complet))) != (-1))], arrondi)}
              ## This is for without interaction
              else{coef <- format_arrondi(fixef(model_complet)[which(regexpr(e, names(fixef(model_complet))) != (-1) & regexpr(':', names(fixef(model_complet))) == (-1))], arrondi)}
              res <- data.frame(data.frame(t(coef)),
                                f2cohenCondition = format_arrondi(cohenf2_mixed_model(model_complet, model_low, 2), 2),
                                f2cohenMarginal = format_arrondi(cohenf2_mixed_model(model_complet, model_low, 1), 2),
                                R2CompletCondition = r.squaredGLMM(model_complet)[2],
                                R2LowContdition = r.squaredGLMM(model_low)[2],
                                R2CompletMarginal = r.squaredGLMM(model_complet)[1],
                                R2LowMarginal = r.squaredGLMM(model_low)[1],
                                pvalue =  anova(model_complet, model_low, test='LRT')[["Pr(>Chisq)"]][2], 
                                pvalue_mult = NA
                                )
              colnames(res) <- paste(e, colnames(res), sep='_')
              return(res)
            })))
    }))
  }
  
  #### multiple correction for the pvalue
  #for(col in colnames(results_tableau)[which(regexpr('_mult', colnames(results_tableau)) != (-1))]){
  #  results_tableau[, col] <- p.adjust(results_tableau[, gsub('_mult', '', col), ], 'BH', nrow(results_tableau))
  #}
  ############ format the result tabel
  #results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] <- apply(results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] , 2, function(x) format_pvalue(x))
  
  return(results_tableau)
}

# Create a function to round the numerical data
format_arrondi <- function(x, arrondi = 2)format(round(x, arrondi), nsmall=arrondi, scientific=TRUE)

# According to Cohen’s (1988) guidelines, f2f2≥ 0.02, f2f2≥ 0.15, and f2f2 ≥ 0.35 represent small, medium, and large effect sizes, respectively.
cohenf2_mixed_model <- function(model_complet, model_emboite, r_type){
  #### r_type 1 represents the marginal r2, 2 is the conditional r2
  cohenf = (r.squaredGLMM(model_complet)[r_type]-r.squaredGLMM(model_emboite)[r_type])/(1- r.squaredGLMM(model_complet)[r_type])
  return(cohenf[1])
}

f.format_pvalue <- function(pvalue, n.arrondi = 3, alpha = 0.05){
  gsub(' ', '', ifelse(pvalue < alpha, 
                       paste(ifelse(round(pvalue, n.arrondi) == 0, 
                                    paste('<0.', paste(rep('0', n.arrondi-1), collapse=''),  '1', sep= ''), 
                                    f.format_arrondi(pvalue, n.arrondi)), '*', sep=''),  
                       f.format_arrondi(pvalue, n.arrondi)))
}