dataPrep <- function(data_src, roi_start, roi_end, sep='\t')
{
### This function is to do the required data preprocessing from the targeted tsv file
data_src <- read.table(data_src, sep = sep, header = T, dec = ',')
### checkout dimension
data_src_dim <- dim(data_src)
cat("The source data tsv dimension is: ")
print(data_src_dim)
cat('\n')
### make sure the numerical data_src to be numerical
data_src[, setdiff(colnames(data_src), c("participant_id", "session_id", "group"))] <- apply(data_src[, setdiff(colnames(data_src), c("participant_id", "session_id", "group"))], 2, function(x)as.numeric(as.character(x))) # 2 means columns to run
## Make sure all the categorical factor to be character
data_src[, c("participant_id", 'session_id', "group")] <- apply(data_src[, c("participant_id", 'session_id', "group")], 2, as.character)
###############
#########Make sure there is no NA in the data
#################
NA_res <- apply(data_src, 2, function(x) sum(is.na(x)))# 0 OK
cat("Note, important, check out the number of every column, make sure it is 0: ")
print(NA_res)
cat('\n')
nom_ROIs_volume <- setdiff(colnames(data_src), c("participant_id", 'session_id', "group", "age", "sex", "disease_duration", "cpz", "age_disease_onset", "panss_pos", "panss_neg", "education"))
return(list(data_src, nom_ROIs_volume))
}
###########################
##### Group comparison
############################
multiple_compare_groupe <- function(data, var_group = 'group', vars = setdiff(colnames(data_analyse), c('group', 'participant_id', "session_id")), factorVars = c('icv'), test = 'wilcox', arrondi = 2, indicateur = 'median', format_pvalue = T, correct = 'bonferroni'){
# ##### When you wanna use this to compare two groups, you can just use this
# do.call constructs and exectues a function call.
#mclapply is a parallelized version of lapply mclapply(x, FUN...)
# if the var in vars is in factorVars, and if it is sex, gointo first if, else gointo else, and if not in factorVars, gointo second else
# for sex, is to find the correlation between the var and the groups, if they are independent
###### Choose between non-parametric(wilcox) and parametric(t-test):
###Note: One principled response to this is "safety first": with no way to reliably verify the normality assumption in a small sample, stick to non-parametric methods
######
groupe <- sort(unique(as.character(data[, var_group])))
table <- do.call('rbind', mclapply(vars, function(var){
if(var %in% factorVars){
#### This is to consider just sex
if(length(unique(data[, var])) == 2){
### This is for sex
res <- do.call('cbind', lapply(groupe, function(gr){
data.frame(groupe = paste(sum(subset(data, get(var_group) == gr)[,var] == sort(unique(data[, var]))[1]), ' (',
format_arrondi(100*sum(subset(data, get(var_group) == gr)[,var] == sort(unique(data[, var]))[1])/
sum(!is.na(subset(data, get(var_group) == gr)[,var]))), '%)', sep= ''))
}))
colnames(res) <- groupe
return(cbind(data.frame(var = paste(var, sort(unique(data[, var]))[1], sep=' = ')),
res,
data.frame(pvalue = chisq.test(data[, var_group], data[, var])$p.value)))
}else{
### This is to do the real multiple comparison for the diff ROIs
res <- do.call('rbind', lapply(sort(unique(data[, var])), function(mod){
cbind(data.frame(var = paste(var, mod, sep=' = ')),
do.call('cbind', lapply(groupe, function(gr){
data.frame(groupe = paste(sum(subset(data, get(var_group) == gr)[,var] == mod), ' (',
format_arrondi(100*sum(subset(data, get(var_group) == gr)[,var] == mod)/
sum(!is.na(subset(data, get(var_group) == gr)[,var]))), '%)', sep= ''))
})))
}))
colnames(res) <- c('var', groupe)
return(cbind(res,
data.frame(pvalue = c(chisq.test(data[, var_group], data[, var])$p.value, rep(NA, length(unique(data[, var]))-1)))))
}
}else{
res <- do.call('cbind', lapply(groupe, function(gr){
data.frame(groupe = ifelse(indicateur == 'median',
paste(format_arrondi(median(subset(data, get(var_group) == gr)[,var])),
' [', format_arrondi(quantile(subset(data, get(var_group) == gr)[,var], 0.25)), ',',
format_arrondi(quantile(subset(data, get(var_group) == gr)[,var], 0.75)), ']', sep=''),
paste(format_arrondi(mean(subset(data, get(var_group) == gr)[,var])),
' (', format_arrondi(sd(subset(data, get(var_group) == gr)[,var])), ')', sep='')))
}))
colnames(res) <- groupe
return(cbind(data.frame(var = var),
res,
pvalue = ifelse(length(groupe) == 2,
ifelse(test == 'wilcox',
wilcox.test(data[,var] ~ data[,var_group])$p.value,
t.test(data[,var] ~ data[,var_group], var.equal = ifelse(var.test(data[,var] ~ data[,var_group])$p.value<0.05, F, T))$p.value),
ifelse(test == 'wilcox',
kruskal.test(data[,var] ~ factor(data[,var_group]))$p.value,
anova(lm(data[,var] ~ factor(data[,var_group])))[["Pr(>F)"]][1]))))
}
}, mc.cores = 3))
########## To get the group info
res <- do.call('cbind', lapply(groupe, function(gr){
data.frame(groupe = paste(nrow(subset(data, get(var_group) == gr)), ' (',
format_arrondi(100*nrow(subset(data, get(var_group) == gr))/sum(!is.na(data[, var_group]))), '%)', sep= ''))
}))
colnames(res) <- groupe
############
########## concatenate the res and table
table <- rbind(cbind(data.frame(var = 'N'),
res,
data.frame(pvalue = NA)),
table)
table$pvalue_multiple <- p.adjust(as.numeric(as.character(table$pvalue)), correct, sum(!is.na(table$pvalue)))
if(format_pvalue) table[, c('pvalue', 'pvalue_multiple')] <- apply(table[, c('pvalue', 'pvalue_multiple')] , 2, function(x) format_pvalue(pvalue = as.numeric(as.character(x))))
return(table)
}
##########################################################################
############## General linear model for correlation
##########################################################################
general_linear_model_correlation <- function(data, predictor, model_independent_var, ROIs, model = 'fixe', arrondi = 10, correction = 'BH'){
if(model == 'fixe'){
results_tableau <- do.call('rbind', lapply(ROIs, function(var){
model_complet <- lm(as.formula(paste(var, paste(model_independent_var, collapse = ' + '), sep = ' ~ ')), data = data)
results <- cbind(data.frame(var),
do.call('cbind', lapply(predictor, function(e){
res <- data.frame(slope_estimate = summary(model_complet)$coefficients[4, 1],
slope_std = summary(model_complet)$coefficients[4, 2],
pvalue = summary(model_complet)$coefficients[4, 4],
pvalue_mult = NA,
pearson_r = cor(data.matrix(data[,c(e)]), data.matrix(data[,c(var)]), method = "pearson"),
residual_normality = ols_test_correlation(model_complet))
colnames(res) <- paste(e, colnames(res), sep='_')
return(res)
})))
}))
}else{
##### mixed Model TODO
}
#### multiple correction for the pvalue
for(col in colnames(results_tableau)[which(regexpr('_mult', colnames(results_tableau)) != (-1))]){
results_tableau[, col] <- p.adjust(results_tableau[, gsub('_mult', '', col), ], 'BH', nrow(results_tableau))
}
############ format the result tabel
results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] <- apply(results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] , 2, function(x) format_pvalue(x))
return(results_tableau)
}
##########################################################################
############## General linear model without interaction between age and group
##########################################################################
general_linear_model_without_interaction <- function(data, effets, ROIs, model = 'fixe', arrondi = 10, correction = 'BH'){
if(model == 'fixe'){
results_tableau <- do.call('rbind', lapply(ROIs, function(var){
model_complet <- lm(as.formula(paste(var, paste(effets, collapse = ' + '), sep = ' ~ ')), data = data)
results <- cbind(data.frame(var = gsub('_', '.', gsub('.cortex', '', gsub('_volume', '', gsub('.TIV', '', gsub('Left', 'L', gsub('Right', 'R', var))))))),
do.call('cbind', lapply(effets, function(e){
### This here is to define the full model and reduced model, here, really just consider the interaction's effect when creating the reduced model
model_low <- lm(as.formula(paste(var, paste(setdiff(effets, effets[which(regexpr(e, effets, fixed = T) != (-1))]), collapse = ' + '), sep = ' ~ ')), data = data)
coef <- format_arrondi(coef(model_complet)[which(regexpr(e, names(coef(model_complet))) != (-1))], arrondi)
res <- data.frame(data.frame(t(coef)),
f2cohen = format_arrondi(cohenf2_formula_fixed_model(model_complet, model_low), 2),
pvalue = anova(model_complet, model_low, test='LRT')[["Pr(>Chi)"]][2],
# pvalueWALDconsist = waldtest(model_low, model_complet, vcov = vcov_consist)$"Pr(>F)"[2],
pvalue_mult = NA)
colnames(res) <- paste(e, colnames(res), sep='_')
return(res)
})))
}))
}else{
##### mixed Model
results_tableau <- do.call('rbind', lapply(ROIs, function(var){
model_complet <- lmer(as.formula(paste(var, ' ~ ', paste(effets, collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
cbind(data.frame(var = gsub('_', '.', gsub('.cortex', '', gsub('_volume', '', gsub('.TIV', '', gsub('Left', 'L', gsub('Right', 'R', var))))))),
do.call('cbind', lapply(effets, function(e){
### This here is to define the full model and reduced model, here, really just consider the interaction's effect when creating the reduced model
if(any(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1)) & regexpr(':', e) == (-1)) model_complet <- lmer(as.formula(paste(var, ' ~ ', paste(setdiff(effets, effets[which(regexpr(e, effets) != (-1) & regexpr(':', effets) != (-1))]), collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
model_low <- lmer(as.formula(paste(var, ' ~ ', paste(setdiff(effets, effets[which(regexpr(e, effets, fixed = T) != (-1))]), collapse = ' + '), ' + (1|prevdemals_family_code)', sep = '')), data = data)
if(regexpr(':', e) != (-1)){ coef <- format_arrondi(fixef(model_complet)[which(regexpr(e, names(fixef(model_complet))) != (-1))], arrondi)}else{coef <- format_arrondi(fixef(model_complet)[which(regexpr(e, names(fixef(model_complet))) != (-1) & regexpr(':', names(fixef(model_complet))) == (-1))], arrondi)}
res <- data.frame(data.frame(t(coef)),
f2cohen = format_arrondi(cohenf2_formula_mixed_model(model_complet, model_low), 2),
pvalue = anova(model_complet, model_low, test='LRT')[["Pr(>Chisq)"]][2],
pvalue_mult = NA)
colnames(res) <- paste(e, colnames(res), sep='_')
return(res)
})))
}))
}
#### multiple correction for the pvalue
for(col in colnames(results_tableau)[which(regexpr('_mult', colnames(results_tableau)) != (-1))]){
results_tableau[, col] <- p.adjust(results_tableau[, gsub('_mult', '', col), ], 'BH', nrow(results_tableau))
}
############ format the result tabel
results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] <- apply(results_tableau[, which(regexpr('pvalue', colnames(results_tableau)) != (-1))] , 2, function(x) format_pvalue(x))
return(results_tableau)
}
# Create a function to format the pvalue for the following result
format_pvalue <- function(pvalue, n.arrondi = 3) gsub(' ', '', ifelse(pvalue < 0.05, paste(format(ifelse(round(pvalue, n.arrondi) == 0, '<0.001', round(pvalue, n.arrondi)), nsmall=n.arrondi), '*', sep=''), format(round(pvalue, n.arrondi), nsmall=n.arrondi)))
cohenf2_formula_fixed_model <- function(model_complet, model_emboite){
# According to Cohen???s (1988) guidelines, f2f2??? 0.02, f2f2??? 0.15, and f2f2 ??? 0.35 represent small, medium, and large effect sizes, respectively.
return( (summary(model_complet)$r.squared-summary(model_emboite)$r.squared)/(1-summary(model_complet)$r.squared))
}
cohenf2_formula_mixed_model <- function(model_complet, model_emboite){
# According to Cohen???s (1988) guidelines, f2f2??? 0.02, f2f2??? 0.15, and f2f2 ??? 0.35 represent small, medium, and large effect sizes, respectively.
return((r.squaredGLMM(model_complet)[1]-r.squaredGLMM(model_emboite)[1])/(1- r.squaredGLMM(model_complet)[1]))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.