###### Function to identifiy studies significantly affecting trait of interest - multivariable MR ######
# #' Identify studies for prior using multivariable MR
# #'
# #' From a pruned Z-matrix of strong MR instruments, performs multivariable MR
# #'
# #' @inheritParams bGWAS
# NOT EXPORTED
identify_studiesMR <- function(ZMatrix, MR_shrinkage, MR_threshold, stepwise_threshold, Z_matrices, save_files=FALSE, verbose=FALSE){
Log = c()
if(save_files) Files_Info = readr::read_tsv("PriorGWASs.tsv", progress = FALSE, col_types = readr::cols())
tmp = paste0("#Preparation of the MR analyses to identify significant studies... \n")
Log = update_log(Log, tmp, verbose)
names(ZMatrix) %>%
as.data.frame(stringsAsFactors = FALSE) %>%
filter(!.data$. %in% c('rs','chrm','pos','alt','ref')) %>%
pull() -> All_study_names
All_study_names %>%
as.data.frame(stringsAsFactors = FALSE) %>%
slice(-length(.data$.)) %>%
pull() -> Prior_study_names
tmp = paste0("Conventionnal GWAS of interest : ", All_study_names[length(All_study_names)], "\n")
Log = update_log(Log, tmp, verbose)
# Function to automatically generate the formula for linear model
generate_Formula <- function(outcome, study_names, with_intercept=F ) {
formula = ifelse(with_intercept,
paste(outcome, ' ~ 1 + ',paste(collapse=' + ', paste(sep='','`',study_names,'`'))),
paste(outcome, ' ~ -1 + ',paste(collapse=' + ', paste(sep='','`',study_names,'`'))))
return(formula)
}
# Function to automatically get subsetted matrix of instruments
get_Instruments <- function(Zmat, Zlim){
# get exposures
Zmat %>%
names() %>%
as.data.frame(stringsAsFactors = FALSE) %>%
slice(6:(ncol(Zmat)-1)) %>%
pull() -> exposures
# subset : any row, with abs(exposure)>Zlimit
Zmat %>%
filter_at(vars(exposures), any_vars(abs(.data$.)>Zlim)) -> Zmat
return(Zmat)
}
# Compute and save the univariable regressions (used to check directionnality in multivariable regression):
tmp = paste0("# Univariable regressions for each trait... \n")
Log = update_log(Log, tmp, verbose)
tmp= " Number of trait-specific instruments per univariable regression: \n"
Log = update_log(Log, tmp, verbose)
Zlimit = stats::qnorm(MR_threshold/2, lower.tail = F)
all_uni_coefs = data.frame()
All_study_names %>%
as.data.frame(stringsAsFactors=F) %>%
slice(length(All_study_names)) %>%
pull() -> my_outcome
for(one_study in Prior_study_names) {
# create a ZMatrix_uni containing instruments only for one_study
ZMatrix %>%
filter(abs(eval(parse(text=one_study))) > Zlimit) -> ZMatrix_uni
tmp= paste0(" . ", get_names(one_study, Z_matrices), " : ", nrow(ZMatrix_uni), " \n")
Log = update_log(Log, tmp, verbose)
paste(my_outcome, ' ~ -1 + `',one_study,'`', sep='', collapse="") -> uni_form
stats::lm(data=ZMatrix_uni, formula = uni_form) -> uni_fit
uni_coefs <- data.frame(stats::coef(summary(uni_fit)))
uni_coefs %>%
tibble::rownames_to_column("nm") -> uni_coefs
uni_coefs %>%
mutate(adj.r.squared = summary(uni_fit)$adj.r.squared,
r.squared = summary(uni_fit)$r.squared) -> uni_coefs
all_uni_coefs %>%
bind_rows(uni_coefs) -> all_uni_coefs
}
all_uni_coefs %>%
set_names(c("study", "estimate", "std_error", "Tstat", "P",
"adj_Rsquared", "Rsquared")) -> all_uni_coefs
if(save_files){ # add univariable coeffs
order_inFiles = match(all_uni_coefs$study, Files_Info$File)
Files_Info$uni_estimate = NA
Files_Info$uni_estimate[order_inFiles] = pull(all_uni_coefs, .data$estimate)
Files_Info$uni_std_error = NA
Files_Info$uni_std_error[order_inFiles] = pull(all_uni_coefs, .data$std_error)
Files_Info$uni_T = NA
Files_Info$uni_T[order_inFiles] = pull(all_uni_coefs, .data$Tstat)
Files_Info$uni_P = NA
Files_Info$uni_P[order_inFiles] = pull(all_uni_coefs, .data$P)
Files_Info$uni_adj_Rsquared = NA
Files_Info$uni_adj_Rsquared[order_inFiles] = pull(all_uni_coefs, .data$adj_Rsquared)
Files_Info$uni_Rsquared = NA
Files_Info$uni_Rsquared[order_inFiles] = pull(all_uni_coefs, .data$Rsquared)
}
tmp = paste0("Done! \n")
Log = update_log(Log, tmp, verbose)
tmp = paste0("# Stepwise selection (all traits)... \n")
Log = update_log(Log, tmp, verbose)
if(is.null(stepwise_threshold)){
stepwise_threshold = 0.05/length(Prior_study_names)
tmp = paste0("The p-value threshold used for stepwise selection is ", format(round(stepwise_threshold, 4), scientific = F), " (", length(Prior_study_names), " Prior GWASs tested). \n")
Log = update_log(Log, tmp, verbose)
}
if(all(all_uni_coefs$P>stepwise_threshold)){
tmp = "No study significant - Analysis failed"
Log = update_log(Log, tmp, verbose)
if(save_files) utils::write.table(Files_Info, file="PriorGWASs.tsv", sep="\t", quote=F, row.names=F )
res=list(log_info = Log,
stop = T)
return(res)
}
# if only one study, no need for stepwise selection
if(sum(all_uni_coefs$P<=stepwise_threshold)==1){
tmp = "Only one study reaching significance, no need for stepwise selection. \n"
Log = update_log(Log, tmp, verbose)
# add the study
all_uni_coefs %>%
filter(.data$P==min(.data$P)) %>%
pull(.data$study) -> significant_studies
all_uni_coefs %>%
filter(!.data$study %in% significant_studies) %>%
pull(.data$study) -> non_significant_studies
tmp = paste0("Adding the study :", get_names(significant_studies, Z_matrices) , " \n")
Log = update_log(Log, tmp, verbose)
### UPDATE Z-MATRIX
ZMatrix %>%
select(1:5, significant_studies, ncol(ZMatrix)) %>%
get_Instruments(., Zlimit) -> ZMatrix_final
myFormula_final = generate_Formula(my_outcome, significant_studies)
stats::lm(data=ZMatrix_final, formula = myFormula_final) %>%
summary %>%
stats::coef() %>%
as.data.frame() %>% # 1st create a data.frame (to get rownames)
tibble::rownames_to_column("nm") %>% # then convert to tibble
as_tibble() -> coefs
coefs %>%
set_names(c("study", "estimate", "std_error", "Tstat", "P")) -> coefs
# return results
if(save_files){
order_inFiles = match(coefs$study, Files_Info$File)
Files_Info$multi_estimate = NA
Files_Info$multi_estimate[order_inFiles] = pull(coefs, .data$estimate)
Files_Info$multi_std_error = NA
Files_Info$multi_std_error[order_inFiles] = pull(coefs, .data$std_error)
Files_Info$multi_T = NA
Files_Info$multi_T[order_inFiles] = pull(coefs, .data$Tstat)
Files_Info$multi_P = NA
Files_Info$multi_P[order_inFiles] = pull(coefs, .data$P)
}
tmp = paste0("Estimating adjusted R-squared: \n")
Log = update_log(Log, tmp, verbose)
stats::lm(data=ZMatrix_final, formula = myFormula_final) %>%
summary %>%
.["adj.r.squared"] %>%
as.numeric() -> R2_Multi
tmp = paste0("- in-sample adjusted R-squared for the all-chromosomes multivariable regression is ", round(R2_Multi,4), " \n")
Log = update_log(Log, tmp, verbose)
tmp = paste0("- out-of-sample R-squared (masking one chromosome at a time), for the multivariable regression will be estimated when calculating the prior. \n")
Log = update_log(Log, tmp, verbose)
if(save_files) utils::write.table(Files_Info, file="PriorGWASs.tsv", sep="\t", quote=F, row.names=F )
res=list(log_info = Log,
studies = significant_studies,
coeffs = coefs,
ZMat = ZMatrix_final)
return(res)
}
all_uni_coefs %>%
filter(.data$P==min(.data$P)) %>%
pull(.data$study) -> significant_studies
all_uni_coefs %>%
filter(!.data$study %in% significant_studies) %>%
pull(.data$study) -> non_significant_studies
#studies_to_test = non_significant_studies # same now, but some studies (inconsistent direction / loop : might be removed from the test set, and not from the non.significant set)
all_uni_coefs %>%
filter(!.data$study %in% significant_studies) %>%
filter(P<0.05) %>%
pull(.data$study) -> studies_to_test
tmp = paste0("Studies tested (reaching p<0.05 in univariable models) : \n ", paste(get_names(c(significant_studies,studies_to_test), Z_matrices), collapse = " \n "), "\n")
Log = update_log(Log, tmp, verbose)
if(save_files){ # add Status : stepwise exclusion, will be updated if the studies are added
Files_Info$status[Files_Info$File %in% non_significant_studies] = "Excluded during stepwise selection"
}
steps = data.frame(Direction=character(), Study=character())
Convergence=F
tmp = paste0("Adding the first study :", get_names(significant_studies, Z_matrices) , " \n")
Log = update_log(Log, tmp, verbose)
it = 0
while(!Convergence){
if(it>50){ # are we in a loop?
tmp=c("We are probably in a loop...\n ", "Analysis failed \n",
"Please have a look at the stepwise procedure results ",
"and update \"prior_studies\" before relaunching the analysis ",
"to avoid looping again.")
Log = update_log(Log, tmp, verbose)
if(save_files) utils::write.table(Files_Info, file="PriorGWASs.tsv", sep="\t", quote=F, row.names=F )
res=list(log_info = Log,
stop = T)
return(res)
}
it=it+1
tmp=paste0(" iteration ", it, ": ", length(significant_studies), " studies \n")
update_log(Log, tmp, verbose)
### UPDATE Z-MATRIX
myFormula = generate_Formula(my_outcome,
significant_studies)
# keep only our studies
ZMatrix %>%
select(-non_significant_studies) %>%
get_Instruments(Zlim=Zlimit) -> ZMatrix_subset
## RUN MODEL - with studies already included
tmp="#Run model \n"
update_log(Log, tmp, verbose)
model = stats::lm(data=ZMatrix_subset, formula = myFormula)
stats::lm(data=ZMatrix_subset, formula = myFormula) %>%
summary %>%
stats::coef() %>%
as.data.frame() %>% # 1st create a data.frame (to get rownames)
tibble::rownames_to_column("nm") %>% # then convert to tibble
as_tibble() -> coefs
## TRY TO ADD
no_change = T
tmp = paste0("#Test if any study can be added with p<", format(round(stepwise_threshold, 4), scientific = F), " \n")
Log = update_log(Log, tmp, verbose)
PValues = data.frame(study=studies_to_test, P=NA, Estimate=NA, stringsAsFactors = F)
for(one_to_add in studies_to_test){
### UPDATE Z-MATRIX
test_formula = generate_Formula(my_outcome,
c(significant_studies, one_to_add))
# keep only our studies
non_significant_studies[!non_significant_studies %in% one_to_add] -> studies_not_used
ZMatrix %>%
select(-studies_not_used) %>%
get_Instruments(Zlim=Zlimit) -> ZMatrix_test
## RUN MODEL
model_test = stats::lm(data=ZMatrix_test, formula = test_formula)
PValues[PValues$study==one_to_add,c(2,3)] = c(stats::coef(summary(model_test))[one_to_add,"Pr(>|t|)"], stats::coef(summary(model_test))[one_to_add,"Estimate"])
}
if(any(PValues$P<stepwise_threshold)){
no_change = F
PValues %>%
filter(.data$P==min(.data$P)) %>%
pull(.data$study) -> study_to_add
# test if direction is consistent with uni before adding
all_uni_coefs %>%
filter(.data$study==study_to_add) %>%
pull(.data$estimate) /
PValues %>%
filter(.data$study==study_to_add) %>%
pull(.data$Estimate) -> ratio
if(ratio<0){
tmp = paste0("Study :", get_names(study_to_add, Z_matrices), " cannot be added, direction is not consistent between univariable and multivariable model.\n")
Log = update_log(Log, tmp, verbose)
studies_to_test[!studies_to_test == study_to_add] -> studies_to_test
if(save_files){ # add Status : AIC exclusion
Files_Info$status[Files_Info$File %in% study_to_add] = "Excluded during forward selection (unconsistent direction)"
}
steps=rbind(steps, data.frame(Direction="+", Study="none: direction issue"))
} else {
tmp = paste0("Adding one study :", get_names(study_to_add, Z_matrices), " \n")
Log = update_log(Log, tmp, verbose)
steps=rbind(steps, data.frame(Direction="+", Study=study_to_add))
significant_studies=c(significant_studies, study_to_add)
studies_to_test = studies_to_test[-which(studies_to_test %in% study_to_add)]
non_significant_studies = non_significant_studies[-which(non_significant_studies %in% study_to_add)]
if(save_files){ # add Status : USE
Files_Info$status[Files_Info$File %in% study_to_add] = "USED"
}
tmp = paste0("Done! \n")
Log = update_log(Log, tmp, verbose)
}
} else {
steps=rbind(steps, data.frame(Direction="+", Study="none"))
}
# Add it if needed
myFormula = generate_Formula(my_outcome,
significant_studies)
# keep only our studies
ZMatrix %>%
select(-non_significant_studies) %>%
get_Instruments(Zlim=Zlimit) -> ZMatrix_subset
## RUN MODEL
tmp="#Update model \n"
update_log(Log, tmp, verbose)
model = stats::lm(data=ZMatrix_subset, formula = myFormula)
stats::lm(data=ZMatrix_subset, formula = myFormula) %>%
summary %>%
stats::coef() %>%
as.data.frame() %>% # 1st create a data.frame (to get rownames)
tibble::rownames_to_column("nm") %>% # then convert to tibble
as_tibble() -> coefs
tmp = paste0("#Test if any study has p>", format(round(stepwise_threshold, 4), scientific = F), " now \n")
Log = update_log(Log, tmp, verbose)
# Remove traits with multivariable p-value larger that 0.05
coefs %>%
arrange(desc(.data$`Pr(>|t|)`)) -> coefs
if(any(coefs$`Pr(>|t|)` > stepwise_threshold)){
no_change = F
coefs %>%
pull(.data$nm) -> st
st[1] -> study_to_remove
steps=rbind(steps, data.frame(Direction="-", Study=study_to_remove))
tmp = paste0("Excluding one study :", get_names(study_to_remove, Z_matrices), " \n")
Log = update_log(Log, tmp, verbose)
significant_studies[!significant_studies %in% study_to_remove] -> significant_studies
non_significant_studies = c(non_significant_studies,study_to_remove)
studies_to_test = c(studies_to_test, study_to_remove)
if(save_files){ # add Status : AIC exclusion
Files_Info$status[Files_Info$File %in% study_to_remove] = "Excluded during stepwise selection"
}
tmp = paste0("Done! \n")
Log = update_log(Log, tmp, verbose)
} else {
steps=rbind(steps, data.frame(Direction="-", Study=""))
}
if(length(studies_to_test)==0){
tmp = paste0("All studies included \n")
Log = update_log(Log, tmp, verbose)
Convergence=T
}
if(no_change){
tmp = paste0("It converged! \n")
Log = update_log(Log, tmp, verbose)
Convergence=T
}
}
# compute, and then save, the 'full-22-in-one-go' regression results
tmp = paste0("# Final regression... \n")
Log = update_log(Log, tmp, verbose)
tmp = paste0("The studies used are: \n")
Log = update_log(Log, tmp, verbose)
tmp = c(paste0(paste0("- ", get_names(significant_studies, Z_matrices)), collapse="\n"), "\n")
Log = update_log(Log, tmp, verbose)
ZMatrix %>%
select(1:5, significant_studies, ncol(ZMatrix)) %>%
get_Instruments(Zlim=Zlimit) -> ZMatrix_final
myFormula_final = generate_Formula(my_outcome, significant_studies)
stats::lm(data=ZMatrix_final, formula = myFormula_final) %>%
summary %>%
stats::coef() %>%
as.data.frame() %>% # 1st create a data.frame (to get rownames)
tibble::rownames_to_column("nm") %>% # then convert to tibble
as_tibble() -> coefs
coefs %>%
set_names(c("study", "estimate", "std_error", "Tstat", "P")) -> coefs
if(save_files){
order_inFiles = match(coefs$study, Files_Info$File)
Files_Info$multi_estimate = NA
Files_Info$multi_estimate[order_inFiles] = pull(coefs, .data$estimate)
Files_Info$multi_std_error = NA
Files_Info$multi_std_error[order_inFiles] = pull(coefs, .data$std_error)
Files_Info$multi_T = NA
Files_Info$multi_T[order_inFiles] = pull(coefs, .data$Tstat)
Files_Info$multi_P = NA
Files_Info$multi_P[order_inFiles] = pull(coefs, .data$P)
}
tmp = paste0("Estimating adjusted R-squared: \n")
Log = update_log(Log, tmp, verbose)
stats::lm(data=ZMatrix_final, formula = myFormula_final) %>%
summary -> summary_lm
summary_lm["adj.r.squared"] %>%
as.numeric() -> R2_Multi
tmp = paste0("- in-sample adjusted R-squared for the all-chromosomes multivariable regression is ", round(R2_Multi,4), " \n")
Log = update_log(Log, tmp, verbose)
tmp = paste0("- out-of-sample R-squared (masking one chromosome at a time), for the multivariable regression will be estimated when calculating the prior. \n")
Log = update_log(Log, tmp, verbose)
if(save_files) utils::write.table(Files_Info, file="PriorGWASs.tsv", sep="\t", quote=F, row.names=F )
res=list(log_info = Log,
studies = significant_studies,
coeffs = coefs,
ZMat = ZMatrix_subset)
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.