#' Run a bunch of ROI stats in a linear model, per your specifications
#'
#' @param df A data frame in long format, organized as QuANTs-style output: dependent variable (freq. thicknesses, volumes, or cbf values, etc) in a column called "value"
#' a column called "label" with the label numbers. Each row is one measurement for one person. Can have as many columns as you want (ie, ignores everything not specified")
#' Also requires columns named as specified used as varOfInterest, covarOfInterest (if used), and otherCovarString (if used)
#' @param varOfInterest A column name from df that you want to run stats on. Often either a column containing group status info or a score for a regression
#' @param covarOfInterest A column name you want to use as a covariate in your linear model. Will also be used for plotting.
#' @param otherCovarString A string you want to use for covariates of no interest. Won't be used for plots. For multiple values, use lm()-style formula "Covar1+Covar2+..."
#' @param statsTermOfInterest A string you want to use to select p-values (eg, "MutationC9:AgeAtMRI"). Defaults to first non-Intercept row in summary(lm())
#' @param lut A look up table with label numbers in the first column and label names in the second column. Can have extra rows or columns, they will be ignored
#' @param colorMat A table of colors that might get used in plots
#' @param sigValue A value used for thresholding your statistics. If you want to show all data use 1.
#' @param multipleCorrectionMethod Goes into p.adjust() to account for multiple comparisons. Will do this accounting for all labels in your input df. Default is none, common other choices are "bonferroni" and "fdr"
#' @param numberSavedPlots how many plots to save individually. If specifying number greater than the number of significant regions, it will stop after the last one
#' @param outFileRoot Optional file root for output. If specified, will save 1) .txt file with all significant results and 2) plots for up to 5 labels (with 5 smallest p-values)
#' @return A list object with 3 elements
#' 1) a df containing significant labels from the lm()
#' 2) summay(lm()) output from each label in your input df
#' 3) a plot for each region in your df 1) (ie, one plot for each significant label)
#' @import ggplot2
#' @importFrom RColorBrewer brewer.pal
#' @importFrom gridExtra arrangeGrob
#' @export
testgetStats <- function (df, varOfInterest, covarOfInterest, otherCovarString, statsTermOfInterest, lut, colorMat, sigValue, multipleCorrectionMethod, numberSavedPlots, outFileRoot) {
if (missing(df)) {
stop("runs a bunch of stats on labels and makes some figures")
}
# we might want multiple corrections, but we might not.
if (missing(multipleCorrectionMethod)) {
multipleCorrectionMethod <- "none"
}
# if no significant value threshold input, keep all stats
if(missing(sigValue)) {
print("i see everything")
sigValue <- 1
}
# if no palatte supplied, do something
if (missing(colorMat)) {
stop("need colors")
}
# check for look up table / create dummy one
if (missing(lut)) {
print("No look up table, wingin' it")
lut <- rbind(unique(df$label), paste("label",unique(df$label), sep=""))
lut <- t(lut)
lut[,1] <- as.double(lut[,1])
}
# get rid of extra factor levels if i'm lazily filtering
df <- droplevels(df)
# set up output stats matrix
numLabels <- length(unique(df$label))
statsmat <- as.data.frame(matrix(NA,nrow=numLabels, ncol = 6))
statslist <- list()
# if using more labels than you think you are, stop it
if (dim(lut)[1] < numLabels ) {
stop("more labels in df than in lut, stopping...")
}
# string vs formula vs other object parsing in R is garbage
# in any case, create a new object that's the formula to put into lm()
# (cbf) (what we specify for input into our model)
# bunch of stuff for getting strings into happy lm()-fitting format
if (missing(varOfInterest)) {
stop("need varOfInterest assigned to run lm")
}
regressionString <- paste("value", '~', varOfInterest, sep=" ")
if (!missing(covarOfInterest) & !is.na(covarOfInterest)) {
regressionString <- paste(regressionString, "+" , covarOfInterest, sep = " ")
}
if (!missing(otherCovarString) & !is.na(otherCovarString)) {
regressionString <- paste(regressionString, "+" , otherCovarString, sep = " ")
}
print(regressionString)
frm <- as.formula(regressionString)
# run same stats for each label
for (i in 1:numLabels) {
# get label for this regression
tmpLabel <- unique(df$label)[i]
# put it in your output matrix
statsmat[i,1]<- tmpLabel
# give it index (helpful if returning the list with all stats output)
statsmat[i,5] <- i
# give it a name
statsmat[i,6] <- lut[lut[,1] == tmpLabel, 2]
# set up to run linear regression
# subset our whole input matrix for just this label, and run the lm on that
statslist[[i]] <- summary(lm(frm, data=df[df$label==tmpLabel,]))
# organize stats output:
# if no statsTermOfInterest supplied, get top row
if (missing(statsTermOfInterest)) {
statsmat[i,2] <- statslist[[i]]$coefficients[2,3]
# and get the corresponding p-value
statsmat[i,3] <- statslist[[i]]$coefficients[2,4]
} else {
# assume first input is most important. Put the t-stat for it here.
statsmat[i,2] <- statslist[[i]]$coefficients[rownames(statslist[[i]]$coefficients)==statsTermOfInterest,3]
# and get the corresponding p-value
statsmat[i,3] <- statslist[[i]]$coefficients[rownames(statslist[[i]]$coefficients)==statsTermOfInterest,4]
}
}
# we may want to correct for multiple comparisons. If specified in function input, do that now:
# statsmat[,3] is the p value column for all labels
statsmat[,4] <- p.adjust(statsmat[,3],multipleCorrectionMethod)
# label columns appropriately
colnames(statsmat) <- c("label","tstat","pvalue","p_adj","index","LabelName")
# threshold your ROIs by whatever threshold specified
sigResults <- statsmat[statsmat[,4] < sigValue, ]
sigResults <- as.data.frame(sigResults)
# order stats from smallest p-value to largest
sigResults <- sigResults[with(sigResults,order(p_adj)),]
# if there's an output file root, save the significant results table
if (!missing(outFileRoot)) {
# write output in format for the lausanneTableToFigure.sh script
outRoot <- paste(outFileRoot, "_", multipleCorrectionMethod, sigValue, sep="")
write.table(sigResults,paste(outRoot,".txt", sep=""), quote = F, row.names = F, col.names=F)
} else {
outFileRoot <- paste(varOfInterest)
}
# create a flexible (list) object to store all of the significant plots
sigPlots <- list()
# but only actually make the plots if there is something to plot
if (nrow(sigResults) > 0) {
# iterate over each significant roi
for ( s in 1:dim(sigResults)[1]) {
# hold label for ease later
loi <- sigResults$label[s]
# stupid ggplot variable vs column name handling
ggvarOfInterest <- ensym(varOfInterest)
# if there's no covariate, just plot the main effect
if (missing(covarOfInterest) | is.na(covarOfInterest)){
# if our variable of interest is an integer or double or numeric, make a scatterplot,
# but if it's a factor or character then make a boxplot
if ( (class(df[,varOfInterest]) == "integer" | class(df[,varOfInterest]) == "double" | class(df[,varOfInterest]) == "numeric" ) ) {
sigPlots[[s]] <- ggplot2::ggplot(df[df$label == loi ,], aes(x= !!ggvarOfInterest, y=value)) + geom_point() + geom_smooth(method = "lm") + ggtitle(paste(lut[lut[,1] == loi,2]))
} else {
sigPlots[[s]] <- ggplot2::gplot(df[df$label==loi ,], aes(x= !!ggvarOfInterest, y=value, fill=!!ggvarOfInterest)) + geom_boxplot() + geom_jitter(height = 0, width = .3) + scale_fill_manual(values=colorMat) + ggtitle(paste(lut[lut[,1] == loi,2]))
}
# if we have our main effect and a covariate, we have a bunch of options, but in essense if it's a factor make it a color and plot the x-axis with the numeric one, but if there's two numberic ones color by the covariate and plot on the main variable
} else {
ggcovarOfInterest <- ensym(covarOfInterest)
if ( (class(df[,varOfInterest]) == "factor" | class(df[,varOfInterest]) == "character") & (class(df[,covarOfInterest]) == "integer" | class(df[,covarOfInterest]) == "double" | class(df[,varOfInterest]) == "numeric") ) {
sigPlots[[s]] <- ggplot2::ggplot(df[df$label==loi ,], aes(x= !!ggcovarOfInterest, y=value, color= !!ggvarOfInterest)) + geom_point() + geom_smooth(method = "lm") + scale_color_manual(values=colorMat) + ggtitle(paste(lut[lut[,1] == loi,2]))
} else if ( (class(df[,varOfInterest]) == "factor" | class(df[,varOfInterest]) == "character") & (class(df[,covarOfInterest]) == "factor" | class(df[,covarOfInterest]) == "character") ) {
sigPlots[[s]] <- ggplot2::ggplot(df[df$label==loi ,], aes(x= !!ggvarOfInterest, y=value, fill= !!ggcovarOfInterest)) + geom_boxplot() + geom_jitter(height = 0, width = .3) + ggtitle(paste(lut[lut[,1] == loi,2]))
} else if ( (class(df[,varOfInterest]) == "integer" | class(df[,varOfInterest]) == "double" | class(df[,varOfInterest]) == "numeric") & (class(df[,covarOfInterest]) == "factor" | class(df[,covarOfInterest]) == "character") ) {
sigPlots[[s]] <- ggplot2::ggplot(df[df$label==loi ,], aes(x= !!ggvarOfInterest, y=value, color= !!ggcovarOfInterest)) + geom_point() + geom_smooth(method = "lm") + ggtitle(paste(lut[lut[,1] == loi,2]))
} else {
sigPlots[[s]] <- ggplot2::ggplot(df[df$label==loi ,], aes(x= !!ggvarOfInterest, y=value, color= !!ggcovarOfInterest)) + geom_point() + geom_smooth(method = "lm") + ggtitle(paste(lut[lut[,1] == loi,2]))
}
}
}
numPlots <- 0
numPlots <- dim(sigResults)[1]
if (exists("outRoot")) {
if (numPlots == 1 ) {
ggplot2::ggsave(paste(outRoot, "sig", numPlots, ".png", sep=""), gridExtra::arrangeGrob(grobs = sigPlots[1]))
} else if (numPlots >= 4) {
# write output in format for the lausanneTableToFigure.sh script
numPlots <- 4
gridplots <- gridExtra::arrangeGrob(grobs = sigPlots[1:numPlots])
ggplot2::ggsave(paste(outRoot, "sig", numPlots, ".png", sep=""), gridplots)
} else if (numPlots > 1 & numPlots < 4) {
gridplots <- gridExtra::arrangeGrob(grobs = sigPlots[1:numPlots])
ggplot2::ggsave(paste(outRoot, "sig", numPlots, ".png", sep=""), gridplots)
}
if (!missing(numberSavedPlots)) {
if (numberSavedPlots > dim(sigResults)[1] ) {
numberSavedPlots <- dim(sigResults)[1]
}
for (i in 1:numberSavedPlots) {
ggplot2::ggsave(paste(outRoot, "sig", sigResults[i,6], ".png", sep=""), sigPlots[[i]])
}
}
}
}
outlist <- list()
outlist[[1]] <- sigResults
outlist[[2]] <- statslist
outlist[[3]] <- sigPlots
names(outlist) <- c("significantResults", "allRegionsStats","significantPlots")
return(outlist)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.