Nothing
#' Plot theta values of negative binomial models versus non-zero count
#' for count data
#' @param input A data frame with the first column as "Individual"
#' and all the columns of dependent variables (features, e.g. bacteria)
#' at the end of the table.
#' @param test_var The name of the independent variable you are testing for,
#' should be a character vector (e.g. c("Time"))
#' identical to its column name and make sure there is no space in it.
#' @param variable_col The column number of the position where the dependent
#' variable columns (e.g. bacteria) start in the table
#' @param fac_var The column numbers of the position where the columns
#' that aren't numerical (e.g. characters, categorical numbers,
#' ordinal numbers), should be a numerical vector (e.g. c(1, 2, 5:7))
#' @param not_used The column position of the columns not are irrelevant and
#' can be ignored when in the analysis.
#' This should be a number vector, and the default is NULL.
#' @param point_size The point size for plotting in 'ggplot2'. The default is 1.
#' @param x_interval_value The interval value for tick marks on x-axis.
#' The default is 5.
#' @param y_interval_value The interval value for tick marks on y-axis.
#' The default is 5.
#' @param verbose A boolean vector indicating whether to print detailed
#' message.
#' The default is TRUE.
#' @export
#' @import tidyr
#' @import reshape2
#' @import glmmTMB
#' @import ggplot2
#' @import dplyr
#' @import tibble
#' @importFrom stats as.formula confint cor.test kruskal.test
#' na.omit p.adjust wilcox.test
#' @importFrom rlang .data
#' @importFrom magrittr '%>%'
#' @name theta_plot
#' @return a 'ggplot' object
#' @details
#' This function outputs a plot that facilitates the setting of theta_cutoff
#' in longdat_disc() and longdat_cont(). This only applies when the dependent
#' variables are count data. longdat_disc() and longdat_cont() implements
#' negative binomial (NB) model for count data,
#' and if the theta (dispersion parameter) of
#' NB model gets too high, then the p value of it will be extremely low
#' regardless of whether there is real significance
#' or not. Therefore, the highest threshold of theta value is set and any
#' variable beyond the threshold will be excluded
#' from the test. The default value of theta_cutoff is set to 2^20 from the
#' observation that 2^20 is a clear cutoff line
#' for several datasets. Users can change theta_cutoff value to fit
#' their own data.
#' @examples
#' test_theta_plot <- theta_plot(input = LongDat_disc_master_table,
#' test_var = "Time_point", variable_col = 7, fac_var = c(1:3))
utils::globalVariables(c("values", "NB_theta", "Nonzero_count"))
theta_plot <- function(input, test_var, variable_col, fac_var, not_used = NULL,
point_size = 1, x_interval_value = 5,
y_interval_value = 5, verbose = TRUE) {
if (missing(input)) {
stop('Error! Necessary argument "input" is missing.')
}
if (missing(test_var)) {
stop('Error! Necessary argument "test_var" is missing.')
}
if (missing(variable_col)) {
stop('Error! Necessary argument "variable_col" is missing.')
}
if (missing(fac_var)) {
stop('Error! Necessary argument "fac_var" is missing.')
}
if (verbose == TRUE) {print("Start data preprocessing.")}
data <- input
# Remove the features (bacteria) whose column sum is 0
values <- as.data.frame(data[ , variable_col:ncol(data)])
values <- as.data.frame(apply(values, 2, as.numeric))
values <- values[ , colSums(values) > 0]
data <- as.data.frame(cbind(data[ , 1:(variable_col-1)], values))
non_zero_count <- c()
for (i in seq_len(ncol(values))) {
non_zero_count[i] <- sum(values[ , i] != 0)
}
# Here value column is defined by the user
predictor_names <- (colnames(data))[1: (variable_col - 1)]
melt_data <- reshape2::melt (data, id = predictor_names)
# Omit the rows whose value column equals to NA
melt_data <- melt_data %>% tidyr::drop_na(.data$value)
# Remove all dots in the bacteria name or it will cause problem
melt_data$variable <- gsub(".", "_", melt_data$variable, fixed = TRUE)
# Make sure that all the columns are in the right class
# Columns mentioned in fac_var, and the second last column in
# melt data are factors
# Columns not in fac_var, and the last column in melt data are
# numerical numbers
"%notin%" <- Negate("%in%")
num_var <- c(which(1:(ncol(melt_data)-2) %notin% fac_var), ncol(melt_data))
fac_var <- c(fac_var, ncol(melt_data)-1)
for (i in fac_var) {
melt_data[ ,i] <- as.factor(melt_data[ ,i])
}
for (i in num_var) {
melt_data[ ,i] <- as.numeric(melt_data[ ,i])
}
# Remove the not-used columns in melt_data
if (!is.null(not_used)) {
melt_data <- melt_data %>% dplyr::select(-c(not_used))
}
# Change the first column name of melt_data to "Individual"
colnames(melt_data)[1] <- "Individual"
# Variables are all the bacteria taxanomies
variables <- unique (melt_data$variable)
# Extract all the column names from melt_data except for the last two columns
# which are variables and values, and also exclude "Individual" and test_var
factors <- colnames(melt_data)[-c(ncol(melt_data), ncol(melt_data)-1)]
factors <- factors[-which(factors %in% c("Individual", test_var))]
factor_columns <- match(factors, colnames(melt_data))
N <- length (variables)
if (verbose == TRUE) {print("Finish data preprocessing.")}
################## Negative binomial model #################
if (verbose == TRUE) {print("Start running negative binomial models.")}
Theta <- as.data.frame(matrix(data = NA, nrow = N, ncol = 1))
for (i in 1:N) { # loop through all variables
aVariable <- variables[i]
if (verbose == TRUE) {print(i)}
subdata <- subset(melt_data, variable == aVariable)
tryCatch({
fmla2 <- as.formula(paste("value ~ (1| Individual) +", test_var))
m2 <- glmmTMB::glmmTMB(formula = fmla2, data = subdata,
family = nbinom2, REML = FALSE)
# Extract overdispersion theta out of model
Theta[i, 1] <- glmmTMB::sigma(m2)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
}
rownames(Theta) <- gsub(".", "_", variables, fixed = TRUE)
colnames(Theta) <- c("NB_theta")
all_info <- cbind(Theta, non_zero_count)
colnames(all_info)[2] <- "Nonzero_count"
if (verbose == TRUE) {print("Finish running negative binomial models.")}
################## Plot nonzero count v.s. theta ##################
if (verbose == TRUE) {print("Start plotting.")}
suppressWarnings(
plot <- ggplot2::ggplot(all_info, aes(x=Nonzero_count,
y = log(NB_theta, base = 2))) +
geom_point(size = point_size, alpha = 0.9, color = "dodgerblue2") +
theme_light() +
scale_y_discrete(limits = seq(-10, max(log(all_info$NB_theta,
base = 2)) + 10,
by = y_interval_value)) +
ggtitle("Non-zero count vs negative binomial theta") +
theme(plot.title = element_text(size = 18, face = "plain")) +
xlab("Non-zero count") +
ylab("log(Theta, base = 2)") +
scale_x_continuous(breaks = seq(0, nrow(data), by = x_interval_value)) +
expand_limits(x = c(0, nrow(data)), y = c(-10, max(log(all_info$NB_theta,
base = 2)) + 10)))
print("Finished plotting successfully!")
return(plot)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.