#' Describe samples using the variable of interest and covariates.
#'
#' @param variable.of.interest Name of the variable of interest in \code{colnames(data)}.
#' @param data Data frame of variables.
#' @return A data frame with one row for each continuous or ordinal variable
#' and one row for each level of each categorical variable.
#' In the first case, each row provides the name, mean value and standard deviation of each variable.
#' In the second case (categorical), each row provides the name of the variable level and
#' the number of cases and percentage of cases at that level.
#'
#' @export
ewaff.sample.characteristics <- function(variable.of.interest, data) {
msg("summarizing variables")
summarize.variable <- function(name, variable) {
msg(name)
if (is.character(variable)) variable <- as.factor(variable)
if (is.factor(variable)) {
n <- sapply(levels(variable), function(level) sum(variable == level,na.rm=T))
data.frame(variable=name,
value=names(n),
mean=format(n),
"sd/%"=format(n/length(variable)*100,digits=3,nsmall=1),
row.names=NULL)
}
else {
data.frame(variable=name,
value="",
mean=format(mean(variable,na.rm=T)),
"sd/%"=format(sd(variable,na.rm=T)),
row.names=NULL)
}
}
var.summary <- summarize.variable(variable.of.interest, data[[variable.of.interest]])
cov.summary <- lapply(setdiff(colnames(data), variable.of.interest), function(varname) {
summarize.variable(varname, data[[varname]])
})
cov.summary <- do.call(rbind, cov.summary)
rbind(var.summary, cov.summary)
}
#' Describe associations between covariates and the variable of interest.
#'
#' @param variable.of.interest Name of the variable of interest in \code{colnames(data)}.
#' @param data Data frame of variables.
#' @return A data frame with one or more rows for each covariate.
#'
#' If both the variable of interest and covariate are continuous or ordinal,
#' then the covariate uses one row showing the name, mean
#' and standard deviation of the covariate following the significance
#' of the association between the covariate and the variable of interest.
#'
#' If the covariate is categorical, then there is additionally one row for
#' each level showing the mean and standard deviation of the variable
#' of interest for samples at that covariate level.
#'
#' If the variable of interest is categorical but the covariate is not,
#' then there is one row for each variable level showing the
#' mean and standard deviation of the covariate at the given level.
#'
#' If both the variable of interest and covariate are categorical,
#' then mean is replaced with the number of samples at each pair of
#' variable/categorical levels and standard deviation with the percentage.
#' P-values indicate the significance of association using Fisher's exact test.
#'
#' @export
ewaff.covariate.associations <- function(variable.of.interest, data) {
msg("covariate associations")
ret <- sapply(setdiff(colnames(data), variable.of.interest), function(covariate) {
msg(covariate)
vars <- data[,c(variable.of.interest,covariate)]
for (j in colnames(vars)) {
if (!is.numeric(vars[[j]])
&& length(unique(na.omit(vars[[j]]))) <= 2)
vars[[j]] <- as.factor(vars[[j]])
}
ewaff.summarize.relationship(vars)
}, simplify=F)
ret
}
#' Describe the relationship between two variables.
#'
#' @param vars A data frame with at least two columns.
#' The first two columns will be compared.
#' @return A list whose elements depends on the types of the two variables.
#' In each case, the list contains the following elements:
#' \describe{
#' \item{var1}{Name of the first variable, i.e. colnames(vars)[1].}
#' \item{var2}{Name of the second variable.}
#' \item{r}{Spearman's correlation between the variables. This may be meaningless if one variable is an unordered factor.}
#' \item{r.p}{P-value corresponding to the correlation between the variables.}
#' \item{output}{The contents of the list formatted to be printed as markdown text.}
#' \item{plot}{A plot (ggplot2) visualizing the relationship.}
#' }
#'
#' If both variables are factors, then the list will include a frequency table (\code{freq})
#' and a corresponding matrix of p-values (\code{p.values}) obtained using Fisher's test to test for
#' enrichment in each cell of the frequency table.
#'
#' If one variable is a factor and the other numeric, then list will include
#' the F-statistic (\code{F.stat}) and p-value (\code{p.value}) from one-way analysis of variance.
#' There will also be a data frame (\code{cases}) with each row providing statistics from a t-test
#' comparing the numeric variable within and without each level of the factor variable.
#'
#' If both variables are numeric, then the list will include
#' the F-statistic (\code{F.stat}) and p-value (\code{p.value})
#' from the linear model fitting the variables.
#'
#' @export
ewaff.summarize.relationship <- function(vars) {
stopifnot(is.data.frame(vars))
values <- lapply(vars, function(var) if (!is.factor(var)) NA else levels(var))
samples <- do.call(expand.grid, c(values, list(stringsAsFactors=F)))
samples$idx <- lapply(1:nrow(samples), function(i)
which((is.na(samples[[1]][i]) | vars[[1]] == samples[[1]][i]) &
(is.na(samples[[2]][i]) | vars[[2]] == samples[[2]][i])))
if (is.factor(vars[[1]]) && is.factor(vars[[2]])) {
freq <- do.call(table, vars)
ret <- list(freq=freq)
fit <- cor.test(as.integer(vars[[1]]), as.integer(vars[[2]]), method="s")
ret$r <- fit$estimate[["rho"]]
ret$r.p <- fit$p.value
ret$p.values <- sapply(values[[1]], function(level1)
sapply(values[[2]], function(level2) {
is.value <- mapply(function(var,value) var == value,
vars,
list(level1, level2))
freq <- do.call(table, as.data.frame(is.value))
p <- NA
try(p <- fisher.test(freq, alternative="greater")$p.value, silent=T)
p
}))
ret$p.values <- t(ret$p.values)
ret$sig <- ret$p.values
ret$sig[] <- ""
ret$sig[which(ret$p.values < 0.05)] <- "+"
ret$sig[which(ret$p.values < 0.01)] <- "*"
ret$sig[which(ret$p.values < 0.001)] <- "**"
ret$sig[which(ret$p.values < 0.0001)] <- "***"
ret$sig[which(ret$p.values < 0.00001)] <- "****"
} else {
var1 <- sign(is.factor(vars[[1]])) + 1 ## variable that is not a factor
var2 <- 3-var1 ## the other variable
fit <- lm(vars[[var1]] ~ vars[[var2]])
f.stat <- summary(fit)$fstatistic
p.value <- 1 - pf(f.stat["value"],f.stat["numdf"],f.stat["dendf"])[["value"]]
ret <- list(f.stat=f.stat[["value"]],
p.value=p.value,
n=sum(!is.na(vars[[var1]]) & !is.na(vars[[var2]])))
if (!is.factor(vars[[var1]]) && !is.factor(vars[[var2]])) {
fit <- cor.test(vars[[1]], vars[[2]], method="s")
ret$r <- fit$estimate[["rho"]]
ret$r.p <- fit$p.value
} else { ## vars[[var2]] is a factor
fit <- cor.test(vars[[var1]], as.integer(vars[[var2]]), method="s")
ret$r <- fit$estimate[["rho"]]
ret$r.p <- fit$p.value
ret$cases <- do.call(rbind, lapply(1:nrow(samples), function(i) {
level <- samples[[var2]][i]
is.level <- vars[[var2]] == level
if (sum(is.level, na.rm=T) > 1 && sum(!is.level, na.rm=T) > 1)
fit <- t.test(vars[[var1]][which(is.level)],
vars[[var1]][which(!is.level)])
else
fit <- list(p.value=NA,statistic=list(t=NA))
data.frame(level=level,
mean=mean(vars[[var1]][which(is.level)], na.rm=T),
var=var(vars[[var1]][which(is.level)], na.rm=T),
n=sum(is.level, na.rm=T),
t.stat=fit$statistic[["t"]],
p.value=fit$p.value)
}))
names(ret$cases)[1] <- names(vars)[var2]
rownames(ret$cases) <- NULL
}
}
ret$var1 <- names(vars)[1]
ret$var2 <- names(vars)[2]
ret$output <- format.relationship(ret)
ret$plot <- visualize.relationship(vars, ret)
ret
}
format.relationship <- function(ret) {
if ("p.values" %in% names(ret)) {
list("statistics"=kable(data.frame(var1=ret$var1, var2=ret$var2,
R=ret$r, "p-value"=ret$r.p, check.names=F)),
"frequencies"=kable(ret$freq),
"enrichment p-values"=kable(ret$p.values))
}
else if ("cases" %in% names(ret)) {
list("statistics"=kable(data.frame(var1=ret$var1, var2=ret$var2,
F=ret$f.stat, "p-value"=ret$p.value,
R=ret$r, "p-value"=ret$r.p, check.names=F)),
"cases"=kable(ret$cases))
}
else {
list("statistics"=kable(data.frame(var1=ret$var1, var2=ret$var2,
F=ret$f.stat, "p-value"=ret$p.value,
R=ret$r, "p-value"=ret$r.p, check.names=F)))
}
}
visualize.relationship <- function(vars, ret) {
if ("p.values" %in% names(ret)) {
data <- melt(-log(ret$p.values,10))
colnames(data) <- c("var1","var2","p.value")
data$freq <- melt(ret$freq)$value
(ggplot(data, aes(x=as.factor(var2), y=as.factor(var1)))
+ geom_tile(aes(fill=p.value))
+ geom_text(aes(label=freq))
+ labs(x=names(vars)[2], y=names(vars)[1], fill="-log10(p-value)")
+ theme(axis.text.x = element_text(angle = 90, hjust = 1),
aspect.ratio=1)
+ ggtitle(""))
}
else {
if (!is.factor(vars[[1]]) && !is.factor(vars[[2]])) {
n.levels <- sapply(vars, function(var) length(unique(var)))
if (n.levels[1] <= n.levels[2] && n.levels[1] < 20)
vars[[1]] <- factor(vars[[1]], levels=sort(unique(vars[[1]])), ordered=T)
if (n.levels[2] < n.levels[1] && n.levels[2] < 20)
vars[[2]] <- factor(vars[[2]], levels=sort(unique(vars[[2]])), ordered=T)
}
if (is.factor(vars[[2]])) {
vars[1:2] <- vars[2:1]
names(vars)[1:2] <- names(vars)[2:1]
} ## if one is a factor, then it is the first one
data <- data.frame(x=vars[[1]], y=vars[[2]])
if (!is.factor(data$x) && !is.factor(data$y)) {
(ggplot(data, aes(x=x, y=y))
+ geom_point()
+ geom_smooth(method=lm)
+ xlab(names(vars)[1])
+ ylab(names(vars)[2])
+ ggtitle(""))
} else {
(ggplot(data, aes(x=x,y=y))
+ geom_boxplot()
+ xlab(names(vars)[1])
+ ylab(names(vars)[2])
+ theme(axis.text.x = element_text(angle = 90, hjust = 1))
+ ggtitle(""))
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.