Nothing
# One explanatory variable at a time violin plotter
#
# @usage plot_violin_1x(dat, response_variable_name, explanatory_variable_name,
# title="", xlab="", ylab="",
# COLOURS=c("#e0f3db","#ccebc5","#a8ddb5","#7bccc4","#4eb3d3","#2b8cbe"),
# BAR_COLOURS=c("#636363","#1c9099","#de2d26"),
# CI=95, XTICKS=TRUE, LOG=FALSE, BASE=10)
#
# @param dat dataframe where the response and explanatory variables including interaction terms if applicable are explicitly written into columns (output of the parse_formula() function) [mandatory]
# @param response_variable_name string referring to the variable name of the response variable [mandatory]
# @param explanatory_variable_name string referring to the variable name of the explanatory variable [mandatory]
# @param title string corresponding to the explanatory term including additive and interaction terms in the formula [default=""]
# @param xlab string specifying the x-axis label [default=""]
# @param ylab string specifying the y-axis label [default=""]
# @param COLOURS vector of colors of the violin plots which are repeated if the length is less than the number of explanatory factor levels [default=c("#e0f3db", "#ccebc5", "#a8ddb5", "#7bccc4", "#4eb3d3", "#2b8cbe")]
# @param BAR_COLOURS vector of colors of standard deviation, standard error and 95 percent confidence interval error bars (error bar selection via leaving one of the three colors empty) [default=c("#636363", "#1c9099", "#de2d26")]
# @param CI numeric referring to the percent confidence interval [default=95]
# @param XTICKS logical referring to whether the explanatory variable is strictly categorical [default=TRUE]
# @param LOG logical referring to whether to transform the explanatory variable into the logarithm scale [default=FALSE]
# @param BASE numeric referring to the logarithm base to transform the explanatory variable with [default=1]
#
# @return Return 0 if successful
#
# @examples
# x1 = rep(rep(rep(c(1:5), each=5), times=5), times=5)
# x2 = rep(rep(letters[6:10], each=5*5), times=5)
# x3 = rep(letters[11:15], each=5*5*5)
# y = rep(1:5, each=5*5*5) + rnorm(rep(1:5, each=5), length(x1))
# data = data.frame(x1, x2, x3, y)
# formula = y ~ x1 + x2 + x3 + (x2:x3)
# DF = parse_formula(formula=formula, data=data)
# plot_violin_1x(dat=DF, response_variable_name="y", explanatory_variable_name="x3")
#
#' @importFrom stats qnorm density aggregate
#' @importFrom graphics par plot axis polygon arrows points grid par axTicks
#' @importFrom grDevices rgb
#
plot_violin_1x = function(dat, response_variable_name, explanatory_variable_name, title="", xlab="", ylab="", COLOURS=c("#e0f3db", "#ccebc5", "#a8ddb5", "#7bccc4", "#4eb3d3", "#2b8cbe"), BAR_COLOURS=c("#636363", "#1c9099", "#de2d26"), CI=95, XTICKS=TRUE, LOG=FALSE, BASE=10){
### FOR TESTING:
# dat=DF; response_variable_name="y"; explanatory_variable_name="x3"
# title=""; xlab=""; ylab=""; COLOURS=c("#e0f3db", "#ccebc5", "#a8ddb5", "#7bccc4", "#4eb3d3", "#2b8cbe"); BAR_COLOURS=c("#636363", "#1c9099", "#de2d26"); CI=95; XTICKS=TRUE; LOG=FALSE; BASE=10
### extract the dependent or response or y variable, as well as the independent or explanatory or x variable
x = as.character(eval(parse(text=paste0("dat$`", explanatory_variable_name, "`")))) ### numeric and categorical both treated as categorical
# x = as.factor(gsub("-", "_", x)) ### remove "-" because it will conflict with the string splitting if performing TukeyHSD()
y = eval(parse(text=paste0("dat$`", response_variable_name, "`"))) ### numeric
# y = rnorm(length(y)) ### null test
### merge them into a data frame for ease of handling
### while converting the x variable into both categorical and numeric variables
x_categorical = NA ### to prevent devtools error: Undefined global functions or variables: x_categorical
x_numeric = tryCatch(
as.numeric(gsub("_", "-", x)),
warning=function(e){
as.numeric(as.factor(gsub("_", "-", x)))
}
)
if (XTICKS==FALSE){
### for numeric explanatory variable
df = data.frame(y=y, x_categorical=as.factor(x_numeric), x_numeric=x_numeric) ### remove "-" because it will conflict with the string splitting if performing TukeyHSD()
df = droplevels(df[complete.cases(df), ])
df$y = as.numeric(y)
df$x_numeric = as.numeric(df$x_numeric)
### transform the x axis into log-scale for ease of viewing
if (LOG==TRUE){
if(sum(is.na(suppressWarnings(log(df$x_numeric, base=BASE))), na.rm=T) == 0){
df$x_numeric=log(df$x_numeric, base=BASE)
xlab = paste0("log", BASE, "(", xlab, ")")
} else {
df$x_numeric=log(df$x_numeric+abs(min(df$x_numeric))+1, base=BASE)
xlab = paste0("log", BASE, "(", xlab, "+", round(min(df$x_numeric)+1, 3), ")")
}
}
### convert numeric factors into categories
if (length(unique(round(df$x_numeric, 4))) == length(unique(df$x_numeric))){
df$x_categorical=as.factor(round(df$x_numeric, 4))
} else {
df$x_categorical=as.factor(df$x_numeric)
}
} else {
### for strictly categorical explanatory variable
df = data.frame(y=y, x_categorical=x, x_numeric=x_numeric)
df = droplevels(df[complete.cases(df), ])
df$y = as.numeric(y)
df$x_categorical = as.factor(df$x_categorical)
df$x_numeric = as.numeric(df$x_numeric)
}
### extract the levels and unique values of the x variable
x_LEVELS_AND_NUMBERS = aggregate(x_numeric ~ x_categorical, data=df, FUN=mean)
x_levels = as.character(x_LEVELS_AND_NUMBERS[,1])
x_numbers = as.numeric(x_LEVELS_AND_NUMBERS[,2])
### calculate the summary statistics of the x and y vairables
x_min = min(df$x_numeric)
x_max = max(df$x_numeric)
x_sd = sd(df$x_numeric)
y_min = min(df$y)
y_max = max(df$y)
y_sd = sd(df$y)
### calculate the maximum interval between two levels of the x variable (divide by 2)
max_x_interval = min(x_numbers[order(x_numbers)][2:length(x_numbers)] - x_numbers[order(x_numbers)][1:length(x_numbers)-1]) / 2
### repeat violin plot colours to fit the number of explanatory variable levels
COLOURS = rep(COLOURS, times=ceiling(length(x_levels)/length(COLOURS)))
### define las: i.e. the orientation of the x-axis tick labels
### as well as the plot margins and the x-axis label
max_nchar = max(unlist(lapply(x_levels, FUN=nchar)))
# orig_par = par(no.readonly=TRUE)
# on.exit(par(orig_par)) ### breaks the layout (justification: this funtion is only called by the main function: violinplotter())
if (max_nchar > 7){
las = 2
par(mar=c(max_nchar*0.70, 5, 7, 2))
xlab=""
} else {
las =1
par(mar=c(5, 5, 7, 2))
}
### find y-axis ticks location within the range of y
plot(y, type="n", main="", xlab="", ylab="", xaxt="n", yaxt="n")
y_ticks = axTicks(2)
par(new=TRUE)
### initialize the plot with or without the x-axis
if (XTICKS==TRUE){
### for strictly categorical explanatory variable
plot(x=c(x_min-max_x_interval, x_max+max_x_interval), y=c(y_min-y_sd, y_max+y_sd), new=FALSE, type="n", main=title, xlab=xlab, ylab=ylab, las=las, xaxt="n", yaxt="n")
axis(side=1, at=x_numbers, labels=x_levels, las=las)
axis(side=2, at=y_ticks, labels=y_ticks, las=2)
} else {
### for continuous explanatory variable
plot(x=c(x_min-max_x_interval, x_max+max_x_interval), y=c(y_min-y_sd, y_max+y_sd), new=FALSE, type="n", main=title, xlab=xlab, ylab=ylab, las=las, yaxt="n")
axis(side=2, at=y_ticks, labels=y_ticks, las=2)
}
### iteratively plot the density of each explanatory variable level
for (i in 1:length(x_levels)){
# i = 1
subdat = droplevels(subset(df, x_categorical==x_levels[i]))
### calculate the summary statistics of the response variable (y): mean, standard deviation, standard error, and 95% confidence interval
mu = mean(subdat$y)
sigma = sd(subdat$y)
se = sd(subdat$y)/sqrt(nrow(subdat)-1)
ci = qnorm(((CI/100)/2)+0.50) * se
if (is.na(sigma)==FALSE){
### calculate the density with binning adjustment proportional the number of observations divded by 1x10^5
d = density(subdat$y, adjust=max(c(1, nrow(subdat)/1e5)))
### restrict the range of the response variable to the input dataframe
d$y = d$y[(d$x >= min(df$y)) & (d$x <= max(df$y))]
d$x = d$x[(d$x >= min(df$y)) & (d$x <= max(df$y))]
### tranform the density (d$y) into the 0.00 to 1.00 range (in preparation to be mutiplied withe maximum interval variable: "max_x_interval")
d.y_min = min(d$y)
d.y_max = max(d$y)
d$y = (d$y - d.y_min) / (d.y_max - d.y_min)
### define the x-axis points of the polygon defined as the density values left and right of the explanatory variable (df$x) level or value
poly_x = c(x_numbers[i]-rev(d$y*max_x_interval), x_numbers[i]+(d$y*max_x_interval))
### define the y-axis points of the polygon defined as the range of values of the response variable (df$y)
poly_y = c(rev(d$x), d$x)
### draw violin polygon and error bars when the variance is greater than 0
if (sigma > 0){
### draw the polygon
polygon(x=poly_x, y=poly_y, border=NA, col=COLOURS[i])
### plot the summary statistics
suppressWarnings(arrows(x0=x_numbers[i], y0=mu+sigma, y1=mu-sigma, angle=90, code=3, lwd=2, length=0.1, col=BAR_COLOURS[1]))
suppressWarnings(arrows(x0=x_numbers[i], y0=mu+se, y1=mu-se, angle=90, code=3, lwd=2, length=0.1, col=BAR_COLOURS[2]))
suppressWarnings(arrows(x0=x_numbers[i], y0=mu+ci, y1=mu-ci, angle=90, code=3, lwd=2, length=0.1, col=BAR_COLOURS[3]))
}
}
points(x=x_numbers[i], y=mu, pch=20)
# print(x_levels[i])
# print(mean(subdat$y))
}
### plot grid lines
grid()
### show the summary statistics legend
if (sum(BAR_COLOURS!=rgb(1,0,0,alpha=0))>0){
legend("bottomright", inset=c(0, 1), xpd=TRUE, horiz=TRUE, bty="n", col=unlist(BAR_COLOURS)[BAR_COLOURS!=rgb(1,0,0,alpha=0)], cex=(par(no.readonly=TRUE)$cex*0.5), lty=1, lwd=2, legend=c("Standard Deviation", "Standard Error", paste0(CI, "% Confidence Interval"))[BAR_COLOURS!=rgb(1,0,0,alpha=0)])
}
### return the levels and unique values of the x variable
return(0)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.