#' @title Cumulative eigenvalue plot
#'
#' @description Compares eigenvalues by cumulative Bland-Altman type plots. Kolmogorov-Smirnov and Cramer-von Mises tests evaluated by permutation tests
#'
#' @param data A dataset, either a \code{data.frame} or a \code{matrix} with variables
#' in columns and observations in rows. WHAT TO DO ABOUT DATA.TABLES AND ALSO TIBBLES. BOTH WILL BE
#' PROBLEMATIC AS OF NOW.
#'
#' @param splitBy A grouping variable with two levels defining the two groups within the
#' dataset whose data structures we wish to compare. If \code{splitBy} has more than
#' two levels, only the first two levels are used.
#'
#' @param var The variable names in \code{data} to include in the PCADSC. If \code{NULL}
#' (the default), all variables except for \code{splitBy} are used.
#'
#' @param make.plot Should a plot be generated?
#'
#' @param B Number of simulations
#'
#' @examples
#' #load iris data
#' data(iris)
#'
#' #Define grouping variable, grouping the observations by whether their species is
#' #Setosa or not
#' iris$group <- "setosa"
#' iris$group[iris$Species != "setosa"] <- "non-setosa"
#'
#' #make a PCADSC object, splitting the data by "group"
#' cePlot(iris, "group",var=setdiff(names(iris), c("group", "Species")))
#'
#' @importFrom graphics polygon matlines lines legend
#' @importFrom stats quantile
#' @importFrom ggplot2 ggplot aes_string geom_polygon geom_line scale_x_continuous
#' scale_y_continuous theme_bw theme element_blank xlab ylab geom_line
#' scale_linetype_manual annotate unit
#' @export
cePlot <- function(data,splitBy,var=NULL,B=1000,make.plot=TRUE) {
#define var
if (is.null(var)) var <- setdiff(names(data), splitBy)
#TO DO:
#- check if any variables are essentially empty. This will cause an error
# as we then divide by zero when standardizing
#- Make sure it deals with factor splitBy varibales correctly
#If data is tibble, data.table or matrix, convert it to matrix
#If data is neither, throw error
if (any(class(data) %in% c("data.table", "tbl", "tbl_df", "matrix"))) {
data <- as.data.frame(data)
message("Note: The data was converted to a data.frame in order for cumeigen to run.")
}
if (class(data) != "data.frame") {
stop("The inputted data must be of type data.frame, data.table, tibble or matrix.")
}
#check if all variables are numeric
isNum <- sapply(data[, var], "is.numeric")
if (!all(isNum)) {
stop(paste("All variables must be numeric for PCA decomposition",
"to be meaningful. The following non-numeric variables",
"were found:", paste(names(isNum[!isNum]), collapse = ", ")))
}
#check if splitBy has more/less than two levels
splitLevels <- unique(data[, splitBy])
if (length(splitLevels) != 2) {
stop(paste("splitLevels must have exactly two levels,",
"but it was found to have", length(splitLevels),
"levels."))
}
# my stdData
stddata <- function(data) {
as.data.frame(lapply(data, function(x) (x - mean(x))/sd(x)))
}
# joint x values
n <- nrow(data)
d <- length(var)
x <- c(0,cumsum(eigen(1/(n-1)*t(as.matrix(stddata(data[,var])))%*%as.matrix(stddata(data[,var])),only.values=TRUE)$values))
#split and standardize data
data1 <- as.matrix(stddata(data[data[, splitBy]==splitLevels[1], var]))
data2 <- as.matrix(stddata(data[data[, splitBy]==splitLevels[2], var]))
n1 <- nrow(data1)
n2 <- nrow(data2)
# observed y values
y1 <- eigen(1/(n1-1)*t(data1)%*%data1,only.values=TRUE)$values
y2 <- eigen(1/(n2-1)*t(data2)%*%data2,only.values=TRUE)$values
y.obs <- c(0,cumsum(y1-y2))
KS.obs <- max(abs(y.obs))
CvM.obs <- sum(x*(y.obs^2))
# simulate partitions
y.sim <- matrix(0,d+1,B)
KS.sim <- rep(0,B)
CvM.sim <- rep(0,B)
for (i in 1:B) {
ii <- sample(1:n,n1)
data1 <- as.matrix(stddata(data[ii,var]))
data2 <- as.matrix(stddata(data[-ii,var]))
y1 <- eigen(1/(n1-1)*t(data1)%*%data1,only.values=TRUE)$values
y2 <- eigen(1/(n2-1)*t(data2)%*%data2,only.values=TRUE)$values
y.sim[,i] <- c(0,cumsum(y1-y2))
KS.sim[i] <- max(abs(y.sim[,i]))
CvM.sim[i] <- sum(x*(y.sim[,i]^2))
}
# p-values
KS.pvalue <- mean(KS.sim >= KS.obs)
CvM.pvalue <- mean(CvM.sim >= CvM.obs)
if (!make.plot) {
# return
return(list(KS.pvalue=KS.pvalue,CvM.pvalue=CvM.pvalue))
}
# make plot
# if (make.plot) {
y.min <- apply(y.sim,1,quantile,probs=0.025)
y.max <- apply(y.sim,1,quantile,probs=0.975)
nSims <- min(20, B)
y.sim.small <- y.sim[, 1:nSims]
pFsim <- data.frame(x = rep(x, ncol(y.sim.small)),
y = c(y.sim.small),
type = c(rep("sim", ncol(y.sim.small)*nrow(y.sim.small))),
run = c(rep(1:ncol(y.sim.small), each = length(x))))
yMaxVal <- 1.01*max(c(y.obs,as.vector(y.sim)))
yMinVal <- 1.01*min(c(y.obs,as.vector(y.sim)))
yBreaks <- round(c(0 + yMaxVal * c(1/3, 2/3, 3/3), 0,
0 - yMaxVal * c(1/3, 2/3, 3/3)),1)
ggplot(pFsim, aes_string(x = "x", y = "y")) +
annotate(geom = "polygon", x = c(x, rev(x)), y = c(y.min, rev(y.max)),
fill = "aliceblue") +
geom_line(aes_string(group = "run", linetype = factor("run")), col = "grey", size = 0) +
scale_x_continuous(limits = c(x[1],x[d+1]), breaks = 0:length(x)) +
scale_y_continuous(limits = c(yMinVal, yMaxVal),
breaks = yBreaks) +
theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
xlab("Cumulated joint eigenvalues") +
ylab("Cumulated difference of eigenvalues") +
scale_linetype_manual(guide = FALSE, values = rep(1:5, 5*ceiling(nSims/5))) +
annotate(geom = "label", label = paste(paste(c("KS: p = ","CvM: p = "),
round(c(KS.pvalue,CvM.pvalue),3),sep=""),
collapse = "\n"),
x = -Inf, y = Inf, hjust = "left", vjust = "top",
label.r = unit(0, "lines"),
label.padding = unit(1, "lines")) +
annotate(geom = "line", x = x, y = y.obs, size = 1)
#}
# if (make.plot) {
# y.min <- apply(y.sim,1,quantile,probs=0.025)
# y.max <- apply(y.sim,1,quantile,probs=0.975)
# plot(c(x[1],x[d+1]),c(1.01*min(c(y.obs,as.vector(y.sim))),
# 1.01*max(c(y.obs,as.vector(y.sim)))),type="n",
# xlab="Cumulated joint eigenvalues",ylab="Cumulated difference of eigenvalues")
# polygon(c(x,rev(x)),c(y.min,rev(y.max)),col="aliceblue",border=NA)
# matlines(x,y.sim[,1:min(20,B)],col="gray")
# lines(x,y.obs,lwd=2)
# legend("topleft",paste(c("KS: p=","CvM: p="),round(c(KS.pvalue,CvM.pvalue),3),sep=""))
# }
# return
#return(list(KS.pvalue=KS.pvalue,CvM.pvalue=CvM.pvalue))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.