Nothing
#' Assumption Checking Function
#'
#' Checks some basic assumptions about the suitability of network analysis on
#' your data
#'
#'
#' @param data dataframe or matrix of observational data (rows: observations, columns: nodes)
#' @param type which assumptions to check? "network" tests the suitability for
#' network analysis in general. "impact" tests the suitability for analyzing impact
#' @param percent percent difference from grand mean that is acceptable when comparing variances.
#' @param split if type="impact", specifies the type of split to utilize
#' @param plot logical. Should histograms each variable be plotted?
#' @param binary.data logical. Defaults to FALSE
#' @param na.rm logical. Should missing values be removed?
#'
#' @details
#'
#' Network analysis rests on several assumptions. Among these:
#' - Variance of each node is (roughly) equal
#' - Distributions are (roughly) normal
#'
#' Comparing networks in impact rests on additional assumptions including:
#' - Overall variances are (roughly) equal in each half
#'
#' This function checks these assumptions and notifies any violations.
#' This function is not intended as a substitute for careful data visualization
#' and independent assumption checks.
#'
#' See citations in the references section for further details.
#'
#' @references
#'
#' Terluin, B., de Boer, M. R., & de Vet, H. C. W. (2016). Differences in Connection Strength between Mental Symptoms Might Be Explained by Differences in Variance: Reanalysis of Network Data Did Not Confirm Staging. PLOS ONE, 11(11), e0155205. Retrieved from https://doi.org/10.1371/journal.pone.0155205
#'
#' @export
assumptionCheck <- function(data, type=c("network", "impact"), percent=20,
split=c("median","mean", "forceEqual", "cutEqual", "quartiles"),
plot=FALSE, binary.data=FALSE, na.rm=TRUE) {
if(match.arg(type)=="network"){
varNodes <- apply(data, 2, stats::var, na.rm=na.rm)
normNodes <- apply(data, 2, stats::shapiro.test)
normNodes <- as.numeric(sapply(normNodes,`[`,2))
varviolNodes <- vector()
normviolNodes <- vector()
if(is.null(colnames(data))) {colnames(data) <- 1:dim(data)[2]}
names(normNodes) <- names(varNodes)<-colnames(data)
grandMean <- mean(varNodes)
for(i in 1:length(varNodes)) {
if(abs((varNodes[i]/grandMean)-1)>(percent/100)) {varviolNodes[i] <- names(varNodes[i])}
if(normNodes[i]<0.01) {normviolNodes[i] <- names(varNodes[i])}
}
varviolNodes <- stats::na.omit(varviolNodes)
normviolNodes <- stats::na.omit(normviolNodes)
if(length(varviolNodes != 0)) {message("Equal node variance violation: variance of the following nodes differs more than ",
percent, "% from grand mean: \n", paste0(varviolNodes, sep=" "), "\n")}
if(length(normviolNodes != 0)) {message("Shapiro-Wilk violation (p<0.01): Visually check normality of nodes: \n",
paste0(normviolNodes, sep=" "), "\n")}
cat("Grand mean of variances = ", grandMean, "\n")
cat("Node variances \n")
print(varNodes)
cat("\n Shapiro-Wilk Normality (p) \n")
print(normNodes)
if(plot){
op <- graphics::par(mfrow=c(round((dim(data)[2]/3)+.5), 3))
for(i in 1:dim(data)[2]) {graphics::hist(data[,i], main=NULL, xlab=colnames(data)[i], ylab=NULL)}
graphics::par(op)}
}
if(match.arg(type)=="impact"){
input <- stats::na.omit(data)
table <- as.data.frame(matrix(rep(NA, 2*dim(data)[2]), dim(data)[2], 2))
for(j in 1:dim(data)[2]) {
if(binary.data==FALSE) {
if(match.arg(split)=="median"){
hi <- input[input[,j]> stats::median(input[,j]),][,-j]
lo <- input[input[,j] < stats::median(input[,j]),][,-j]
}
if(match.arg(split)=="mean"){
hi <- input[input[,j]> mean(input[,j]),][,-j]
lo <- input[input[,j] < mean(input[,j]),][,-j]
}
if(match.arg(split)=="forceEqual"){
hi <- input[input[,j] > stats::median(input[,j]),]
lo <- input[input[,j] < stats::median(input[,j]),]
med <- input[input[,j] == stats::median(input[,j]),]
random_sample_med <- med[sample(1:dim(med)[1], abs(dim(hi)[1]-dim(lo)[1])),]
if(dim(hi)[1]<dim(lo)[1]){hi <- rbind(hi, random_sample_med)}
if(dim(hi)[1]>dim(lo)[1]){lo <- rbind(lo, random_sample_med)}
hi <- hi[,-j]
lo <- lo[,-j]
}
if(match.arg(split)=="cutEqual"){
hi <- input[input[,j] > stats::median(input[,j]),]
lo <- input[input[,j] < stats::median(input[,j]),]
if(dim(hi)[1]<dim(lo)[1]){lo <- lo[-sample(1:dim(lo)[1], abs(dim(hi)[1]-dim(lo)[1])),-j]}
if(dim(hi)[1]>dim(lo)[1]){hi <- hi[-sample(1:dim(hi)[1], abs(dim(hi)[1]-dim(lo)[1])),-j]}
}
if(match.arg(split)=="quartile"){
hi <- input[input[,j]>= stats::quantile(input[,j],probs=.75),][,-j]
lo <- input[input[,j]<= stats::quantile(input[,j],probs=.25),][,-j]
}
}
if(binary.data==TRUE) {
hi <- input[input[,j]==1,][,-j]
lo <- input[input[,j]==0,][,-j]
if(match.arg(split)=="cutEqual"){
if(dim(hi)[1]<dim(lo)[1]){lo <- lo[-sample(1:dim(lo)[1], abs(dim(hi)[1]-dim(lo)[1])),]}
if(dim(hi)[1]>dim(lo)[1]){hi <- hi[-sample(1:dim(hi)[1], abs(dim(hi)[1]-dim(lo)[1])),]}
}
}
table[j,1] <- sum(diag(stats::var(lo)))
table[j,2] <- sum(diag(stats::var(hi)))
if(abs((table[j,1]/table[j,2])-1)>(percent/100)) {message("Equal network variances violation: difference in variance between halves on node \"",
colnames(data)[j], "\" is greater than ", percent)}
}
colnames(table) <- c("Low Half (variance)", "High Half (variance)")
rownames(table) <- colnames(data)
cat("Total network variances \n")
table
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.