#' Chi Squared Experiment
#'
#' Chi.2.eks with custom chi2 function that generates a chi2 statistic as well as a craemer's v
#' for a given input. The output is displayed in terms of outputs and exposures in the form of the data frame p.df
#' and list spdf
#'
#' @param data an R object (data set)
#' @param out.idx Integer referring to the index of the name that matches "piqola"
#' @param min.length Numeric referring to the minimum length
#' @param max.levels Numeric referring to the maximum amount of levels
#' @param ver.idx Integer used in forecasting
#' @param out.raw Logical to display raw list
#' @param out.raw.name Character vector containing the name of the raw list
#' @param out.df Logical to display data frame
#' @param out.df.name Character vector containing the name of the data frame
#' @param out.list Logical to display list
#' @param out.list.name Character vector containing the name of the list
#' @param p.cutoff Numeric that refers to the probability coëfficient
#' @param dig Numeric that refers to the number of digits
#' @param suiwer Logical to leave those that give warnings
#' @param verbose Logical to display messages
#' @param debug Logical for debugging process
#' @param xlsx Logical to write xlsx file
#' @param outdir Character vector that contains the output directory
#' @param xlsx.name Character vector containing the name of the written xlsx file
#' @param plot Logical whether to plot the data
#' @param net.name Character vector containing the name of the gml network diagram
#' @export
chi.2.eksp <- function(data = des,
out.idx = grep("piqola", names(data)),
min.length = 4*16,
max.levels = 12,
ver.idx = NULL,
out.raw = FALSE,
out.raw.name = "out.raw",
out.df = TRUE,
out.df.name = "p.df",
out.list = TRUE,
out.list.name = "spdf",
p.cutoff = 0.05, # standard p.value
dig = 3, # number of digits
suiwer = TRUE, # los die wat 'n warning gee
verbose = TRUE,
debug = TRUE,
xlsx = TRUE,
outdir = datadir,
xlsx.name = "PIQOLA_chi2_uitkomste.xlsx",
plot = TRUE,
net.name = paste(naam, "chinet")){
message("Jy benodig die withWarnings funksie vir hierdie ding om te werk\nJy benodig ook die openxlsx pakket as jy xlsx wil uitskryf en igraphs as jy die netwerk-grafiek wil hê")
if (verbose == TRUE) {message("out.idx is ", length(out.idx), " lank" )}
dx <- which(apply(X = as.array(out.idx), MARGIN = 1, FUN = function(x) {sum(table(data[,x]))}) > min.length)
#dx = which(sapply(data[,out.idx], function(x) {sum(table(x))}) > min.length)
if (verbose == TRUE) message("out.idx is ", length(out.idx), " lank en dx is ", paste(dx, " ") )
out.idx = out.idx[dx]
xx = sapply(data[out.idx], nlevels) >= 2
if (verbose == TRUE) message("xx is ", paste(xx, " ", sep=""))
#which(sapply(data[out.idx], function(x) length(levels(as.factor(as.character(x)[which(table(x) != 0 & names(table(x)) != "")]))) >1))
out.idx = out.idx[xx]
if (verbose == TRUE) message("out.idx is ", paste(out.idx, " ", sep=""))
# if (verbose == TRUE) message("out.idx is ", length(out.idx), " lank na lank genoeg filter en lank genoeg is ", paste(lank.genoeg," ") ,"\n")
zz <- ((apply(X = as.array(out.idx), MARGIN = 1, FUN = function(idx) {nlevels(data[, idx])})) < max.levels)
#zz <- sapply(data[,out.idx], nlevels) < max.levels
if (verbose == TRUE) message("zz is ", paste(zz, " ", sep=""), "\n")
te.lank = na.omit(as.integer(which(zz == FALSE)))
if (verbose == TRUE) message("te lank is ", length(te.lank), " lank en dis ", paste(te.lank, " ") )
if (length(te.lank) > 0) {out.idx <- out.idx[- te.lank[!is.na(te.lank)]]}
if (verbose == TRUE) message("out.idx is ", length(out.idx), " lank na te lank filter")
if (verbose == TRUE) message("out.idx is nou ", paste(out.idx, " ", sep=""))
if (length(out.idx) < 1) {
warning("Hierdie veranderlike(s) is nie geskik vir 'n chi2 toets ens. nie. Ek stuur NULL terug.")
return(NULL)
}
if (verbose == TRUE ) message("Uitkomste:\n", paste(names(data)[out.idx], "\n", sep=""))
if (is.null(ver.idx) == TRUE) {ver.idx <- out.idx}
if (verbose == TRUE) {message("ver.idx is ", length(ver.idx), " lank")}
if (verbose == TRUE ) {message("Voorspellers:\n", paste(names(data)[ver.idx], "\n", sep="")) }
if (debug == TRUE){
assign("out.idx", out.idx, envir=.GlobalEnv)
assign("ver.idx", ver.idx, envir=.GlobalEnv)
}
z = apply(X = as.array(out.idx), MARGIN = 1, FUN = function(oidx) {
x <- data[, oidx]
apply(X = as.array(ver.idx), MARGIN = 1, FUN = function(vidx) {
w <- data[, vidx]
#if (debug == TRUE) message("x ", levels(as.factor(x)))
#if (debug == TRUE) message("w ", levels(as.factor(w)))
if (sum(margin.table(table(x, w), 2)[1]) != 0 &
sum(margin.table(table(x, w), 2)[2]) != 0 &
sum(margin.table(table(x, w), 1)[1]) != 0 &
sum(margin.table(table(x, w), 1)[2]) != 0 ) {
#returnobj <- withWarnings(my.chisq.test(x=x, y=w, correct=TRUE, simulate.p.value=FALSE))
returnobj <- my.chisq.test(x=x, y=w, correct=TRUE, simulate.p.value=FALSE)
return(returnobj)
}
return(NULL)
})
})
if (length(z) == 1) z <- z[[1]]
if (is.null(z)) {
warning("x en w sou nie saamgewerk het vir die my.chisq.test nie, so ons kan niks verder doen nie. Stuur NULL terug.")
return(NULL)
}
if (all(is.null(z))) {
warning("x en w sou nie saamgewerk het vir die my.chisq.test nie, so ons kan niks verder doen nie. Stuur NULL terug.")
return(NULL)
}
if (debug == TRUE) assign("z", z, envir = .GlobalEnv)
if (debug == TRUE) assign("ver.idx", ver.idx, envir = .GlobalEnv)
if (debug == TRUE) assign("data", data, envir = .GlobalEnv)
names(z) = names(data)[ver.idx]
names(z) = paste(names(z),"__",sep="")
if (out.raw == TRUE){
if (verbose == TRUE ) message("Ek skryf die rou lys uit want jy het gevra. Sy naam is: ", out.raw.name , " en hy is ", length(z) , " items lank")
assign(out.raw.name, z, envir=.GlobalEnv)
}
### Maak die out.df
#p.values = unlist(z)[grep("p.value$", names(unlist(z)))]
p.values = unlist(z)[grep("p.value", names(unlist(z)), fixed = TRUE)]
df = unlist(z)[grep("df", names(unlist(z)))]
statistic = unlist(z)[grep("statistic", names(unlist(z)))]
n = unlist(z)[grep("\\.n$", names(unlist(z)))]
craemer = unlist(z)[grep("craemer", names(unlist(z)))]
out.names <- rep(names(data)[out.idx], each = length(ver.idx))
ver.names <- rep(names(data)[ver.idx], times = length(out.idx))
#warnings = unlist(lapply(z, lapply, "[", "Warnings"))
#names(warnings) = gsub("__.","__",names(warnings))
#waarsku = rep("Geen", length(p.values))
waarsku = rep("None", length(p.values))
#war.idx = match(gsub("Warnings", "", names(warnings)), gsub("Value.p.value", "", p.names))
# war.idx = match(gsub("warnings", "", names(warnings)), gsub("Value.p.value", "", p.names))
# if (any(!is.na(war.idx))) { waarsku[war.idx[!is.na(war.idx)]] = warnings[war.idx[!is.na(war.idx)]]}
p.df <- data.frame(outcome = out.names,
exposure = ver.names,
p.value = round(as.numeric(p.values), dig),
df = df,
n = n,
statistic = round(as.numeric(statistic), dig),
craemer = round(as.numeric(craemer), dig),
warnings = waarsku
)
p.df = p.df[order(p.df$p.value),]
rownames(p.df) = 1:nrow(p.df)
if (verbose == TRUE) message("Dataframe van ", dim(p.df)[1], " by ", dim(p.df)[2], " gemaak\nName is: ", paste(names(p.df), " ", sep=" "))
# hierdie is suboptimaal: gebruik transform
# maak hom korter:
#if (suiwer == TRUE) p.df = p.df[which(p.df$warnings == "Geen"), ]
if (suiwer == TRUE) p.df = p.df[which(p.df$warnings == "None"), ]
p.cutoff.idx = which(p.df$p.value < p.cutoff)
if (length(p.cutoff.idx) > 0){
p.df = p.df[p.cutoff.idx, ]
if (verbose) message("p.cutoff is ", p.cutoff)
}else{
stop("Nie een p waarde is groter as p.cutoff nie")
}
ref.idx = which(p.df$outcome != p.df$exposure)
if (length(ref.idx) > 0){
p.df = p.df[ref.idx, ] # remove self-reference
}
if (out.df == TRUE){
message("Ek skryf 'n dataframe uit want jy het gevra. Sy naam is ", out.df.name, "\nsy dimensies is ", paste(dim(p.df), collapse = " by "))
assign(out.df.name, p.df, envir=.GlobalEnv)
}
## Maak out.list
spdf = split(p.df, f=p.df$outcome)
if (verbose == TRUE) message("\nspdf gemaak deur p.df te split volgens outcome. Sy lengte is ", length(spdf))
if (verbose == TRUE & debug == TRUE) message("Resultaat-lys gemaak: lengte van ", length(spdf))
nspdf = gsub("(^[[:print:]]+)_+[[:print:]]+\\.p\\.value", "\\1", names(spdf))
if (verbose == TRUE) message("nspdf is :", paste(nspdf, " "))
if (verbose == TRUE & debug == TRUE) message("Resultaat-lys gemaak: lengte van ", length(spdf))
names(spdf) = nspdf
if (verbose == TRUE) message("\nspdf se naam gegee")
if (length(grep("\\.$", names(data))) > 0) {
names(data) = gsub("\\.$", "" , names(data))
}
if (verbose == TRUE & debug == TRUE ) {
message("\nnames spdf \n", paste(names(spdf), " "))
}
if (verbose == TRUE & debug == TRUE) {
##message("\ndata \n", paste(names(data), " "))
}
if (verbose == TRUE & debug == TRUE) {
message("\ngrep('\\.1$', names(spdf))", grep("\\.1$", names(spdf), value=TRUE))
}
if (length(grep("\\.1$", names(spdf))) > 0) {spdf = spdf[-grep("\\.1$", names(spdf))]}
if (verbose == TRUE & debug == TRUE) {message("\nspdf dim ", length(spdf))}
# s.idx = lapply(data[,nspdf], FUN=function(x) {sum(table(x))})
# if (verbose == TRUE & debug == TRUE) {message("\ns.idx ", s.idx)}
# spdf = spdf[which(s.idx > 16*4)] # throw away the short variables
if (verbose == TRUE) message("\nResultaat-lys: lengte van ", length(spdf))
if (out.list == TRUE){
if (verbose == TRUE) message("Ek skryf 'n lys uit want jy het gevra. Sy naam is ", out.list.name)
assign(out.list.name, spdf, envir=.GlobalEnv)
}
if (xlsx == TRUE){
old.wd = getwd()
setwd(outdir)
library(openxlsx)
write.xlsx(p.df,
file=paste(xlsx.name, ".xlsx", sep=""),
sheetName="p.values")
if (verbose == TRUE) message("Ek skryf 'n xlsx uit\nSy naam is ", xlsx.name, " en hy sit in \n", outdir)
setwd(old.wd)
}
if (plot == TRUE){
require(igraph)
if (nrow(p.df) > 1){
p.dfGraph = p.df
p.dfGraph[, "outcome"] = gsub("piqola_", "", p.dfGraph[, "outcome"])
p.dfGraph[, "exposure"] = gsub("piqola_", "", p.dfGraph[, "exposure"])
g = graph.data.frame(p.dfGraph[,c("outcome", "exposure")])
if (debug == TRUE) assign("p.df", p.df, envir = .GlobalEnv)
plot(g, vertex.size = 6, vertex.size2 = 1 , mark.border=0.25, edge.arrow.size=0.25, vertex.label.dist = 0.47)
if(!exists("naam")) naam = "g"
assign(net.name, g, envir=.GlobalEnv)
write.graph(g, file = paste(outdir, net.name, ".graphml", sep=""), format = "graphml")
if (verbose == TRUE) message("Ek skryf 'n gml netwerk diagram uit\nSy naam is ", net.name, " en hy sit in \n", outdir)
}
}
if (!is.null(p.df)) {
for (c in 1:ncol(p.df)) {
if (is.numeric(p.df[, c])) {
p.df[, c] <- round(x = p.df[, c], digits = 5)
}
}
}
return(p.df)
}
#' my.chisq.test
#'
#' Helper function called for chi.2.exp that also supplies n and craemer's v. Test this vs
#' assocstats in vcd package
#'
#' @param x Data frame
#' @param y R object that x will be compared to
#' @param correct Logical that applies Yates continuity correction
#' @param p Integer that applies the replicate function over the length of x
#' @param rescale.p Logical to rescale the probability
#' @param simulate.p.value Logical that simulates a probability value
#' @param B Numeric that describes the number of replicates the simulated p value is based on
#' @export
my.chisq.test <- function (x, y = NULL, correct = TRUE, p = rep(1/length(x), length(x)),
rescale.p = FALSE, simulate.p.value = FALSE, B = 2000)
{
DNAME <- deparse(substitute(x))
if (is.data.frame(x))
x <- as.matrix(x)
if (is.matrix(x)) {
if (min(dim(x)) == 1L)
x <- as.vector(x)
}
if (!is.matrix(x) && !is.null(y)) {
if (length(x) != length(y))
stop("'x' and 'y' must have the same length")
DNAME2 <- deparse(substitute(y))
xname <- if (length(DNAME) > 1L || nchar(DNAME, "w") >
30)
""
else DNAME
yname <- if (length(DNAME2) > 1L || nchar(DNAME2, "w") >
30)
""
else DNAME2
OK <- complete.cases(x, y)
x <- factor(x[OK])
y <- factor(y[OK])
if ((nlevels(x) < 2L) || (nlevels(y) < 2L))
stop("'x' and 'y' must have at least 2 levels")
x <- table(x, y)
names(dimnames(x)) <- c(xname, yname)
DNAME <- paste(paste(DNAME, collapse = "\n"), "and",
paste(DNAME2, collapse = "\n"))
}
if (any(x < 0) || any(is.na(x)))
stop("all entries of 'x' must be nonnegative and finite")
if ((n <- sum(x)) == 0)
stop("at least one entry of 'x' must be positive")
if (simulate.p.value) {
setMETH <- function() METHOD <<- paste(METHOD, "with simulated p-value\n\t (based on",
B, "replicates)")
almost.1 <- 1 - 64 * .Machine$double.eps
}
if (is.matrix(x)) {
METHOD <- "Pearson's Chi-squared test"
nr <- as.integer(nrow(x))
nc <- as.integer(ncol(x))
if (is.na(nr) || is.na(nc) || is.na(nr * nc))
stop("invalid nrow(x) or ncol(x)", domain = NA)
sr <- rowSums(x)
sc <- colSums(x)
E <- outer(sr, sc, "*")/n
v <- function(r, c, n) c * r * (n - r) * (n - c)/n^3
V <- outer(sr, sc, v, n)
dimnames(E) <- dimnames(x)
if (simulate.p.value && all(sr > 0) && all(sc > 0)) {
setMETH()
tmp <- .Call(C_chisq_sim, sr, sc, B, E)
STATISTIC <- sum(sort((x - E)^2/E, decreasing = TRUE))
PARAMETER <- NA
PVAL <- (1 + sum(tmp >= almost.1 * STATISTIC))/(B +
1)
}
else {
if (simulate.p.value)
warning("cannot compute simulated p-value with zero marginals")
if (correct && nrow(x) == 2L && ncol(x) == 2L) {
YATES <- min(0.5, abs(x - E))
if (YATES > 0)
METHOD <- paste(METHOD, "with Yates' continuity correction")
}
else YATES <- 0
STATISTIC <- sum((abs(x - E) - YATES)^2/E)
PARAMETER <- (nr - 1L) * (nc - 1L)
PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
}
}
else {
if (length(x) == 1L)
stop("'x' must at least have 2 elements")
if (length(x) != length(p))
stop("'x' and 'p' must have the same number of elements")
if (any(p < 0))
stop("probabilities must be non-negative.")
if (abs(sum(p) - 1) > sqrt(.Machine$double.eps)) {
if (rescale.p)
p <- p/sum(p)
else stop("probabilities must sum to 1.")
}
METHOD <- "Chi-squared test for given probabilities"
E <- n * p
V <- n * p * (1 - p)
STATISTIC <- sum((x - E)^2/E)
names(E) <- names(x)
if (simulate.p.value) {
setMETH()
nx <- length(x)
sm <- matrix(sample.int(nx, B * n, TRUE, prob = p),
nrow = n)
ss <- apply(sm, 2L, function(x, E, k) {
sum((table(factor(x, levels = 1L:k)) - E)^2/E)
}, E = E, k = nx)
PARAMETER <- NA
PVAL <- (1 + sum(ss >= almost.1 * STATISTIC))/(B + 1)
}
else {
PARAMETER <- length(x) - 1
PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
}
}
N = sum(x)
K = min(dim(x))
CRAEMER = sqrt(STATISTIC / (sum(x) * (K-1)))
names(STATISTIC) <- "X-squared"
names(PARAMETER) <- "df"
names(N) <- "n"
names(K) <- "k"
names(CRAEMER) <- "creamer"
if (any(E < 5) && is.finite(PARAMETER))
warning("Chi-squared approximation may be incorrect")
WARN <- any(E < 5) && is.finite(PARAMETER)
structure(list(statistic = STATISTIC,
parameter = PARAMETER,
p.value = PVAL,
method = METHOD,
data.name = DNAME,
observed = x,
expected = E,
residuals = (x - E)/sqrt(E),
stdres = (x - E)/sqrt(V),
n = N,
warnings = WARN,
k = K,
craemer = CRAEMER
),
class = "htest")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.