#' @title Chi Square Resampler (One at a Time)
#' @description An app to illustrate use of the chi-square statistic to test
#' for a relationship between two categorical variables. The P-value is computed
#' by resampling, and the resamples are done one at a time. A histogram of resampled chi-square statistics is displayed
#' after each resample, and summary information is output to the
#' console.
#'
#' @rdname ChisqSimSlow
#' @usage ChisqSimSlow(form,data,effects=c("random","fixed"))
#' @param form a formula of the form ~x+y. When using
#' fixed effects (see below for explanation), x should be the
#' variable that is considered the predictor variable.
#' @param data A data frame from which x and y are drawn.
#' @param effects When effects="fixed", the re-sampling is performed under
#' the condition that the row sums in the re-sampled two-way table (with x
#' for rows) are the same as the row sums in the two-way table based on the
#' original data. When effects="random", then both row and column sums
#' in the re-sampled table may vary: only the sum of the counts is
#' constant. (Note: in the re-sampling procedure for chisq.test
#' in the stats package of R, both row and column sums are
#' required to equal the corresponding sums for the original data.)
#' @return Graphical and numerical output
#' @export
#' @author Homer White \email{hwhite0@@georgetowncollege.edu}
#' @examples
#' \dontrun{
#' ChisqSimSlow(~weather+crowd.behavior,data=ledgejump,effects="fixed")
#' }
ChisqSimSlow <- function(form,data,effects=c("random","fixed")) {
#form is of the form ~var1+var2, both factors
#best if var1 is explanatory, var2 is response
#data is the data frame containing these variables
#effects== "fnixed" performs resampling subject to constraint that row total
#are fixed to observed row totals
effects <- match.arg(effects)
#Some utility functions
rowsampler <- function(x,p) {rmultinom(1,size=sum(x),prob=p)} #used in slow sampling with fixed effects
table.samp <- function(x) { #used in slow sampling with fixed effects
nullprobs <- colSums(x)/sum(x)
resamp <- t(apply(x,1,rowsampler,p=nullprobs))
rownames(resamp) <- rownames(x)
colnames(resamp) <- colnames(x)
as.table(resamp)
}
slowsamp <- function(x,effects) {
rnam <- rownames(x)
cnam <- colnames(x)
sofar <- matrix(0,nrow=nrow(x),ncol=ncol(x))
rownames(sofar) <- rnam
colnames(sofar) <- cnam
callquick <- FALSE
quickfin <- function(orig,unfin,effects) { #Quick finish for fixed effects
if(effects=="fixed") {
first <- which(rowSums(unfin)<rowSums(orig))[1]
rowleft <- rowSums(orig)[first]-rowSums(unfin)[first]
for (j in 1:rowleft) {
item <- sampler(x)
unfin[first,item] <- unfin[first,item]+1
}
if (first<nrow(x)) {
for (i in (first+1):nrow(x)) {
probs <- colSums(x)/sum(colSums(x))
unfin[i,] <- rmultinom(1,size=sum(x[i,]),prob=probs)
}
}
return(unfin)
}
if (effects=="random") {
numbleft <- sum(orig)-sum(unfin)
return(unfin+rtabsamp(orig,numbleft))
}
}#end of quickfin
sampler <- function(x) { #used in slow sampling when effects are fixed
#x is a two-way table
#sample at random from column-variable distribution given by table x, assume no relationship
cdf.col <- cumsum(colSums(x))/sum(colSums(x))
samp <- which(runif(1)<cdf.col)[1]
samp
}
if (effects=="fixed") {
for (i in 1:nrow(x)) {
if (callquick==TRUE) break
for (j in 1:rowSums(x)[i]) {
w <- readline("Get an item, press Return. To finish quickly, enter any character.")
if (w=="") {
item <- sampler(x)
cat("The sampled item is",rnam[i],cnam[item],"\n")
sofar[i,item] <- sofar[i,item]+1
cat("So far, the table looks like \n")
print(sofar)
} else {callquick <- TRUE;break}
}
}
if (callquick==TRUE) {
sofar <- quickfin(orig=x,unfin=sofar,effects=effects)
}
return(sofar)
}#end of effects==finxed
if(effects=="random") {
for (i in 1:nrow(x)) {
if (callquick==TRUE) break
for (j in 1:rowSums(x)[i]) {
w <- readline("Get an item, press Return. To finish quickly, enter any character.")
if (w=="") {
item <- rtabsamp(x,1) #retunrs table for sampling just one item
rightrow <- which(rowSums(item)==1) #find row and column of th eitem
rightcol <- which(colSums(item)==1)
cat("The sampled item is",rnam[rightrow],cnam[rightcol],"\n")
sofar <- sofar+item
cat("So far, the table looks like \n")
print(sofar)
} else {callquick <- TRUE;break}
}
}
if (callquick==TRUE) {
sofar <- quickfin(orig=x,unfin=sofar,effects=effects)
}
return(sofar)
}#end of effects == random
}#end of slowsamp
exp.counts <- function(x) (rowSums(x)%*%t(colSums(x)))/sum(x)
#Resampler for "random effects": row sums not fixed.
rtabsamp <- function(x,n) {
#x is the table of observed counts
#n is the number of items to resample
#n=sum(x) amounts to one quick bootstrap resample
expected <- exp.counts(x)
probs <- expected/sum(x)
resamp.tab <- rmultinom(1,size=n,prob=probs)
resamp.tab <- matrix(resamp.tab,nrow=nrow(x))
rownames(resamp.tab) <- rownames(x)
colnames(resamp.tab) <- colnames(x)
return(resamp.tab)
}
BinFinder <- function(val,breaks) {
#breaks a numeric vector of strictly increasing values
if (val < breaks[1]) return(c(-Inf,breaks[1]))
if (val >=breaks[length(breaks)]) return(c(breaks[length(breaks)],Inf))
left <- max(which(breaks<=val))
right <- min(which(val<breaks))
return(c(breaks[left],breaks[right]))
}
RightRow <- function(counts,val) {
n <- nrow(counts)
breaks <- c(counts[,1],counts[n,2])
rightrow <- which(counts[,1]==BinFinder(val,breaks)[1])
return(rightrow)
}
UpdateCounts <- function(counts,new) {
#counts is a matrix with 3 columns:
#LH sides of bins, RH sides of bins, count in bin
#new is the new value
rightrow <- RightRow(counts,new)
counts[rightrow,3] <- counts[rightrow,3]+1
return(counts)
}
UpdateHist <- function(counts,prev,new,oldcolor,newcolor) {
#make the bin of prev all oldcolor
if(length(chisq.stats)>1) {
rr.prev <- RightRow(counts,prev)
rect(xleft=counts[rr.prev,1],
ybottom=0,
xright=counts[rr.prev,2],
ytop=counts[rr.prev,3],
col=oldcolor,border="black")
}
#add a distinct rectangle to top of bin for new value
rr.new <- RightRow(counts,new)
rect(xleft=counts[rr.new,1],
ybottom=counts[rr.new,3],
xright=counts[rr.new,2],
ytop=counts[rr.new,3]+1,
col=newcolor,border="black")
#draw vertical line at stat just in case it got obscured:
lines(x=c(stat,stat),y=c(0,ymax),lwd=2)
}
ResetHist <- function(hist,new,oldcolor) {
oldxmax <- max(hist[,2])
newxmax <- floor(new)+1
xmax <<- newxmax #write upstairs just in case it's needed later
newcountcol <- c(hist[,3],rep(0,newxmax-oldxmax))
newleftcol <- 0:(newxmax-1)
newrightcol <- 1:newxmax
hist.info <- cbind(newleftcol,newrightcol,newcountcol)
#Set up the plot:
plot(0,0,col=rgb(0,0,1,0),xlim=c(0,newxmax),ylim=c(0,ymax),
xlab="Resampled Chisq values",
ylab="Count",
main="Distribution of Resampled Chisq Values So Far")
#add the rectangles, all oldcolor
for(i in 1:nrow(hist.info)) {
rect(xleft=hist.info[i,1],
ybottom=0,
xright=hist.info[i,2],
ytop=hist.info[i,3],
col=oldcolor,border="black")
}
return(hist.info)
}
#___________________________
#Ok, get started on processing the input
prsd <- with(data,ParseFormula(form))
expname <- as.character(prsd$rhs)[2]
respname <- as.character(prsd$rhs)[3]
explanatory <- data[,expname]
response <- data[,respname]
x <- xtabs(~explanatory+response)
N <- 50 #number of times player is liable to play
overprob <- 0.025 #desired prob of going over xlimits and ylimits I will set
q.needed <- 1+log(1-overprob)/N #Using Poisson approx to binomial
deg.freedom <- (nrow(x)-1)*(ncol(x)-1)
xmax <- ceiling(qchisq(q.needed,df=deg.freedom)) #set upper limit on horiz axis
#Now go for upper limit on vertical axis (counts):
left <- 0:(xmax-1)
right <- 1:xmax
leftprobs <- pchisq(left,df=deg.freedom)
rightprobs <- pchisq(right,df=deg.freedom)
binprobs <- rightprobs-leftprobs
biggie <- max(binprobs)
ymax <- ceiling(qbinom(1-overprob,size=N,prob=biggie))
#Set up initial info for histogram:
breaks <- 0:xmax
m <- length(breaks)
hist.info <- cbind(breaks[-m],breaks[-1],rep(0,(m-1)))
prev <- 0
new <- 0
oldcolor <- "lightgreen"
newcolor <- "red"
#lay it out for the user:
cat("The observed cell counts are:","\n")
print(x)
cat("\n")
#now expected cell counts:
expected <- exp.counts(x)
rownames(expected) <- rownames(x)
colnames(expected) <- colnames(x)
cat("If there is no relationship then the expected counts are:","\n")
print(round(expected,1))
stat <- sum((x-expected)^2/expected)
cat("The observed value of the chisq statistic is:",round(stat,2),"\n")
chisq.stats <- numeric()
ymax.bar <- 1.3*max(x)
barplot(t(x),beside=TRUE,
legend.text=colnames(x),
ylim=c(0,ymax.bar),xlab=expname,
args.legend = list(x = "topleft",title=respname,cex=0.75),
col=rainbow(ncol(x)),
main=paste(expname,"and",respname))
procedure <- readline("Enter s for slow sample, f for one fast sample, q to quit.")
#Make sure they enter something:
acceptable <- (procedure %in%c("s","f","q"))
while(!acceptable) {
procedure <- readline("Hey! I said: Enter s for slow sample, f for one fast sample, q to quit.")
acceptable <- (procedure %in%c("s","f","q"))
}
if (procedure=="q") return(cat("All Done!"))
else {
plot(0,0,col=rgb(0,0,1,0),xlim=c(0,xmax),ylim=c(0,ymax),
xlab="Resampled Chisq values",
ylab="Count",
main="Distribution of Resampled Chisq Values So Far")
}
while(procedure != "q") {
max.height <- max(hist.info[,3])
if(max.height >=ymax) {#This person is persistant. Give her more head room.
cat("My, my, you ARE the persistent type!\n")
cat("I'll raise the vertical axis so you can keep going.\n")
ymax <- floor(1.5*ymax)
#Trick here: reuse the ResetHist function
ResetHist(hist.info,xmax-1,oldcolor)
}
if (procedure=="s") {
resamp <- slowsamp(x,effects=effects)
expected <- exp.counts(resamp)
newstat <- sum((resamp-expected)^2/expected)
chisq.stats <- c(chisq.stats,newstat)
}
if (procedure=="f") {
if (effects=="fixed") resamp <- table.samp(x) else resamp <- rtabsamp(x,sum(x))
expected <- exp.counts(resamp)
newstat <- sum((resamp-expected)^2/expected)
chisq.stats <- c(chisq.stats,newstat)
}
#output results to console:
cat("\n")
print(resamp)
cat("For this resample the chisq statistic is",round(newstat,2),".")
cat("Total resamples so far is",length(chisq.stats),"\n")
perc <- 100*length(chisq.stats[chisq.stats >= stat])/length(chisq.stats)
cat("\n")
cat("Percentage of times the resampled statistics have exceeded observed chisq statistic",
round(stat,2),"is",round(perc,1),"%")
#now update the histogram
#first deal with possibility that newstat exceeds old bounds:
if(newstat>xmax) hist.info <- ResetHist(hist.info,newstat,oldcolor)
#Then update:
new <- newstat
UpdateHist(hist.info,prev,new,oldcolor,newcolor)
hist.info <- UpdateCounts(hist.info,new)
prev <- new
#Query for another resample:
procedure <- readline("Enter s for slow sample, f for one fast sample, q to quit.")
#Make sure they enter something:
acceptable <- (procedure %in%c("s","f","q"))
while(!acceptable) {
procedure <- readline("Hey! I said: Enter s for slow sample, f for one fast sample, q to quit.")
acceptable <- (procedure %in%c("s","f","q"))
}
}
return(cat("All done!"))
}#end of chisqSimSlow
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.