Nothing
#' @title Chi Square Resampler (One at a Time)
#' @description An app to illustrate use of the chi-square statistic to test
#' for a relationship between two categorical variables. The P-value is computed
#' by resampling, and the resamples are done one at a time. A histogram of resampled chi-square statistics is displayed
#' after each resample, and summary information is output to the
#' console.
#'
#' @rdname ChisqSimSlow
#' @usage ChisqSimSlow(form,data,effects=c("random","fixed"))
#' @param form a formula of the form ~x+y. When using
#' fixed effects (see below for explanation), x should be the
#' variable that is considered the predictor variable.
#' @param data A data frame from which x and y are drawn.
#' @param effects When effects="fixed", the re-sampling is performed under
#' the condition that the row sums in the re-sampled two-way table (with x
#' for rows) are the same as the row sums in the two-way table based on the
#' original data. When effects="random", then both row and column sums
#' in the re-sampled table may vary: only the sum of the counts is
#' constant. (Note: in the re-sampling procedure for chisq.test
#' in the stats package of R, both row and column sums are
#' required to equal the corresponding sums for the original data.)
#' @return Graphical and numerical output
#' @export
#' @author Homer White \email{hwhite0@@georgetowncollege.edu}
#' @examples
#' \dontrun{
#' ChisqSimSlow(~weather+crowd.behavior,data=ledgejump,effects="fixed")
#' }
ChisqSimSlow <- function(form,data,effects=c("random","fixed")) {
#form is of the form ~var1+var2, both factors
#best if var1 is explanatory, var2 is response
#data is the data frame containing these variables
#effects== "fnixed" performs resampling subject to constraint that row total
#are fixed to observed row totals
effects <- match.arg(effects)
#Some utility functions
rowsampler <- function(x,p) {rmultinom(1,size=sum(x),prob=p)} #used in slow sampling with fixed effects
table.samp <- function(x) { #used in slow sampling with fixed effects
nullprobs <- colSums(x)/sum(x)
resamp <- t(apply(x,1,rowsampler,p=nullprobs))
rownames(resamp) <- rownames(x)
colnames(resamp) <- colnames(x)
as.table(resamp)
}
slowsamp <- function(x,effects) {
rnam <- rownames(x)
cnam <- colnames(x)
sofar <- matrix(0,nrow=nrow(x),ncol=ncol(x))
rownames(sofar) <- rnam
colnames(sofar) <- cnam
callquick <- FALSE
quickfin <- function(orig,unfin,effects) { #Quick finish for fixed effects
if(effects=="fixed") {
first <- which(rowSums(unfin)<rowSums(orig))[1]
rowleft <- rowSums(orig)[first]-rowSums(unfin)[first]
for (j in 1:rowleft) {
item <- sampler(x)
unfin[first,item] <- unfin[first,item]+1
}
if (first<nrow(x)) {
for (i in (first+1):nrow(x)) {
probs <- colSums(x)/sum(colSums(x))
unfin[i,] <- rmultinom(1,size=sum(x[i,]),prob=probs)
}
}
return(unfin)
}
if (effects=="random") {
numbleft <- sum(orig)-sum(unfin)
return(unfin+rtabsamp(orig,numbleft))
}
}#end of quickfin
sampler <- function(x) { #used in slow sampling when effects are fixed
#x is a two-way table
#sample at random from column-variable distribution given by table x, assume no relationship
cdf.col <- cumsum(colSums(x))/sum(colSums(x))
samp <- which(runif(1)<cdf.col)[1]
samp
}
if (effects=="fixed") {
for (i in 1:nrow(x)) {
if (callquick==TRUE) break
for (j in 1:rowSums(x)[i]) {
w <- readline("Get an item, press Return. To finish quickly, enter any character.")
if (w=="") {
item <- sampler(x)
cat("The sampled item is",rnam[i],cnam[item],"\n")
sofar[i,item] <- sofar[i,item]+1
cat("So far, the table looks like \n")
print(sofar)
} else {callquick <- TRUE;break}
}
}
if (callquick==TRUE) {
sofar <- quickfin(orig=x,unfin=sofar,effects=effects)
}
return(sofar)
}#end of effects==finxed
if(effects=="random") {
for (i in 1:nrow(x)) {
if (callquick==TRUE) break
for (j in 1:rowSums(x)[i]) {
w <- readline("Get an item, press Return. To finish quickly, enter any character.")
if (w=="") {
item <- rtabsamp(x,1) #retunrs table for sampling just one item
rightrow <- which(rowSums(item)==1) #find row and column of th eitem
rightcol <- which(colSums(item)==1)
cat("The sampled item is",rnam[rightrow],cnam[rightcol],"\n")
sofar <- sofar+item
cat("So far, the table looks like \n")
print(sofar)
} else {callquick <- TRUE;break}
}
}
if (callquick==TRUE) {
sofar <- quickfin(orig=x,unfin=sofar,effects=effects)
}
return(sofar)
}#end of effects == random
}#end of slowsamp
exp.counts <- function(x) (rowSums(x)%*%t(colSums(x)))/sum(x)
#Resampler for "random effects": row sums not fixed.
rtabsamp <- function(x,n) {
#x is the table of observed counts
#n is the number of items to resample
#n=sum(x) amounts to one quick bootstrap resample
expected <- exp.counts(x)
probs <- expected/sum(x)
resamp.tab <- rmultinom(1,size=n,prob=probs)
resamp.tab <- matrix(resamp.tab,nrow=nrow(x))
rownames(resamp.tab) <- rownames(x)
colnames(resamp.tab) <- colnames(x)
return(resamp.tab)
}
BinFinder <- function(val,breaks) {
#breaks a numeric vector of strictly increasing values
if (val < breaks[1]) return(c(-Inf,breaks[1]))
if (val >=breaks[length(breaks)]) return(c(breaks[length(breaks)],Inf))
left <- max(which(breaks<=val))
right <- min(which(val<breaks))
return(c(breaks[left],breaks[right]))
}
RightRow <- function(counts,val) {
n <- nrow(counts)
breaks <- c(counts[,1],counts[n,2])
rightrow <- which(counts[,1]==BinFinder(val,breaks)[1])
return(rightrow)
}
UpdateCounts <- function(counts,new) {
#counts is a matrix with 3 columns:
#LH sides of bins, RH sides of bins, count in bin
#new is the new value
rightrow <- RightRow(counts,new)
counts[rightrow,3] <- counts[rightrow,3]+1
return(counts)
}
UpdateHist <- function(counts,prev,new,oldcolor,newcolor) {
#make the bin of prev all oldcolor
if(length(chisq.stats)>1) {
rr.prev <- RightRow(counts,prev)
rect(xleft=counts[rr.prev,1],
ybottom=0,
xright=counts[rr.prev,2],
ytop=counts[rr.prev,3],
col=oldcolor,border="black")
}
#add a distinct rectangle to top of bin for new value
rr.new <- RightRow(counts,new)
rect(xleft=counts[rr.new,1],
ybottom=counts[rr.new,3],
xright=counts[rr.new,2],
ytop=counts[rr.new,3]+1,
col=newcolor,border="black")
#draw vertical line at stat just in case it got obscured:
lines(x=c(stat,stat),y=c(0,ymax),lwd=2)
}
ResetHist <- function(hist,new,oldcolor) {
oldxmax <- max(hist[,2])
newxmax <- floor(new)+1
xmax <<- newxmax #write upstairs just in case it's needed later
newcountcol <- c(hist[,3],rep(0,newxmax-oldxmax))
newleftcol <- 0:(newxmax-1)
newrightcol <- 1:newxmax
hist.info <- cbind(newleftcol,newrightcol,newcountcol)
#Set up the plot:
plot(0,0,col=rgb(0,0,1,0),xlim=c(0,newxmax),ylim=c(0,ymax),
xlab="Resampled Chisq values",
ylab="Count",
main="Distribution of Resampled Chisq Values So Far")
#add the rectangles, all oldcolor
for(i in 1:nrow(hist.info)) {
rect(xleft=hist.info[i,1],
ybottom=0,
xright=hist.info[i,2],
ytop=hist.info[i,3],
col=oldcolor,border="black")
}
return(hist.info)
}
#___________________________
#Ok, get started on processing the input
prsd <- with(data,ParseFormula(form))
expname <- as.character(prsd$rhs)[2]
respname <- as.character(prsd$rhs)[3]
explanatory <- data[,expname]
response <- data[,respname]
x <- xtabs(~explanatory+response)
N <- 50 #number of times player is liable to play
overprob <- 0.025 #desired prob of going over xlimits and ylimits I will set
q.needed <- 1+log(1-overprob)/N #Using Poisson approx to binomial
deg.freedom <- (nrow(x)-1)*(ncol(x)-1)
xmax <- ceiling(qchisq(q.needed,df=deg.freedom)) #set upper limit on horiz axis
#Now go for upper limit on vertical axis (counts):
left <- 0:(xmax-1)
right <- 1:xmax
leftprobs <- pchisq(left,df=deg.freedom)
rightprobs <- pchisq(right,df=deg.freedom)
binprobs <- rightprobs-leftprobs
biggie <- max(binprobs)
ymax <- ceiling(qbinom(1-overprob,size=N,prob=biggie))
#Set up initial info for histogram:
breaks <- 0:xmax
m <- length(breaks)
hist.info <- cbind(breaks[-m],breaks[-1],rep(0,(m-1)))
prev <- 0
new <- 0
oldcolor <- "lightgreen"
newcolor <- "red"
#lay it out for the user:
cat("The observed cell counts are:","\n")
print(x)
cat("\n")
#now expected cell counts:
expected <- exp.counts(x)
rownames(expected) <- rownames(x)
colnames(expected) <- colnames(x)
cat("If there is no relationship then the expected counts are:","\n")
print(round(expected,1))
stat <- sum((x-expected)^2/expected)
cat("The observed value of the chisq statistic is:",round(stat,2),"\n")
chisq.stats <- numeric()
ymax.bar <- 1.3*max(x)
barplot(t(x),beside=TRUE,
legend.text=colnames(x),
ylim=c(0,ymax.bar),xlab=expname,
args.legend = list(x = "topleft",title=respname,cex=0.75),
col=rainbow(ncol(x)),
main=paste(expname,"and",respname))
procedure <- readline("Enter s for slow sample, f for one fast sample, q to quit.")
#Make sure they enter something:
acceptable <- (procedure %in%c("s","f","q"))
while(!acceptable) {
procedure <- readline("Hey! I said: Enter s for slow sample, f for one fast sample, q to quit.")
acceptable <- (procedure %in%c("s","f","q"))
}
if (procedure=="q") return(cat("All Done!"))
else {
plot(0,0,col=rgb(0,0,1,0),xlim=c(0,xmax),ylim=c(0,ymax),
xlab="Resampled Chisq values",
ylab="Count",
main="Distribution of Resampled Chisq Values So Far")
}
while(procedure != "q") {
max.height <- max(hist.info[,3])
if(max.height >=ymax) {#This person is persistant. Give her more head room.
cat("My, my, you ARE the persistent type!\n")
cat("I'll raise the vertical axis so you can keep going.\n")
ymax <- floor(1.5*ymax)
#Trick here: reuse the ResetHist function
ResetHist(hist.info,xmax-1,oldcolor)
}
if (procedure=="s") {
resamp <- slowsamp(x,effects=effects)
expected <- exp.counts(resamp)
newstat <- sum((resamp-expected)^2/expected)
chisq.stats <- c(chisq.stats,newstat)
}
if (procedure=="f") {
if (effects=="fixed") resamp <- table.samp(x) else resamp <- rtabsamp(x,sum(x))
expected <- exp.counts(resamp)
newstat <- sum((resamp-expected)^2/expected)
chisq.stats <- c(chisq.stats,newstat)
}
#output results to console:
cat("\n")
print(resamp)
cat("For this resample the chisq statistic is",round(newstat,2),".")
cat("Total resamples so far is",length(chisq.stats),"\n")
perc <- 100*length(chisq.stats[chisq.stats >= stat])/length(chisq.stats)
cat("\n")
cat("Percentage of times the resampled statistics have exceeded observed chisq statistic",
round(stat,2),"is",round(perc,1),"%")
#now update the histogram
#first deal with possibility that newstat exceeds old bounds:
if(newstat>xmax) hist.info <- ResetHist(hist.info,newstat,oldcolor)
#Then update:
new <- newstat
UpdateHist(hist.info,prev,new,oldcolor,newcolor)
hist.info <- UpdateCounts(hist.info,new)
prev <- new
#Query for another resample:
procedure <- readline("Enter s for slow sample, f for one fast sample, q to quit.")
#Make sure they enter something:
acceptable <- (procedure %in%c("s","f","q"))
while(!acceptable) {
procedure <- readline("Hey! I said: Enter s for slow sample, f for one fast sample, q to quit.")
acceptable <- (procedure %in%c("s","f","q"))
}
}
return(cat("All done!"))
}#end of chisqSimSlow
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.