knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
library(rART) set.seed(123) data("mr2015")
This example is from Section 4.2 of "A User's Guide to Approximate Randomization Tests with a Small Number of Clusters". It is a replication of Munyo and Rossi (2015) which studies criminal recidivism of former prisoners by looking at the relationship between the number of inmates released from incarceration on a given day and the number of offenses committed on the same day.
We start by making time based pseudo-clusters. The number of pseudo-clusters is at the digression of the analyst. We generate clusers according to the following table
nr <- nrow(mr2015) cluster_counts <- c(8L,10L,16L) knitr::kable(cbind( "# of Clusters (q)" = cluster_counts, "Cluster Size" = sapply(cluster_counts, function(q){floor(nr/q)}) ))
with the remainder thrown into the last cluster. We first generate the cluster vectors and add them to the data frame.
makecluster <- function(q, n) { cs <- floor(n/q); # cluster size cluster <- rep(1L:q, each = cs) cluster <- c(cluster, rep.int(q, n-(cs*q))) # add remainder to cluster q cluster } # get a matrix of cluster data clustercol <- vapply(cluster_counts, makecluster, numeric(nr), n = nr) # make names for the clusters (clnames <- paste("clusters", cluster_counts, sep = "")) # append data to mr2015 mr2015[clnames] <- clustercol
We start with the first specification as it is the simplest.
We run it once for 10 clusters.
sp1 <- formula( totalcrime ~ releases + temperature + rainfall + holiday + sunshine + dayofweek + endofyear + year ) sp1q10 <- artlm(sp1, cluster = clusters10, data = mr2015, select = "releases")
We can get the p-value using summary
summary(sp1q10)
and the confidence intervals using confint
.
confint(sp1q10)
Note that the numbers are exactly the same as in Table 4 of the paper.
In this section, we will just output a table of all specifications to show that the results are the same. These are the specifications.
specifications <- list( # Specification 1: formula( totalcrime ~ releases + temperature + rainfall + holiday + sunshine + dayofweek + endofyear + year ), # Specification 2: formula( totalcrime ~ releases + temperature + rainfall + holiday + sunshine + dayofweek + endofyear + index ), # Specification 3: formula( totalcrime ~ releases + temperature + rainfall + holiday + sunshine + dayofweek + endofyear + I(12L * (year - min(year)) + month) ), # Specification 4: formula( totalcrime ~ releases + temperature + rainfall + holiday + sunshine + dayofweek + endofyear + date + factor(year)*factor(month) ), # Specification 5: formula( totalcrime ~ releases + temperature + rainfall + holiday + sunshine + dayofweek + endofyear + factor(year)*factor(month) ) )
We are going to run this many specifications and cluster numbers.
(sp_num <- length(specifications)) (cl_num <- length(cluster_counts))
First, make a list of linear models for each specification and number of clusters.
# initialize empty list spregs <- list() # create a call with common elements fcall <- call("artlm", data = substitute(mr2015), select = "releases") for (si in 1L:sp_num) { for (ci in 1L:cl_num){ lcall <- fcall lcall$formula <- specifications[[si]] lcall$cluster <- as.name(clnames[ci]) # number of the current iteration i <- cl_num*(si-1)+ci # add evaluated model to the list spregs[[i]] <- eval(lcall) # name the model names(spregs)[i] <- paste("s", si, "c", cluster_counts[ci], sep = "") } }
# get p-values (fourth column of summary) p_values <- vapply(spregs, function(z) {summary(z)$coefficients[,4L]}, 0) # get confidence intervals confidence_intervals <- vapply(spregs, confint, numeric(2L)) knitr::kable(cbind("Pr(>|t|)" = p_values, t(confidence_intervals)))
Note that the numbers are exactly the same as in Table 4 of the paper for q = 8
and q = 10
. However, they differ slightly for q = 16
. This is because the function uses random sampling when the number of clusters exceeds 10.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.