R/sandbox-for-peer-stats-calcs-with-datatable.R In nsmader/chstatsum: Miscellaneous Utilities for Describing Data

```#-------------------------------------------#
### Sandbox for peer statistics calculation #
#-------------------------------------------#

# The goal is to calculate the average characteristics for Mercs by gear, and
# also the average characteristics of all non-Merc cars with the same gear

### Workspace and data prep ----------------------------------------------------
rm(list=ls())
mtcars
library(data.table)
library(plyr)
mtcars
mtcars <- within(mtcars, {
model <- rownames(mtcars)
merc <- 1*grepl("Merc", model)
})

# 0. Make the data set long format
cars_l <- melt(mtcars,
id.vars = c("model", "merc", "gear"))
dt_l <- data.table(cars_l, key = "merc,gear,variable")

### Miscellaneous experimentation with automatic variables created in data.table ----
merc_l <- dt_l[merc == 1]
test_l <- merc_l[, list(dot_i = .I,
dot_grp = .GRP,
dot_N = .N,
test = .I[which.max(value)],
value = value), by = "gear,variable"] # dot_by = .BY, this has the values of the by variables at each point... returns mor ethan on ealue
wgt_l <- merc_l[, list(N_sameGear = sum(!is.na(value[gear==gear[.I]])), # Is returning something weird
N_AllGears = sum(!is.na(value))),  # Worked as expected
by = "variable"]

### Experimenting with code from Andrew Brooks post on excluding certain rows from calculations ----

### Andrew Brooks' straight average
dt <- data.table(mtcars, key = "cyl,gear")
avgMpg_c <- dt[, # gear != gear[.I]
.(mpg_avg = mean(mpg)),
by = "cyl"]
avgMpg_c
dt[cyl == 4, mean(mpg)]

# Andrew Brooks' average minus current category
avgMpgPeer_c <- dt[,
dt[!(gear %in% unique(dt\$gear)[.GRP]), # 2. The "unique(dt\$gear)[.GRP]" pulls the right gear value. This subsets away from the focal gear as desired.
mean(mpg),
by = "cyl"], # 3. The calculation is still by cylinder. It doesn't matter that this is happening in the inner loop, since it's our only condition.
by = "gear"] # 1. Separates the job by gear, which establishes GRP numbering of these values.
avgMpgPeer_c
dt[cyl == 4 & gear != 3, mean(mpg)]

# Adding a second condition to generalize -- calculation by cyl and vs
with(mtcars, table(cyl, vs))
avgMpgPeer_cv <- dt[,
dt[!(gear %in% unique(dt\$gear)[.GRP]),
mean(mpg),
by = "cyl,vs"],
by = "gear"]
avgMpgPeer_cv
dt[cyl == 4 & vs == 1 & gear != 3, mean(mpg)]

# Confirming the ability to take multiple steps within an i or j component
avgMpgPeer_cv2 <- dt[,
dt[{myGrp <- unique(dt\$gear)[.GRP]
!(gear %in% myGrp)},
mean(mpg),
by = "cyl,vs"],
by = "gear"]
identical(avgMpgPeer_cv, avgMpgPeer_cv2) # TRUE

# Note: in our peer stats application, we're only ever excluding based on one
# criterion, i.e. that student's aren't enrolled in a given program

# 1. Calculation of proportions of focal group by reference groups.  -----------
#    This can be done with a cross-row calculation: # focals in group (with var
#    measured)/# focals. So: subset to focals, total group
#    Do this in its own object?

# Proportion is total of folks in focal, divided by number of folks in focal and reference
# Say, focal is gear, and reference is cyl.
# Then, we want, for a given gear, what % are in different cyl categories
dt_cgm <- melt(dt,
id.vars = c("model", "cyl", "gear"),
variable.name = "meas")
setkeyv(dt_cgm, c("cyl", "gear"))
# Randomly add missing values to ensure that our functions don't count those
mean(is.na(dt_cgm\$value))
dt_cgm[runif(nrow(dt_cgm))<0.10, value:=NA]
mean(is.na(dt_cgm\$value))

dt_cgm[, foc_n := sum(!is.na(value)),      by = "gear,meas"]
dt_cgm[, focInRef_n := sum(!is.na(value)), by = "gear,meas,cyl"]
dt_cgm[, wgt_c := focInRef_n / foc_n]
n_gm  <- dt_cgm[, .(n_foc      = sum(!is.na(value))), by = "gear,meas"]
n_cgm <- dt_cgm[, .(n_focInRef = sum(!is.na(value))), by = "gear,meas,cyl"]

### Check proportion calculations
# Note that the proportion calculations are across all observations in the
# reference categories. Thus, they aren't direct weights for each observation,
# but rather weights for calculations across the observations
checkProps <- unique(dt_cgm[gear == 3 & meas == "mpg", .(gear, cyl, wgt_c)])
colSums(checkProps)

### 2. Calculate averages and variances across all non-focal observations ------
# Note: this procedure would be good for doing calculations for an entire slice
# var rather than just each slice value of a given slice var at a time. E.g.,
# instead of calculating the sch-based peer stats for the YMCA's CSI program,
# we could get the sch-based peer stats for CSI, Achievers, etc.
# Caveat for this is that this still needs handling of unique individuals.
# If we get a good speed-up, could we consider having multiple versions of the
# calculation, where in one we exclude all org-served youth in the peers, and one
# we don't.
# /!\ Still need to calculate the n of peer observations
avg <- dt_cgm[,
dt_cgm[!(gear %in% unique(dt_cgm\$gear)[.GRP]),
.(peerMean = mean(value, na.rm = TRUE), peerVar = var(value, na.rm = TRUE)),
by = "meas,cyl"],
by = "gear"]

avg[cyl == 6 & meas == "mpg"]
dt_cgm[cyl == 6 & meas == "mpg" & gear != 3, var(value, na.rm = TRUE)]

### 3. Join weights and average calculations -----------------------------------
peerStats <- merge(avg, n_gm, by = c("gear", "meas")) %>%
merge(n_cgm, by = c("gear", "meas", "cyl")) %>%
within({
wgt_g  <- n_focInRef / n_foc
wgt2_g <- wgt_g^2
})
# Regarding the squared weight, note that the mean is mu = n1/N*v1 + n2/N*v2 + ... + nk/N*vk =
# w1*v1 + w2*v2 + ... + wk*vk, and that var(mu) = w1^2*var(v1) + w2^2*var(v2) + ... + wk^2*var(vk)
# Thus, the squared weights are effectively the simple weights that we need for the variance
# calculations.
peerStats[meas == "mpg"]

### 4. Weight, calc, and done --------------------------------------------------
peerCalc <- peerStats[,
.(mean = weighted.mean(x = peerMean, w = wgt_g),
var  = weighted.mean(x = peerVar,  w = wgt2_g)),
by = "gear,meas"]
peerCalc[meas == "mpg"]
peerStats[meas == "mpg" & gear == 3]

# Check values
check <- dt_cgm[meas == "mpg" & gear != 3, .(meas, gear, cyl, value)]
checkMean <- aggregate(value ~ cyl, check, mean, na.rm = TRUE)
prop <- as.data.frame(prop.table(table(subset(dt_cgm, gear == 3 & !is.na(value) & meas == "mpg", cyl))))
prop\$cyl <- as.numeric(as.character(prop\$Var1))
mean_prop <- merge(checkMean, prop[, c("cyl", "Freq")], by = "cyl")
paste("The mean mpg for gear 3's peers in the same cyl =",
with(mean_prop, weighted.mean(x = value, w = Freq)))
```
nsmader/chstatsum documentation built on May 23, 2017, 5:19 p.m.