Description Usage Format Examples
Simulated data set 2500 subjects according a mTLV model with marginal mean parameters (intercept, time, Xe (binary exposure), time)=(-1.75, 0.25, 0.25, 0.1), and dependence model parameters (sigma, gamma)=(1,1). The marginal exposure prevalence of Xe was assumed to be 0.35, and all subjects had 10 observations. A two-phase ODS design was implemented using the phase 1 ODS design D1[25,50,25], and a phase 2 ODS design of D2[0,300,0]. The code to generate odsrand is provided in the Example section.
1 |
A data frame with 3830 observations on the following 11 variables.
id
a subject identifier
Y
a binary outcome
time
a time-varying covariate
Xe
a binary, time-invariant covariate (e.g., exposure of interest)
nobs
number of observations per subject; 10 in this example
sumY
subject-specific sum of Y; used to define sampling strata
ss
sampling strata; values 1-3
sp1
sampling probability for stratum 1
sp2
sampling probability for stratum 2
sp3
sampling probability for stratum 3
sample
phase sampling indicator; 1=sampled in phase 1, 2=sampled in phase 2, NA=otherwise
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 | ## Not run:
# Identify ids by sampling strata (ss) and compute basic stratum-specific summaries
idsXss <- function(DAT) {
DAT <- split(DAT, DAT$id)
# Generate ids by each sampling stratum
N <- length(DAT)
DAT_ss <- split(seq(N), factor(sapply(DAT, function(ZZ) ZZ[1,'ss']), 1:3))
# Calculate frequencies in each sampling statrum and frequencies of sum(Y) and mean(Y)
ss_sum <- sapply(DAT_ss, length)
sy_sum <- table(sapply(DAT, function(ZZ) sum(ZZ$Y)))
sy_mean <- table(sapply(DAT, function(ZZ) mean(ZZ$Y)))
out <- DAT_ss
attr(out, 'freq') <- list('ss'=ss_sum, 'sy'=sy_sum, 'sm'=sy_mean)
out
}
# For a given design (e.g., D[n0, n1, n2]), use probability sampling to identify
# subjects to sample. get_sample() returns a list that contains sampling probabilities,
# and lists of sampled ids by ss and remaining ids by ss. When performing 2-phase ODS
# designs, we use the list of remaining ids (sampling without replacement), and thus
# our phase 2 sampling probabilities are conditioned on phase 1 data.
get_sample <- function(DESIGN, IDSXSS, RSEXACT, REPLACE=FALSE) {
# DESIGN = vector of expected sample counts
# IDSXSS = list of ids by sampling stratum (length=3)
# RSEXACT = indicator to force exact random sampling
# REPLACE = indicator if sampling with replacement should be performed
tmp_design <- as.list(DESIGN)
blah_samp <- vector('list',length(tmp_design))
if( RSEXACT ) {
blah <- do.call(rbind, mapply( function(AA,BB) list(cbind(AA,BB)),
AA=as.list( as.numeric(names(IDSXSS)) ),BB=IDSXSS))
blah_tmp <- data.frame( blah[ sort( sample(x=seq(nrow(blah)),round( sum(DESIGN),0) , replace=FALSE)) ,] )
if(ncol(blah_tmp)==1) blah_tmp <- data.frame(t(blah_tmp))
blah_samp <- blah_tmp
blah_samp[,1] <- factor(blah_samp[,1], levels=1:3)
blah_samp <- split(blah_samp$BB, blah_samp$AA)
}
tmp_samp <- mapply( function(AA,BB,CC) {
nn <- length(AA)
if(nn < BB) BB <- nn
sp <- ifelse(nn==0,0, BB/nn)
sampled <- which( rbinom( nn, size=1, prob=sp )== 1 )
if(length(sampled)>0 | !is.null(CC)) {
if(is.null(CC)) {
sampled <- AA[sampled]
} else {
sampled <- CC
}
}
attr(sampled,'sp') <- sp
list(sampled) }, AA=IDSXSS ,BB=tmp_design, CC=blah_samp)
tmp_sp <- sapply( tmp_samp, function(ZZ) attr(ZZ,'sp') )
tmp_remain <- mapply( function(AA,BB) {
out <- AA
if(length(BB)>0) out <- AA[!(AA%in%BB)]
out
}, AA=IDSXSS, BB=tmp_samp)
list('sp'=tmp_sp,'idsXss_samp'=tmp_samp,'idsXss_remain'=tmp_remain)
}
# Generate data
set.seed(1)
N <- 2500
nclust <- rep(10,N)
id <- rep(seq(N), nclust)
Xe <- rep(rbinom(N,size=1,prob=.35), nclust) # binary exposure
time <- unlist( lapply( as.list(nclust), function(ZZ) seq(ZZ)-1 ) )
data <- data.frame(id, time, Xe)
data <- data[order(data$id, data$time),]
# Generate response data
newdata <- GenBinaryY(mean.formula=~time*Xe, lv.formula=~1, t.formula=~1,
beta=c(-1.75, .25, .25, .1), sigma=1, gamma=1, id=id, data=data, Yname = "Y")
# Generate sampling strata and sample indicator
newdata <- do.call(rbind, lapply( split(newdata, newdata$id), function(ZZ) {
ZZ$nobs <- nrow(ZZ)
ZZ$sumY <- sum(ZZ$Y)
ZZ$ss <- ifelse(ZZ$sumY==ZZ$nobs, 3, ifelse(ZZ$sumY==0, 1, 2))
ZZ$sample <- NA
ZZ}))
# Create list of ids by ss.
tmp_ss <- idsXss(DAT=newdata)
(n_ss <- sapply(tmp_ss, length))
# Create sampling probabilities, and identify ODS sample. Suppose it is of
# interset to perform a single phase (or a phase one) ODS design of the form
# D[25,50,25].
# Obtain phase 1 ODS sample
s1d <- apply( cbind( c(25,50,25), attr(tmp_ss,'freq')$ss ), 1, min )
s1info <- get_sample(DESIGN=s1d, IDSXSS=tmp_ss, RSEXACT=FALSE)
s1info$sp
s1id <- sort( unique(unlist( s1info$idsXss_samp )) )
s1idX <- which(newdata$id%in%s1id)
newdata$sample[s1idX] <- 1
# Obtain phase 2 ODS sample
tmp_ss_2 <- s1info$idsXss_remain
tmp_ss_2n <- sapply(tmp_ss_2,length)
prob_sampled <- matrix( s1info$sp,nrow=1 )
colnames(prob_sampled) <- c('sp1','sp2','sp3')
s2d <- apply( cbind( c(0,300,0), sapply(tmp_ss_2,length)), 1, min )
s2info <- get_sample(DESIGN=s2d, IDSXSS=tmp_ss_2, RSEXACT=FALSE)
s2info$sp
s2idX <- which(newdata$id %in% unlist(s2info$idsXss_samp))
newdata$sample[s2idX] <- 2
# Estimate phase 2 sampling probabilities
prob_sampled <- data.frame( rbind(s1info$sp,s2info$sp) )
prob_not_sampled <- apply( 1-prob_sampled, 2, cumprod) # cumulative prob of not being sampled
prob_not_sampled <- rbind(1, prob_not_sampled[-nrow(prob_not_sampled),])
prob_sampled <- prob_sampled * prob_not_sampled
prob_sampled$sample <- c(1,2)
colnames(prob_sampled)[1:3] <- c('sp1','sp2','sp3')
prob_sampled
# Update newdata with sampling probabilities
newdata2 <- merge(newdata, prob_sampled, by='sample',all.x=TRUE)
odsdat <- newdata2[ order(newdata2$id, newdata2$time), ]
odsdat <- odsrand[!is.na(odsdat $sample), ]
## End(Not run)
|
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.