odsdat: Simulated ODS data set

Description Usage Format Examples

Description

Simulated data set 2500 subjects according a mTLV model with marginal mean parameters (intercept, time, Xe (binary exposure), time)=(-1.75, 0.25, 0.25, 0.1), and dependence model parameters (sigma, gamma)=(1,1). The marginal exposure prevalence of Xe was assumed to be 0.35, and all subjects had 10 observations. A two-phase ODS design was implemented using the phase 1 ODS design D1[25,50,25], and a phase 2 ODS design of D2[0,300,0]. The code to generate odsrand is provided in the Example section.

Usage

1

Format

A data frame with 3830 observations on the following 11 variables.

id

a subject identifier

Y

a binary outcome

time

a time-varying covariate

Xe

a binary, time-invariant covariate (e.g., exposure of interest)

nobs

number of observations per subject; 10 in this example

sumY

subject-specific sum of Y; used to define sampling strata

ss

sampling strata; values 1-3

sp1

sampling probability for stratum 1

sp2

sampling probability for stratum 2

sp3

sampling probability for stratum 3

sample

phase sampling indicator; 1=sampled in phase 1, 2=sampled in phase 2, NA=otherwise

Examples

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
## Not run: 
# Identify ids by sampling strata (ss) and compute basic stratum-specific summaries 
idsXss <- function(DAT) {
  DAT <- split(DAT, DAT$id)
  
  # Generate ids by each sampling stratum
  N      <- length(DAT)
  DAT_ss <- split(seq(N), factor(sapply(DAT, function(ZZ) ZZ[1,'ss']), 1:3))
  
  # Calculate frequencies in each sampling statrum and frequencies of sum(Y) and mean(Y)
  ss_sum  <- sapply(DAT_ss, length)
  sy_sum  <- table(sapply(DAT, function(ZZ) sum(ZZ$Y)))
  sy_mean <- table(sapply(DAT, function(ZZ) mean(ZZ$Y)))
  
  out <- DAT_ss
  attr(out, 'freq') <- list('ss'=ss_sum, 'sy'=sy_sum, 'sm'=sy_mean)
  out
}

# For a given design (e.g., D[n0, n1, n2]), use probability sampling to identify
# subjects to sample. get_sample() returns a list that contains sampling probabilities,
# and lists of sampled ids by ss and remaining ids by ss. When performing 2-phase ODS
# designs, we use the list of remaining ids (sampling without replacement), and thus
# our phase 2 sampling probabilities are conditioned on phase 1 data.

get_sample <- function(DESIGN, IDSXSS, RSEXACT, REPLACE=FALSE) {
  # DESIGN  = vector of expected sample counts
  # IDSXSS  = list of ids by sampling stratum (length=3)
  # RSEXACT = indicator to force exact random sampling
  # REPLACE = indicator if sampling with replacement should be performed

  tmp_design  <- as.list(DESIGN)
  blah_samp   <- vector('list',length(tmp_design))
  
  if( RSEXACT ) {
    blah <- do.call(rbind, mapply( function(AA,BB) list(cbind(AA,BB)), 
                                   AA=as.list( as.numeric(names(IDSXSS)) ),BB=IDSXSS))
    blah_tmp <-  data.frame( blah[ sort( sample(x=seq(nrow(blah)),round( sum(DESIGN),0) , replace=FALSE)) ,] )
    if(ncol(blah_tmp)==1) blah_tmp <- data.frame(t(blah_tmp))
    blah_samp     <- blah_tmp
    blah_samp[,1] <- factor(blah_samp[,1], levels=1:3)
    blah_samp     <- split(blah_samp$BB, blah_samp$AA)
  }
  
  tmp_samp    <- mapply( function(AA,BB,CC) {
    nn <- length(AA)
    if(nn < BB) BB <- nn
    sp <- ifelse(nn==0,0, BB/nn)
    sampled <- which( rbinom( nn, size=1, prob=sp )== 1 )
    if(length(sampled)>0 | !is.null(CC)) {
      if(is.null(CC)) {
        sampled <- AA[sampled]
      } else {
        sampled <- CC
      }
    }
    attr(sampled,'sp') <- sp
    list(sampled) }, AA=IDSXSS ,BB=tmp_design, CC=blah_samp)
  
  tmp_sp <- sapply( tmp_samp, function(ZZ) attr(ZZ,'sp') )
  
  tmp_remain <- mapply( function(AA,BB) {
    out <- AA
    if(length(BB)>0)  out <- AA[!(AA%in%BB)]
    out
  }, AA=IDSXSS, BB=tmp_samp)
  
  list('sp'=tmp_sp,'idsXss_samp'=tmp_samp,'idsXss_remain'=tmp_remain)
}

# Generate data
set.seed(1)
N      <- 2500
nclust <- rep(10,N)
id     <- rep(seq(N), nclust)
Xe     <- rep(rbinom(N,size=1,prob=.35), nclust) # binary exposure
time   <- unlist( lapply( as.list(nclust), function(ZZ) seq(ZZ)-1 ) )
data   <- data.frame(id, time, Xe)
data   <- data[order(data$id, data$time),]

# Generate response data
newdata <- GenBinaryY(mean.formula=~time*Xe, lv.formula=~1, t.formula=~1, 
                     beta=c(-1.75, .25, .25, .1), sigma=1, gamma=1, id=id, data=data, Yname = "Y") 

# Generate sampling strata and sample indicator
newdata <- do.call(rbind, lapply( split(newdata, newdata$id), function(ZZ) { 
  ZZ$nobs   <- nrow(ZZ)
  ZZ$sumY   <- sum(ZZ$Y)
  ZZ$ss     <- ifelse(ZZ$sumY==ZZ$nobs, 3, ifelse(ZZ$sumY==0, 1, 2))
  ZZ$sample <- NA
  ZZ}))

# Create list of ids by ss. 
tmp_ss <- idsXss(DAT=newdata) 
(n_ss  <- sapply(tmp_ss, length))

# Create sampling probabilities, and identify ODS sample. Suppose it is of 
# interset to perform a single phase (or a phase one) ODS design of the form
# D[25,50,25].

# Obtain phase 1 ODS sample
s1d      <- apply( cbind( c(25,50,25), attr(tmp_ss,'freq')$ss ), 1, min )
s1info   <- get_sample(DESIGN=s1d, IDSXSS=tmp_ss, RSEXACT=FALSE)
s1info$sp

s1id     <- sort( unique(unlist( s1info$idsXss_samp )) )
s1idX    <- which(newdata$id%in%s1id)
newdata$sample[s1idX] <- 1

# Obtain phase 2 ODS sample
tmp_ss_2      <- s1info$idsXss_remain
tmp_ss_2n     <- sapply(tmp_ss_2,length)
prob_sampled  <- matrix( s1info$sp,nrow=1 )
colnames(prob_sampled) <- c('sp1','sp2','sp3')

s2d      <- apply( cbind( c(0,300,0), sapply(tmp_ss_2,length)), 1, min )
s2info   <- get_sample(DESIGN=s2d, IDSXSS=tmp_ss_2, RSEXACT=FALSE) 
s2info$sp
s2idX    <- which(newdata$id %in% unlist(s2info$idsXss_samp))
newdata$sample[s2idX] <- 2 

# Estimate phase 2 sampling probabilities
prob_sampled        <- data.frame( rbind(s1info$sp,s2info$sp) )
prob_not_sampled    <- apply( 1-prob_sampled, 2, cumprod)                  # cumulative prob of not being sampled
prob_not_sampled    <- rbind(1, prob_not_sampled[-nrow(prob_not_sampled),])   
prob_sampled        <- prob_sampled * prob_not_sampled 
prob_sampled$sample <- c(1,2)
colnames(prob_sampled)[1:3] <- c('sp1','sp2','sp3')
prob_sampled

# Update newdata with sampling probabilities
newdata2 <- merge(newdata, prob_sampled, by='sample',all.x=TRUE)
odsdat  <- newdata2[ order(newdata2$id, newdata2$time), ]
odsdat  <- odsrand[!is.na(odsdat $sample), ]

## End(Not run)

mercaldo/MMLB documentation built on May 22, 2019, 6:51 p.m.