knitr::opts_chunk$set(echo = FALSE, eval=TRUE, message=FALSE, warning=FALSE, fig.width=15, fig.height=15)
path = Sys.getenv('PATH')
path = Sys.setenv('PATH'=paste(path, '/home/rstudio/.local/bin', sep=':'))

AWS machine image setup

I use the directions here to create an AMI to run Rstudio on. The Welcome.R script in the NIsim package has code to setup this machine image with Dropbox access to the files.

Setup simulations

# install the latest versions of the packages to perform these analyses.
devtools::install_github('simonvandekar/pbj', ref='ftest')
#devtools::install_github('simonvandekar/NIsim')
### LIBRARIES ###
library(RNifti)
library(parallel)
library(splines)
library(mmand)
library(fslr)
library(progress)
library(abind)
library(pbj)
library(PDQutils)
library(NIsim)



### LOAD IN DATA FROM DROPBOX ###
dbimagedir = '~/pbj/data/abide/neuroimaging/cpac/alff'
dbresimagedir = '~/pbj/data/abide/neuroimaging/cpac/alff_res'
maskfile = '~/pbj/data/abide/neuroimaging/cpac/n1035_mask.nii.gz'
#dbimagedir = '~/pbj/data/abide/neuroimaging/cpac/alff_cropped/'
#dbresimagedir = '~/pbj/data/abide/neuroimaging/cpac/alff_cropped_res/'
#maskfile = '~/pbj/data/abide/neuroimaging/cpac/cropped_n1035_mask.nii.gz'
dbdatafile = '~/pbj/data/abide/demographic/n1035_phenotypic_20190509.rds'


# load in data and get directories
dat = readRDS(dbdatafile)
dat$imgname = paste(dat$file_id, 'alff.nii.gz', sep='_')
dat$files = file.path(dbimagedir, dat$imgname)




### SIMULATION PARAMETERS ###
fakePolySimConfig = list(
  # vector of sample sizes to simulate
  ns = c(25, 50, 100, 200, 400),
  # number of simulations to run
  nsim=500,
  # number of bootstraps
  nboot = 500,
  # cluster forming thresholds
  cfts.s = c(0.1, 0.25, 0.4),
  cfts.p = c(0.01, 0.001),

  # radius for spheres of signal.
  rs=c(8),

  #### MODEL FORMULAS FOR SIMULATIONS ####
  formres = paste0(" ~ dx_group + sex + ns(func_mean_fd, df=10) + ns(age_at_scan, df=10)" ),
  # need age_at_scan in both models for testing nonlinear functions
  form = paste0(" ~ sex + func_mean_fd + age_at_scan + fake_covariate1 + scale(fake_covariate1^2) + scale(fake_covariate1^3)" ),
  formred = paste0(" ~ sex + func_mean_fd + age_at_scan + fake_covariate1"),
  #  weights for each subject. Can be a character vector
  W = c("func_mean_fd"),
  # where to put residuals
  resdir = dbresimagedir,
  # where to output results
  simdir = '~/temp',
  dat = dat,
  mask = maskfile,
  output = '~/pbj/pbj_ftest/df2_polynomial.rdata',
  ncores = 46,
  method='bootstrap'
)
# use betas = 0 for global null
# parameters = betas * sd(y)/sd(x).
fakePolySimConfig$betas = rep(0, length(fakePolySimConfig$rs))

### OTHER SIMULATION SETUPS ###
# 1 DF is real group
groupSimConfig = fakePolySimConfig
groupSimConfig$form = paste0(" ~ sex + func_mean_fd + age_at_scan + dx_group" )
groupSimConfig$formred = paste0(" ~ sex + func_mean_fd + age_at_scan" )
groupSimConfig$output = '~/pbj/pbj_ftest/df1_dxgroup.rdata'

# FAKE POLY IS 2 DOF
# 1 DF is real group
randomPolySimConfig = fakePolySimConfig
randomPolySimConfig$form = paste0(" ~ sex + func_mean_fd + age_at_scan + fake_covariate2 + scale(fake_covariate2^2) + scale(fake_covariate2^3)" )
randomPolySimConfig$formred = paste0(" ~ sex + func_mean_fd + age_at_scan + fake_covariate2")
randomPolySimConfig$output = '~/pbj/pbj_ftest/df2_polynomial_randomX.rdata'

# 3 DOF
fakeGroupSimConfig = fakePolySimConfig
fakeGroupSimConfig$form = paste0(" ~ sex + func_mean_fd + age_at_scan + fake_group" )
fakeGroupSimConfig$formred = paste0(" ~ sex + func_mean_fd + age_at_scan" )
fakeGroupSimConfig$output = '~/pbj/pbj_ftest/df3_fakegroup.rdata'

# 4 DOF motion
motionSplineSimConfig = fakePolySimConfig
motionSplineSimConfig$form = paste0(" ~ sex + age_at_scan + ns(func_mean_fd, df=5)" )
motionSplineSimConfig$formred = paste0(" ~ sex + age_at_scan + func_mean_fd" )
motionSplineSimConfig$output = '~/pbj/pbj_ftest/df4_motionspline.rdata'

# 5 DOF
ageSplineSimConfig = fakePolySimConfig
ageSplineSimConfig$form = paste0(" ~ sex + func_mean_fd + ns(age_at_scan, df=6)" )
ageSplineSimConfig$formred = paste0(" ~ sex + func_mean_fd + age_at_scan" )
ageSplineSimConfig$output = '~/pbj/pbj_ftest/df5_agespline.rdata'

Simulation functions

# Function that gets observed and bootstrap values from a pbj object.
getBoots = function(pbjObj){
  cftnames = grep('cft', names(pbjObj), value=TRUE)
  out = do.call(cbind, lapply(pbjObj[ cftnames ], function(x) x$boots))
  colnames(out) = cftnames
  ccomps = lapply(pbjObj[ cftnames ], function(x) x$obs)
  return(list(obs=ccomps, boots=out))

}

# Statistic function to get objects for pbjInference
simStats = function(image, mask, thrs){
  c(maximum = max(c(image)), sapply(pbj::cluster(image, mask, thrs), function(z) {suppressWarnings(res <- max(z)); res[is.infinite(res)] = 0; res}))
}

first = function(image, mask, thr){ image[ which(mask==1)[1] ] }

# simfunc should contain a data argument, which is defined within runSim
# Other arguments are identical across simulation runs.
simFunc = function(lmfull, lmred, mask, data, nboot, cfts.s=NULL, cfts.p=NULL){
  # generate fake covariates
  data$fake_group = factor(ceiling(ppoints(nrow(data))*4 ) )
  data$fake_covariate1 = ppoints(nrow(data))
  data$fake_covariate2 = rnorm(nrow(data))

  # t transform, robust, estimate covariance
  #robustStatmap = lmPBJ(data$images, form=lmfull, formred=lmred, mask=mask, data=data, transform = 'none', HC3 = FALSE )
  # methods to try to make stats map more normal
  HC3RobustStatmap = lmPBJ(data$images, form=lmfull, formred=lmred, mask=mask, data=data, transform = 'none', HC3 = TRUE )
  # t transform, classical, estimate covariance
  tStatmap = lmPBJ(data$images, form=lmfull, formred=lmred, mask=mask, data=data, transform = 't', robust=FALSE, HC3=TRUE)

  statmaps = c('tStatmap','HC3RobustStatmap') #'robustStatmap')#, ) 
  out = list()

  # if both are passed, only p-value thresholding is performed
  if(!is.null(cfts.p)){
    thrs = qchisq(cfts.p, df = HC3RobustStatmap$df, lower.tail = FALSE)
  } else if(!is.null(cfts.s)){
    thrs = (cfts.s^2*HC3RobustStatmap$rdf) + tRobustStatmap$df
  } else{
    stop('cfts.p or cfts.s must be specified.')
  }
  # Apply each of the sampling methods
  for(statmapname in statmaps){

    ### BOOTSTRAP METHODS
    statmap = get(statmapname)
    # normal bootstrap
    #pbjNorm = getBoots(pbjSEI(statmap, nboot = nboot, cfts.s = cfts))
    if(statmapname %in% c('HC3RobustStatmap')){
      pbjRadT = pbjInference(statmap, nboot = nboot, rboot = function(n){ (2*rbinom(n, size=1, prob=0.5)-1)}, method='t', statistic=simStats, thr = thrs, mask=statmap$mask)
      pbjNormT = pbjInference(statmap, nboot = nboot, rboot = function(n){ rnorm(n)}, method='t', statistic=simStats, thr = thrs, mask=statmap$mask)
      pbjPermT = pbjInference(statmap, nboot = nboot, method='permutation', statistic=simStats, thr = thrs, mask=statmap$mask)
      pbjNonparametric = pbjInference(statmap, nboot = nboot, method='nonparametric', thr = thrs, mask=statmap$mask, statistic=simStats)
    }

     if(statmapname %in% c('tStatmap')){
      pbjRadT = pbjInference(statmap, nboot = nboot, rboot = function(n){ (2*rbinom(n, size=1, prob=0.5)-1)}, method='t', statistic=simStats, thr = thrs, mask=statmap$mask)
      pbjNormT = pbjInference(statmap, nboot = nboot, rboot = function(n){ rnorm(n)}, method='t', statistic=simStats, thr = thrs, mask=statmap$mask)
      pbjPermT = pbjInference(statmap, nboot = nboot, method='permutation', statistic=simStats, thr = thrs, mask=statmap$mask)
      pbjNonparametric = pbjInference(statmap, nboot = nboot, method='nonparametric', statistic=simStats, thr = thrs, mask=statmap$mask)
    }
    # collect output
    PBJnames = grep('^pbj', ls(), value=TRUE)
    allnames = paste(statmapname, PBJnames, sep='_')
    out[allnames] = lapply(PBJnames, get, pos = environment())
    rm(list=PBJnames)

    ### REPEAT ALL WITH INDEPENDENCE SPATIAL COVARIANCE ASSUMPTION
    # nonrobust methods won't be different, because covariance is same for all statistics.
  }
  return(out)
}



simDistCheck = function(lmfull, lmred, mask, data, cfts, nboot){
  # generate fake covariates
  data$fake_group = factor(ceiling(ppoints(nrow(data))*4 ) )
  data$fake_covariate1 = ppoints(nrow(data))
  data$fake_covariate2 = rnorm(nrow(data))

  # t transform, robust, estimate covariance
  #robustStatmap = lmPBJ(data$images, form=lmfull, formred=lmred, mask=mask, data=data, transform = 'none', HC3 = FALSE )
  # methods to try to make stats map more normal
  HC3RobustStatmap = lmPBJ(data$images, form=lmfull, formred=lmred, mask=mask, data=data, transform = 'none', HC3 = TRUE )
  # t transform, classical, estimate covariance
  tStatmap = lmPBJ(data$images, form=lmfull, formred=lmred, mask=mask, data=data, transform = 't', robust=FALSE)

 out = list()
  statmaps = c('HC3RobustStatmap', 'tStatmap') #'robustStatmap')#, ), 
  for(statmapname in statmaps){

    ### BOOTSTRAP METHODS
    statmap = get(statmapname)
    pbjMax = pbjInference(statmap, nboot=0)
    PBJnames = grep('^pbj', ls(), value=TRUE)
    allnames = paste(statmapname, PBJnames, sep='_')
    out[allnames] = lapply(PBJnames, get, pos = environment())
    rm(list=PBJnames)

    ### REPEAT ALL WITH INDEPENDENCE SPATIAL COVARIANCE ASSUMPTION
    # nonrobust methods won't be different, because covariance is same for all statistics.
  }
  return(out)
}

#debug(pbjInference)
#simConfig = get("fakePolySimConfig")
#simdirs = simSetup(simConfig$dat$files, data=simConfig$dat, outdir=simConfig$simdir, nsim=simConfig$nsim, #ns=simConfig$ns, mask=simConfig$mask, rs=simConfig$rs, betas=simConfig$betas )
#simtime = system.time(test <- simFunc(simConfig$form, simConfig$formred, simConfig$mask, readRDS(file.path(simdirs$simdir[200], 'data.rds')), 2, cfts.p = simConfig$cfts.p))
#stop('not an error.')

Run simulations

sims = grep('SimConfig', ls(), value=TRUE)
for(sim in sims[6]){
  # get simulation configuration for this simulation
  simConfig = get(sim) 
  ### SETUP THE SIMULATION ANALYSIS ###
  # subsets dataset to all people who have the variables
  simConfig$dat = simConfig$dat[apply(!is.na(simConfig$dat[ ,c(all.vars(as.formula(simConfig$formres)), simConfig$W)]), 1, all), ]
  # Create residualized images
  if(class(simConfig$formres)=='formula' | is.character(simConfig$formres)){
    simConfig$dat$rfiles = file.path(simConfig$resdir, basename(simConfig$dat$files))
    if(!all(file.exists(simConfig$dat$rfiles))){
      pbj::residualizeImages(files=simConfig$dat$files, dat=simConfig$dat, mask=simConfig$mask, form=as.formula(simConfig$formres), outfiles=simConfig$dat$rfiles, mc.cores=simConfig$ncores)
    }
    simConfig$dat$files = simConfig$dat$rfiles
    # clean up. May not be necessary
    gc()
  }

  simdirs = simSetup(simConfig$dat$files, data=simConfig$dat, outdir=simConfig$simdir, nsim=simConfig$nsim, ns=simConfig$ns, mask=simConfig$mask, rs=simConfig$rs, betas=simConfig$betas )


  #time = system.time(test <- simFunc(simConfig$form, simConfig$formred, simConfig$mask, readRDS(file.path(simdirs$simdir[10], 'data.rds')), simConfig$nboot, simConfig$cfts.s) )

  # mix this up so that large sample simulations aren't all dropped on one "thread".
  simdirs = simdirs[sample(1:nrow(simdirs)),]


  results = runSim(simdirs$simdir, method=simConfig$method,
                   simfunc = simFunc, mask = simConfig$mask,
                   simfuncArgs = list(
                     lmfull= simConfig$form,
                     lmred = simConfig$formred,
                     mask = simConfig$mask, nboot=simConfig$nboot, cfts.p=simConfig$cfts.p), ncores = simConfig$ncores)

  dir.create(dirname(simConfig$output), showWarnings = FALSE, recursive = TRUE)
  # clean up files
  save.image(file=simConfig$output)
  unlink(list.files(tempdir(), full.names = TRUE))
  gc()
  unlink(simdirs)
}
stop('not an error. Finished simulations.')

Function to plot results

# for each method plot:
# qqplot of maximum value for each sample size
# qqplot of max cluster size for each cft and sample size
# plotting function for below sections
plots = function(rdata, alpha=0.1, stats=NULL){
  load(rdata)
  simdirs$results = results# lapply(results, simplify2array)
  methods = names(simdirs$results[[1]])
  if(is.null(stats)){
    stats = c("Max", paste('cft =', simConfig$cfts.p) )
  }

  # graphical parameters
  cex=1.5
  par(mgp=c(1.7,.7,0), lwd=1.5, lend=2, cex.lab=0.8*cex, cex.axis=0.8*cex, cex.main=1*cex, mfrow=c(1,1), mar=c(2.8,2.8,1.8,.5), bty='l', oma=c(0,0,2,0))
  layout(mat=matrix(1:(length(stats)*length(simConfig$ns)), nrow=length(stats), byrow = TRUE) )
  # axes are based on tail quantiles
  probs = c(0.75, 0.9, 0.95)

  for(method in methods){
  obsStat = do.call(rbind, lapply(simdirs$results, function(y) if(is.null(y)) NA else y[[method]][['obsStat']] ) )
  # These colnames were sample size dependent
  simdirs[, stats] = obsStat

  simdirs$boots = lapply(simdirs$results, function(y) do.call(rbind, lapply(y[[method]][['boots']], function(z0) simplify2array(z0) ) ) )
 #length.out=pmin(simConfig$nsim, simConfig$nboot)

for(cftInd in 1:length(stats)){

  xaxlab = c(0.75, 0.9, 0.95)
  colname = stats[cftInd]
  xlims = range(unlist(by(simdirs, simdirs$n, function(df){ quantile(df[,colname], probs=xaxlab, na.rm=TRUE)} ) ))

  trash = by(simdirs, simdirs$n, function(df){
    ylims = range(sapply(df$boots, function(x) range(quantile(x[,cftInd], probs=probs, na.rm=TRUE) ) ))
      x = df[,colname ]
      xaxt = quantile(x, probs=xaxlab, na.rm = TRUE)
      plot(x, ylim=ylims, xlim=xlims, type='n', xlab='Observed quantile', ylab='Estimated quantile', main=paste('n =', df$n[1],  colname))
      #axis(side=1, at=xaxt, labels=xaxlab)
      abline(v=xaxt, col='orange', lty=2)
      for(ind in 1:simConfig$nsim){
        if(!is.null(df$boots[[ind]])) points(quantile(x, probs=probs, na.rm=TRUE), quantile(df$boots[[ind]][,cftInd], probs=probs, na.rm=TRUE), type='l')
      }
      abline(a=0,b=1, col='blue')
  })
}
      mtext(method, outer=TRUE)

for(cftInd in 1:length(stats)){
  trash = by(simdirs, simdirs$n, function(df){
      ylims = range(sapply(df$boots, function(x) range(quantile(x[,cftInd], probs=probs, na.rm = TRUE))), na.rm=TRUE)
      x = df[,stats[cftInd] ]
      xlims = range(quantile(x, probs=probs, na.rm=TRUE))
      xaxlab = c(0.5, 0.75, 0.9, 0.95)
      xaxt = quantile(x, probs=xaxlab, na.rm=TRUE)
      y=colMeans(do.call(rbind, lapply(1:nrow(df), function(ind) quantile(df$boots[[ind]][,cftInd], probs=xaxlab, na.rm =TRUE)<c(df[ind,stats[cftInd] ]) ) ), na.rm=TRUE )
      plot(1-xaxlab, y, type='b', xlab='Target type 1 error', ylab='Actual type 1 error', xlim=range(c(y, 1-xaxlab)), ylim=range(c(y, 1-xaxlab)), main=paste('n =', df$n[1],  stats[cftInd]) )
      points(1-xaxlab, qbinom(alpha/2, simConfig$nsim, 1-xaxlab)/simConfig$nsim, type='l', lty=2)
      points(1-xaxlab, qbinom(1-alpha/2, simConfig$nsim, 1-xaxlab)/simConfig$nsim, type='l', lty=2)
      abline(a=0,b=1, col='blue')
  })
    }
   mtext(method, outer=TRUE)
  }
}

Asessing the distribution of the maximum across images

qs = c(0.75, 0.9, 0.95,0.99)
out = data.frame(sim=rep(sims, each=length(simConfig$ns) ), n=rep(simConfig$ns, length(sims)))
out[, paste0('q', qs)] = NA
for(sim in sims){
  # get simulation configuration for this simulation
  simConfig = get(sim)
  load(simConfig$output)
  simdirs$results = results
  out[ out$sim== sim, paste0('q', qs)] = do.call(rbind, by( simdirs, simdirs$n,  function(y) quantile(sapply(y$results, function(x) x$HC3RobustStatmap_pbjMax$obsStat), probs = qs) ))
}

Plotting the 3 voxel PDFs

#debug(plots)
rdatas = list.files('~/pbj/pbj_ftest', '*.rdata', full.names = TRUE)
for(rdata in rdatas[3]){
  pdffile = gsub('rdata', 'pdf', rdata)
  pdf(pdffile, width = 12, height=4)
  plots(rdata, stats = c("Max"))
  dev.off()
}

Diagnostic group (Autism dx)

#debug(plots)
#pdf('~/Dropbox (VUMC)/pbj/pbj_ftest/df2_polynomial.pdf')
plots('~/Dropbox (VUMC)/pbj/pbj_ftest/df1_group.rdata')
#dev.off()

Polynomial continuous covariates

Testing the second and third degree terms of a polynomial covariate using fixed X.

#debug(plots)
#pdf('df2_polynomial.pdf')
plots('~/Dropbox (VUMC)/pbj/pbj_ftest/df2_polynomial.rdata')
#dev.off()

Polynomial continuous covariate

Testing the second and third degree terms of a polynomial covariate using random X.

#debug(plots)
#pdf('df2_polynomial.pdf')
plots('~/pbj/pbj_ftest/df2_polynomial_randomX.rdata')
#dev.off()

Fake group

#debug(plots)
#pdf('df3_fakegroup_synthetic.#pdf')
plots('~/Dropbox (VUMC)/pbj/pbj_ftest/df3_fakegroup.rdata')
#dev.off()

Spline age continuous covariate

Testing on 5 dof.

#debug(plots)
#pdf('df5_agespline.#pdf')
plots('~/Dropbox (VUMC)/pbj/pbj_ftest/df5_agespline.rdata')
#dev.off()


simonvandekar/NIsim documentation built on Oct. 12, 2020, 5:06 p.m.