pmsimstats: Personalized medicine simulated statistics

Documented in lme_analysis

#' LME Analysis
#'
#' \code{lme_analysis} analyzes the data from a single simulated trial
#'
#' Analyzes the simulated data (including all paths) from a single simulated
#' trial. Most often called by \code{generateSimualtedResults}. See vignettes
#' for additional information.
#'
#' @param trialdesign_set The set of all pathways for the clinical trial
#' @param dat A dat file as generated by \link{generateData}
#' @param op an options file with the following components:
#'   \itemize{
#'     \item{\code{op$useDE=TRUE}}{  Binary: should the LME model use expectnacy
#'       information if available?}
#'     \item{\code{op$t_random_slope=FALSE}}{  Binary: should the LME model use
#'       random slopes, as well as random intercepts, for participants?}
#'     \item{\code{op$full_model_out=FALSE}}{  Binary: if false, will output model
#'       parameters as a row of numeric values for simulation processing (most common);
#'       if true, will output an array that allows you to examine the results of a single
#'       analysis in more detail}
#'     \item{\code{op$simplecarryover=FALSE}}{  Binary: if true will add "time since
#'       drug" as a free parameter in the model}
#'     \item{\code{op$carryover_t1half=0}}{  If non-zero, will build in an exponential
#'       falloff in the "on drug or not" term in the analysis (Db), using this as the
#'       half-life.}
#'   }
#' @return Generally returns a one row data table with components:
#'   \itemize{
#'     \item{\code{out$beta}}{  The coefficient corresponding to an interaction
#'       between biomarker and outcome}
#'     \item{\code{out$betaSE}}{  Standard error of the above coefficient}
#'     \item{\code{out$p}}{  p-value conveying the statistical significance of the coefficient}
#'     \item{\code{out$issingular}}{  Was the model fit flagged as singular?}
#'     \item{\code{out$warning}}{  Any warnings from \code{fitMsgs}}
#'   }
#'   However, if you have full_model_out set to TRUE, you will instead get an array with components:
#'   \itemize{
#'   \item{\code{form}}{  The model fit by lmer}
#'   \item{\code{fit}}{  The output of lmer}
#'   \item{\code{datamerged}}{  The data in the form it was provided to lmer}
#'   \item{\code{stdout}}{  The standard one row data table that's usually returned when
#'      full_mode_out is set to FALSE, with beta, betaSE, etc}
#'   }
#'
#' @examples
#' # See vignettes 1 & 2 for examples of how to call from the function generateSimualtedResults.R
#' # See vignette 3 for an example of how to apply lme_analysis to actual clincal trial results
#' @export

lme_analysis<-function(trialdesign_set,dat,op){

  if(missing(op)){
    op$useDE<-TRUE
    op$t_random_slope<-FALSE
    op$full_model_out<-FALSE
    op$carryover_t1half <- 0
    op$simplecarryover <- FALSE
    op$carryover_scalefactor=1
  }else if(!("simplecarryover"%in%names(op))){
    op$simplecarryover=FALSE
  }
  if(!("carryover_t1half"%in%names(op))){
    op$carryover_t1half=0
    op$carryover_scalefactor=1
  }
  if((op$carryover_t1half>0)&(op$simplecarryover==TRUE)){
    error("Don't know what it means to have two different carryover models, so disallowing simplecarryover and a half-life based carryover at the same time for now.")
  }

  n_groups<-length(trialdesign_set)
  datout<-list(rep(NA),n_groups)
  lastptID<-0
  for(g in 1:n_groups){
    trialdesign<-trialdesign_set[[g]]
    datSingle<-dat[path==g]

    # I. Turn data into this format

    # Format trialdesign:
    # Add a BL row
    trialdesign<-rbind(list(timeptnames="BL",t_wk=0,e=0,tod=0,tsd=0,tpb=0),trialdesign)
    # For minimization of coding erors, t_wk is length of the block, not time since
    # start, so create a "t" that is just true time with bl as 0:
    trialdesign[,t:=as.integer(NA)]
    trialdesign$t<-cumulative(trialdesign$t_wk)

    # Format dat:
    # Make pt data dt that just has the numbers we actually want
    timeptnames=union(trialdesign$timeptnames,"BL") # makes sure don't end up with it twice
    evalstring<-paste("data<-datSingle[,.(ptID,bm,",
                      paste(timeptnames,sep="",collapse=","),")]",sep="")
    eval(parse(text=evalstring))

    # Make sure the ptID is unique even if we're merging across groups:
    data$ptID<-data$ptID+lastptID
    lastptID<-max(data$ptID)

    # Turn it long-form, then merge in the trialdesign data
    data.m1<-melt(data,id.vars=c("ptID","bm"),measure.vars=trialdesign$timeptnames,
                  variable.name="timeptnames",value.name="Sx",na.rm=FALSE)
    data.m2<-merge(data.m1,
                   trialdesign[,.(timeptnames,t,De=e,Db=(tod>0),tsd)],
                   by="timeptnames",all=TRUE)

    # Add a scaler version of Db - Dbc, for continuous - to allow for
    # more complex carryover modeling
    data.m2[,Dbc:=as.numeric(NA)]
    data.m2[Db==TRUE,Dbc:=1]
    data.m2[Db==FALSE,Dbc:=((1/2)^(op$carryover_scalefactor*tsd/op$carryover_t1half))]

    datout[[g]]<-data.m2
  }

  datamerged<-datout[[1]]
  if(n_groups>1){for(g in 2:n_groups){datamerged<-rbind(datamerged,datout[[g]])}}

  # Tests we'll use to sort out what model to use:
  # 1) Test whether can possibly include the expectancy-related factor:
  varInExp<-length(unique(trialdesign$e[2:length(trialdesign$e)]))
  # 2) Test whether can use Db vs need to use t for the interaction term:
  datamerged[t>0,meanDb:=mean(Db),by=ptID]
  datamerged[t>0,DbVar:=((meanDb!=0)&(meanDb!=1)),by=ptID]
  varInDb<-(sum(datamerged[t>0]$DbVar==TRUE)>0)
  if(!varInDb){
    # Only analyze folks who are ever on drug, in this case:
    iEverOnDrug<-unique(datamerged[meanDb==1]$ptID)
    datamerged<-datamerged[ptID%in%iEverOnDrug]
  }
  # 3) Test whether we can use the carryover term, ie, tsd is sometimes non-zero
  varintsd=(nrow(datamerged[tsd!=0])>0)

  # Use these to pick a model
  modelbase="Sx~bm+t"
  if(varInDb){
    modelbase=paste0(modelbase,"+Dbc+bm*Dbc")
  }else{
    modelbase=paste0(modelbase,"+bm*t")
  }
  if((varInExp>1)&(op$useDE==TRUE)){
    modelbase=paste0(modelbase,"+De")
  }
  if(op$simplecarryover&varintsd){
    modelbase=paste0(modelbase,"+tsd")
  }
  if(op$t_random_slope){
    modelbase=paste0(modelbase,"+(1+t|ptID)")
  }else{
    modelbase=paste0(modelbase,"+(1|ptID)")
  }
  # now knit together and evaluate - poor programming form!
  eval(parse(text=paste0("form<-",modelbase)))
  # if(varInDb){
  #   if(op$t_random_slope){
  #     if((varInExp>1)&(op$useDE==TRUE)){
  #       form<-Sx~bm+De+Db+t+bm*Db+(1+t|ptID)
  #     } else {
  #       form<-Sx~bm+Db+t+bm*Db+(1+t|ptID)
  #     }
  #   }else{
  #     if((varInExp>1)&(op$useDE==TRUE)){
  #       form<-Sx~bm+De+Db+t+bm*Db+(1|ptID)
  #     } else {
  #       form<-Sx~bm+Db+t+bm*Db+(1|ptID)
  #     }
  #   }
  # }else{
  #   if(op$t_random_slope){
  #     if((varInExp>1)&(op$useDE==TRUE)){
  #       form<-Sx~bm+De+t+bm*t+(1+t|ptID)
  #     } else {
  #       form<-Sx~bm+t+bm*t+(1+t|ptID)
  #     }
  #   }else{
  #     if((varInExp>1)&(op$useDE==TRUE)){
  #       form<-Sx~bm+De+t+bm*t+(1|ptID)
  #     } else {
  #       form<-Sx~bm+t+bm*t+(1|ptID)
  #     }
  #   }
  # }

  # Run (should insert tryCatch here, but not working!)
  fit<-lmer(form,data=datamerged)
  holdWarning<-summary(fit)$fitMsgs
  if(length(holdWarning)==0) holdWarning<-as.character(NA)

  # set issingular flag off, turn on if get singular warning
  issingular<-FALSE
  if(length(summary(fit)$optinfo$conv$lme4$messages)>0){
    if(summary(fit)$optinfo$conv$lme4$messages[[1]]=="boundary (singular) fit: see ?isSingular"){
      issingular<-TRUE
    }
  }

  # Package output
  c<-summary(fit)$coefficients
  if(varInDb){
    p<-c['bm:DbcTRUE','Pr(>|t|)']
    beta<-c['bm:DbcTRUE','Estimate']
    betaSE<-c['bm:DbcTRUE','Std. Error']
  }else{
    p<-c['bm:t','Pr(>|t|)']
    beta<-c['bm:t','Estimate']
    betaSE<-c['bm:t','Std. Error']
  }

  # pvalue plan from http://mindingthebrain.blogspot.in/2014/02/three-ways-to-get-parameter-specific-p.html
  # this is the medium-conservative option
  out<-data.table(beta=beta,betaSE=betaSE,p=p,issingular=issingular,warning=holdWarning)

  # If full model output reqeusted, will repackage a bit:
  if(op$full_model_out){
    out<-list(form=form,fit=fit,datamerged=datamerged,stdout=out)
  }

  return(out)
}
# Short helper functions

#' Cumulative
#'
#' \code{cumulative} turns a vector of intervals into a cumulative running time
#'
#' @export
cumulative<-function(x){
  I<-length(x)
  y<-rep(NA,I)
  y[1]<-x[1]
  if(I>1){
    for(i in 2:I){
      y[i]<-x[i]+y[i-1]
    }
  }
  return(y)
}

#' Modified Gompertz
#'
#' \code{modgompertz} takes response parameters and returns change in symptoms as a function of time
#'
#' This is a slight modification of a standard gompertz function that is designed to model
#' how symptoms can change as a function of time.
#'
#' @param t time - no specific units, just make sure it's consistant across your modeling
#' @param maxr maximum response that can be attained (ceiling)
#' @param disp vertical displacement
#' @param rate rate
#' @return A numeric value indicating how much "improvement" there has been since baseline
#'   as of point \code{t} in time. A negative value indicates worsening rather than improvement.
#' @export
modgompertz<-function(t,maxr,disp,rate){
  y<-maxr*exp(-disp*exp(-rate*t))
  vert_offset<-maxr*exp(-disp*exp(-rate*0))
  y<-y-vert_offset
  y<-y*(maxr/(maxr-vert_offset)) # return the assymptotic max to what it was
  return(y)
}

#' Reknit simulated results
#'
#' \code{reknitsimresults} recombines the results of simulations that have the
#' exact same parameter space set, but were run as separate chunks - e.g., you
#' ran 100 repititions, realized you needed more, and ran 150 more to have 250
#' total. Does not work with rawdataout.
#'
#' @param basesavename The basesavename as described in \link{generateSimulatedResults}
#' @param savedir The directory the files are saved in
#' @return The output is a file just like you would have received from
#'   \link{generateSimulatedResults} if it had all been run at once.
#' @export
reknitsimresults<-function(basesavename,savedir){
  if(missing(savedir)){
    initialdirectory<-getwd()
  }else{
    initialdirectory<-getwd()
    setwd(savedir)
  }
  sr<-vector(mode="list",length=length(basesavename))
  for(iSN in 1:length(basesavename)){
    simresults<-readRDS(paste(basesavename[1],"1",sep="_save"))
    nextfilenum<-2
    while(nextfilenum>0){
      nextfile<-paste(basesavename[1],nextfilenum,sep="_save")
      if(file.exists(nextfile)){
        newresults<-readRDS(nextfile)
        simresults$results<-rbind(simresults$results,newresults$results)
        if(length(names(simresults))>2){
          warning("Haven't implemented adding rawdata together yet")
        }
        nextfilenum<-nextfilenum+1
      }else{
        nextfilenum<-0
      }
    }
    sr[[iSN]]<-simresults
  }
  if(length(sr)>1){
    # test the parameter settings are the same
    for(iSN in 2:length(sr)){
      if(!identical(sr[[1]]$parameterselections$trialdesigns,
          sr[[iSN]]$parameterselections$trialdesigns)){
        warning("Trial design settings don't match")
      }
      if(!identical(sr[[1]]$parameterselections$respparamsets,
                    sr[[iSN]]$parameterselections$respparamsets)){
        warning("Response paramset settings don't match")
      }
      if(!identical(sr[[1]]$parameterselections$blparamsets,
                    sr[[iSN]]$parameterselections$blparamsets)){
        warning("Baseline paramset settings don't match")
      }
    }
    # Merge (inefficiently, if had saved differently could use rbindlist...)
    simresults<-sr[[1]]
    for(iSN in 2:length(sr)){
      simresults$results<-rbind(simresults$results,sr[[iSN]]$results)
    }
  }else{
    simresults<-sr[[1]]
  }
  return(simresults)
}
# Scratch Script space for doing the development work on the package.

#devtools::install_github("r-lib/devtools")
#install_github("rchendrickson/pmsimstats")

# # Get tools on board
# library("devtools")
# library("roxygen2")
# library("testthat")
# library("knitr")
#
# # Stuff we have to figure out how to handle in terms of namespace...
# library(data.table)
# library(lme4)
# library(lmerTest)
# library(corpcor)
# library(ggplot2)
# library(MASS)
# library(svMisc)
# library(tictoc)
# library(ggpubr)
# library(gridExtra)

# Check, because SO many problems with installation...
#library(devtools)
#has_devel()

# Build an R.buildignore file:
#use_build_ignore("scratch_WorkingScript")

# Start the vingette:
#use_vignette("Produce_Publication_Results")
#use_vignette("Produce_Publication_Results_2_Look_at_data")
#use_vignette("Produce_Publication_Results_3_apply_to_actual_clinical_trial_data")

#
# # Create the data files for the plotting results side:
# mdir<-"C:\\Users\\afane\\Documents\\GitHub\\pmsimstats"
#
# basesavenames_basicpowersets<-c("2020_02_17_core1",
#                                 "2020_02_17_core2",
#                                 "2020_02_17_core3",
#                                 "2020_02_17_core4",
#                                 "2020_02_17_core5",
#                                 "2020_02_18_core1",
#                                 "2020_02_18_core2",
#                                 "2020_02_18_core3",
#                                 "2020_02_18_core4",
#                                 "2020_02_18_core5")
# basesavenames_respparamsetsRates<-c("2020_02_19_respRates1",
#                                     "2020_02_19_respRates2",
#                                     "2020_02_19_respRates3",
#                                     "2020_02_19_respRates4",
#                                     "2020_02_19_respRates5",
#                                     "2020_02_19_respRates6",
#                                     "2020_02_19_respRates7",
#                                     "2020_02_19_respRates8",
#                                     "2020_02_19_respRates9",
#                                     "2020_02_19_respRates10")
# basesavenames_respparamsetsMaxes<-c("2020_02_18_respMaxes1",
#                                     "2020_02_18_respMaxes2",
#                                     "2020_02_18_respMaxes3",
#                                     "2020_02_18_respMaxes4",
#                                     "2020_02_18_respMaxes5")

# Our core parameters
# setwd(wdir)
# simresults<-reknitsimresults(basesavenames_basicpowersets)
# setwd(mdir)
# save(results_core, file="data/results_core.rda")
# setwd(wdir)
# simresults<-reknitsimresults(basesavenames_respparamsetsRates)
# setwd(mdir)
# save(results_rates, file="data/results_rates.rda")
# setwd(wdir)
# simresults<-reknitsimresults(basesavenames_respparamsetsMaxes)
# setwd(mdir)
# save(results_maxes, file="data/results_maxes.rda")
# setwd(wdir)
# simresults<-readRDS("LME_compbaseparam_2020_02_08v2_500rep.rds") # 1.5Gigs!
# setwd(mdir)
# # TOOOOO big!!! Cut down to what we acutally need, minimally...
# simresults$rawdata<-simresults$rawdata$precensor # cuts down to 950 MB
# dim(simresults$results)
# length(simresults$rawdata)
# sr<-simresults$results[censorparamset==0]
# dim(sr) # now indexed the same
# ikeep<-sr[respparamset==1][blparamset==1][N==75][c.bm==.6][c.cf1t==.2]$irep
#
#setwd(mdir)
#save(results_trajectories, file="data/results_trajectories.rda")

# See if can apply our lme_analysis to the clinical trial results??? What happens if include
# the placebo group vs not...??

# rddir<-"C:\\Users\\afane\\Dropbox\\Misc\\DataSave\\Data\\CAPSitems"
# hddir<-getwd()
# setwd(rddir)
# dt.s<-as.data.table(readRDS("FreshPullY13AllTimepoints_scoresWithCovar.rds"))
# # s for scores
# dt.d<-readRDS("FreshPullY13AllTimepoints_diffs.rds")
# # d for differences - gives deltaCAPS for each time interval ("timeptDiff")
# # timepts are: weeks 0, 7, 11, 15
# dto<-readRDS("FreshPullY13AllTimepoints_SummaryCalcs.rds")
# # this version goes item by item and gives the mean rating at baseline and at end of trial for eachgroup
# # Data (produced by createDataSetForY13data_06.R)
#
# # Get BP and total CAPS into useable format
# dt.s[,CAPSToti:=sum(B1,B2,B3,B4,B5,C1,C2,C3,C4,C5,C6,C7,D1,D2,D3,D4,D5),by=c("SubjID","timept")]
# dt.s[,BP:=as.numeric(strsplit(ScrBpStan,"/")[[1]][1]),by=c("SubjID","timept")]
#
# # Make match dat file: names are...
# # bm, BL, ptID, (timept names), path, replicate
# rdat<-dt.s[,.(bm=BP,ptID=SubjID,StudyDrug,replicate=1,timept,path=1,CAPSToti)]
# rdat[StudyDrug=="placebo"]$path<-2
# rdatw<-dcast(rdat,bm+ptID+StudyDrug+replicate+path~timept,value.var="CAPSToti")
# setnames(rdatw,c("1","2","3","4"),c("BL","T1","T2","T3"))
# dattest<-rdatw[,.(bm,BL,ptID=1:.N,T1,T2,T3,path)]
#
# The bm of 76 turns out to be an error where the SBP is missing:
# dattest<-dattest[bm>80]
#
# datasavedir<-"C://Users//afane//Documents//GitHub//pmsimstats//data//"
# setwd(datasavedir)
# CTdata<-dattest
# save(CTdata,file="CTdata.rda")
#
# simresults<-generateSimulatedResults(
#   trialdesigns=list(trialdesigns$OL,trialdesigns$PGRCT),
#   respparamsets=origrespparamsets[1],
#   blparamsets=blparamsets[1],
#   censorparams=censorparams,
#   modelparams=coremodelparams[1,],
#   simparam=list(Nreps=1,
#                 progressiveSave=FALSE,
#                 basesavename="TEST",
#                 nRep2save=5,
#                 saveunit2start=1,
#                 savedir=getwd()),
#   analysisparams=analysisparams,
#   rawdataout=TRUE)
#
#' Plot factor trajectories
#'
#' \code{PlotModelingResults} plots basic output relating to the peformance of each
#' trial design across a parameter space
#'
#' @param data the \code{$results} component of the output of \link{generateSimulatedResults}
#' @param param2plot What value to plot as the heatmap out put. Can be power,
#'   mean_frac_NA, mean_beta, mean_betaSE, sd_beta
#' @param param2vary 4 params to vary and plot across the created space
#' @param param2hold Params to keep constant
#' @param param2nothold Things to let vary freely (if e.g. they are linked to another
#'   intentionally varied parameter)
#' @param labelcode A mapping between interal variable names and figure labels
#'
#' @return a ggplot object containing the plot
#' @examples
#' data(results_core)
#'   param2vary<-c("trialdesign","N","c.bm","censorparamset")
#'   param2hold<-data.table(blparamset=1,respparamset=1,carryover_t1half=0)
#'   param2nothold<-c("c.cfct","modelparamset")
#'   param2plot<-"power" # Options: power, mean_frac_NA, mean_beta, mean_betaSE, sd_beta...
#'   p1<-PlotModelingResults(simresults,param2plot,param2vary,param2hold,param2nothold)
#'  # See vignettes for additional details
#' @export

PlotModelingResults<-function(data,param2plot,param2vary,param2hold,param2nothold,labelcode){

  # This just for labeling the plot
  if(missing(labelcode)){
    labelcode<-data.table(handle=c("N","co","carryover","call","ctv","cpb","cbr",
                                   "mtv","mpb","mbr","stv","spb","sbr",
                                   "rtv","rpb","rbr","cbm","sbl","sbm",
                                   "trialdesignopt","beta0","beta1","eb1",
                                   "c.cf1t","carryover_t1half",
                                   "tv_rate","pb_rate","br_rate",
                                   "tv_max","pb_max","br_max",
                                   "c.bm","t_random_slope","use_DE"),
                          label=c("N subjects","Carryover","Carryover","Autocorrelations",
                                  "tv autocorrelation","pb autocorrelation","br autocorrelation",
                                  "tv mean","pb mean","br mean","tv SD","pb SD","br SD",
                                  "tv rate","pb rate","br rate","Correlation with biomarker",
                                  "SD of baseline CAPS","SD of SBP","Trial Design",
                                  "Baseline dropout (beta0)","Beta1","eb1",
                                  "cross-factor within timepoint autocorrelation","Carryover (t_1/2)",
                                  "TR rate","ER rate","BR rate",
                                  "TR max"," ER max","BR max",
                                  "Biomarker effect","Random Slope","Include expectancy in model"
                          ))
  }

  # Narrow the data down to what we will actually use
  d<-data$results

  # If we're breaking up respparams, do so now so can use same plotting machinery below
  if(length(intersect(param2vary,c("tv_rate","pb_rate","br_rate")))>1){
    # Build map from respparamste to rates
    cw<-data.table(respparamset=integer(),tv_rate=numeric(),pb_rate=numeric(),br_rate=numeric())
    for(irp in 1:length(data$parameterselections$respparamsets)){
      rps<-data$parameterselections$respparamsets[[irp]]$param
      cwn<-data.table(respparamset=irp,
                      tv_rate=rps[cat=="tv"]$rate,
                      pb_rate=rps[cat=="pb"]$rate,
                      br_rate=rps[cat=="br"]$rate)
      cw<-rbind(cw,cwn)
    }
    d<-merge(d,cw,by="respparamset",all.x=TRUE,all.y=FALSE)
  }
  if(length(intersect(param2vary,c("tv_max","pb_max","br_max")))>1){
    # Build map from respparamste to rates
    cw<-data.table(respparamset=integer(),tv_max=numeric(),pb_max=numeric(),br_max=numeric())
    for(irp in 1:length(data$parameterselections$respparamsets)){
      rps<-data$parameterselections$respparamsets[[irp]]$param
      cwn<-data.table(respparamset=irp,
                      tv_max=rps[cat=="tv"]$max,
                      pb_max=rps[cat=="pb"]$max,
                      br_max=rps[cat=="br"]$max)
      cw<-rbind(cw,cwn)
    }
    d<-merge(d,cw,by="respparamset",all.x=TRUE,all.y=FALSE)
  }

  # What didn't have a section made, but isn't part of what's being intentionally varied?
  otherholds<-setdiff(names(d),
                      c(param2hold,param2vary,param2nothold,"modelparamset",
                        "frac_NA","beta","betaSE","p","tID","irep","ETbeta","ETbetaSE",
                        "issingular","warning"))
  for(iP in 1:length(param2hold)){
    n<-names(param2hold)[iP]
    d<-d[get(n)==param2hold[[n]]]
  }
  for(iP in 1:length(otherholds)){
    param2pick<-d[[otherholds[iP]]][1] # TAKING THE FIRST ONE
    d<-d[get(otherholds[iP])==param2pick]
  }

  # Format the parameters to vary and their labels. Hacky - could have designed more flexibly...
  op<-param2vary
  oplabels<-vector(mode="list",length=(length(op)))
  for(iop in 1:length(op)){
    if(op[iop]=="trialdesign"){
      axlabel<-"Trial design"
      acttds<-unique(d$trialdesign)
      bklabels<-vector(mode="character",length=length(acttds))
      for(iT in 1:length(bklabels)){
        itd<-acttds[iT]
        bklabels[iT]<-data$parameterselections$trialdesigns[[itd]]$metadata$name_shortform[[1]]
      }
    }else if(op[iop]=="respparamset"){
      axlabel<-"Response parameters"
      bklabels<-vector(mode="character",length=length(data$parameterselections$respparamsets))
      for(iT in 1:length(bklabels)){
        bklabels[iT]<-data$parameterselections$respparamsets[[iT]]$name
      }
    }else if(op[iop]=="blparamset"){
      axlabel<-"Baseline parameters"
      bklabels<-vector(mode="character",length=length(data$parameterselections$blparamsets))
      for(iT in 1:length(bklabels)){
        bklabels[iT]<-data$parameterselections$blparamsets[[iT]]$name
      }
    }else if(op[iop]=="censorparamset"){
      axlabel<-"Censoring parameters"
      bklabels<-c("No censoring",data$parameterselections$censorparams$names)
      d$censorparamset<-d$censorparamset+1
    }else if(op[iop]=="carryover_t1half"){
      axlabel<-"Carryover (in weeks)"
      bklabels<-unique(d[,get(op[iop])])
      conv<-data.table(carryover_t1half=bklabels,new=1:(length(bklabels)))
      d<-merge(d,conv,by="carryover_t1half")
      d[,carryover_t1half:=new]
      d[,new:=NULL]
    }else{
      axlabel<-merge(data.table(handle=op[iop]),labelcode,by="handle",all=FALSE)$label
      bklabels<-unique(d[,get(op[iop])])
      if(class(d[,get(op[iop])])=="logical"){
        evalstring<-paste("d$",op[iop],"<-as.integer(d[,get(op[iop])])",sep="")
        eval(parse(text=evalstring))
        }
    }
    oplabels[[iop]]<-list(axlabel=axlabel,bklabels=bklabels)
  }

  # Change the labels on the stuff that will be faceted
  for(iop in 1:2){
    d[[op[iop]]]<-as.factor(d[[op[iop]]])
    levels(d[[op[iop]]])<-oplabels[[iop]]$bklabels
  }

  d$tID<-1:dim(d)[1]
  d[,mean_beta:=mean(beta),by=op]
  d[,mean_betaSE:=mean(betaSE),by=op]
  d[,sd_beta:=sd(beta),by=op]
  d[,sig_p:=(p<.05),by=tID]
  d[,power:=mean(sig_p),by=op]
  d[,mean_frac_NA:=mean(frac_NA),by=op]
  d[,CV:=sd_beta/mean_beta,by=op]
  d[,fractionSingular:=sum(issingular==TRUE,na.rm=TRUE)/length(issingular),by=op]
  d[,fractionNA:=sum(is.na(issingular))/length(issingular),by=op]
  d[,fractionNotSingular:=sum(issingular==FALSE,na.rm=TRUE)/length(issingular),by=op]
  d[,toplot:=get(param2plot)]
  ds<-d
  ds[,c("tID","beta","betaSE","p","sig_p"):=NULL]
  ds<-unique(ds)
  palettevalues=c(0,.7,1)

  if(param2plot=="power"){
    legendtitle<-"Power"
  }else if(param2plot=="sd_beta"){
    legendtitle<-"SD of beta"
  }else if(param2plot=="mean_betaSE"){
    legendtitle<-"Mean SE of \ncoefficient"
  }else{
    legendtitle<-param2plot
  }
    # suppress the warnings thrown by the scale_y_continuous line:
  suppressWarnings(
    p<-ggplot(data=ds,aes_string(op[3],op[4])) +
      geom_tile(aes(fill=toplot),colour="white") +
      scale_fill_distiller(type="seq",palette="RdYlBu",values=palettevalues,legendtitle) +
      geom_text(aes(label=round(toplot,dig=2)),size=3,na.rm=TRUE) +
      scale_x_continuous(name=oplabels[[3]]$axlabel,breaks=unique(ds[,get(op[3])]),
                         labels=oplabels[[3]]$bklabels) +
      scale_y_continuous(name=oplabels[[4]]$axlabel,breaks=unique(ds[,get(op[4])]), # This line is where the warnings are coming from....??
                         labels=oplabels[[4]]$bklabels,
                         sec.axis=sec_axis(trans=~.,labels=NULL,breaks=NULL,
                                           name=oplabels[[2]]$axlabel)) +
      ggtitle(oplabels[[1]]$axlabel) +
      theme(plot.title = element_text(size=12,hjust=0.5))+
      facet_grid(reformulate(op[1], op[2]))
  )
  return(p)
}

#' Plot factor trajectories
#'
#' \code{plotfactortrajectories} plots the averaged output of the factors
#' used to generate simulated trial data
#'
#' @param data the \code{$rawdata$precensor} component of the output of \link{generateSimulatedResults}
#'   (run with rawdataout=TRUE)
#' @param tinfo the \code{$results} component of the output of \link{generateSimulatedResults}
#' @param tds the \code{$parameterselections$trialdesigns} component of the output of
#'   \link{generateSimulatedResults}
#' @param opt2plot a data.table with a single row using parameter names to instruct the
#'   function which parameters to hold constant and plot the results of
#' @param mergeacrossreps=TRUE logical input of whether to merge across repititions
#' @return a ggplot object containing the plot
#' @examples
#'   data(results_trajectories)
#'   data<-simresults$rawdata$precensor # this gives a list of length (total reps done)
#'   tinfo<-simresults$results # dim[1] of tinfo gives the parameter space that went in
#'   tds<-simresults$parameterselections$trialdesigns
#'   mergeacrossreps<-TRUE
#'   plots1<-vector(mode="list",length=length(simresults$parameterselections$trialdesigns))
#'   for(ip in 1:length(plots1)){
#'     param2hold<-data.table(carryover_t1half=0,trialdesign=ip)
#'     plots1[[ip]]<-plotfactortrajectories(data,tinfo,tds,param2hold,mergeacrossreps)
#'   }
#'   # See vignettes for additional details
#' @export
plotfactortrajectories<-function(data,tinfo,tds,opt2plot,mergeacrossreps=TRUE){

  # Pull out the trial design we're plotting
  if("trialdesign"%in%names(opt2plot)){
    td<-tds[[opt2plot$trialdesign]]
  }else{
    warning("You need to specify the trial design by index number")
  }

  # Turn tinfo into, essentially VPG from generateEsimulatedResults.R:
  tinfo<-unique(tinfo[,.(trialdesign,respparamset,blparamset,modelparamset,N,c.bm,
                       carryover_t1half,c.tv,c.pb,c.br,c.cf1t,c.cfct,censorparamset,
                       use_DE,t_random_slope)])
  # Since crossreferencing indexing, make it explicit:
  tinfo<-cbind(iR=1:(dim(tinfo)[1]),tinfo)
  # Only plot pre-censoring:
  tinfo<-tinfo[censorparamset==0]

  # Pull out the reps we're going to use
  # 1) What didn't have a section made, but isn't part of what's being intentionally varied?
  otherholds<-setdiff(names(tinfo),
                      c(names(opt2plot),"modelparamset","trialdesign",
                        "frac_NA","beta","betaSE","p","iR","irep",
                        "ETbea","ETbetaSE","issingular","warning"))
  keepR<-1:dim(tinfo)[1]
  for(iP in 1:length(param2hold)){
    n<-names(param2hold)[iP]
    kkeepR<-which(tinfo[[n]]==param2hold[[n]])
    keepR<-intersect(keepR,kkeepR)
  }
  for(iP in 1:length(otherholds)){
    param2pick<-tinfo[[otherholds[iP]]][1] # TAKING THE FIRST ONE
    kkeepR<-which(tinfo[[otherholds[iP]]]==param2pick)
    keepR<-intersect(keepR,kkeepR)
  }

  # keepR<-1:dim(tinfo)[1]
  # for(op in names(opt2plot)){
  #   kkeepR<-which(tinfo[[op]]==opt2plot[[op]])
  #   keepR<-intersect(keepR,kkeepR)
  # }

  # Compile the data we want to use and melt it for flexible plotting
  d<-data[[keepR[1]]][0,]
  for(kR in keepR){
    d<-rbind(d,data[[kR]])
  }
  if(!mergeacrossreps) d<-d[replicate==1]
  #d[,ptID:=NULL] # not unique after merge
  d<-cbind(ptID=1:dim(d)[1],d)
  suppressWarnings(d.m1<-melt(d,id.var=c("ptID","path","bm"),value.name="pointvalue"))
  d.m1$variable<-as.character(d.m1$variable)
  d.m1[,c("temp1","factor"):=tstrsplit(variable,"[.]")]
  d.m1[,c("temp2","temp3"):=tstrsplit(temp1,"_")]
  d.m1[,timept:=as.character(NA)]
  d.m1[,L_delta:=as.logical(NA)]
  # Check, as this got way more complicated than expected, fast
  #unique(d.m1[,.(variable,timept,factor,L_delta,temp1,temp2,temp3)])
  # Fix up the timept column
  d.m1[temp2=="D"]$timept<-d.m1[temp2=="D"]$temp3
  d.m1[temp2!="D"]$timept<-d.m1[temp2!="D"]$temp1
  #unique(d.m1[,.(variable,timept,factor,L_delta,temp1,temp2,temp3)])
  # Fix up the L_delta column
  d.m1[temp2=="D"]$L_delta<-TRUE
  d.m1[!is.na(factor)]$L_delta<-TRUE
  d.m1[is.na(L_delta)]$L_delta<-FALSE
  # Fix up the factor column
  d.m1[is.na(factor)]$factor<-"all"
  # Last check - HIGH RISH POINT FOR ERRORS
  #unique(d.m1[,.(variable,timept,factor,L_delta,temp1,temp2,temp3)])
  d.m1[,c("temp1","temp2","temp3"):=NULL]

  # Merge in the weeks, and do a version with mean/SDs
  timecrosswalk<-data.table(timept=td$metadata$timeptname,wk=td$metadata$timepoints)
  d.m2<-merge(d.m1,timecrosswalk,by="timept",all.x=TRUE,all.y=FALSE)
  d.m2[,meanvalue:=mean(pointvalue),by=c("variable","path")]
  d.m2[,sdvalue:=sd(pointvalue),by=c("variable","path")]
  d.m3<-unique(d.m2[,.(timept,path,factor,L_delta,wk,meanvalue,sdvalue)])

  # add in the baseline values...
  EG<-expand.grid(unique(d.m3$path),unique(d.m3$factor))
  setnames(EG,names(EG),c("path","factor"))
  BLvalues<-cbind(timept="BL",EG,L_delta=TRUE,wk=0,meanvalue=0,sdvalue=0)
  d.m3<-rbind(d.m3,BLvalues)

  # Order the factors, so will be consistent coloring:
  d.m3$factor<-factor(d.m3$factor,levels=c("all","br","pb","tv"))

  # Create the info for the bar at the bottom ID'ing when on drug
  mod<-td$metadata$ondrug
  w<-td$metadata$timepoints
  ondrug<-data.table(starts=numeric(),stops=numeric(),path=integer())
  for(path in 1:length(mod)){
    for(ep in 1:length(mod[[path]])){ # ep is the endpoint index
      if(mod[[path]][ep]==1){ # only add to list to plot if on drug at end of segment
        if(ep==1){start<-0}else{start<-w[ep-1]}
        stop<-w[ep]
        ondrug<-rbind(ondrug,data.table(starts=start,stops=stop,path=path))
      }
    }
  }

  # Plot by path
  p<-ggplot()+
    # Plot by factor, then sum...
    geom_point(data=d.m3[L_delta==TRUE],aes(x=wk,y=meanvalue,color=factor),size=.3)+
    geom_line(data=d.m3[L_delta==TRUE],aes(x=wk,y=meanvalue,color=factor),size=.3)+
    facet_wrap(~path)+
    labs(x="Weeks",y="Improvement in symptom intensity from baseline")+
    #    theme_minimal()+
    scale_color_manual(name="Factor",
                       values=c("red","blue","brown","black"),
                       breaks=c("all","br","pb","tv"),
                       labels=c("Sum","BR","ER","TR"))+
    # add the bar indicating when on drug...
    geom_segment(data=ondrug,aes(x=starts,y=-1.5,xend=stops,yend=-1.5),size=1)

  return(p)

}


#' Personalized medicine stimulation-based statistics
#'
#' The \code{pmsimstats} package is a simple set of tools to
#' help with implementing simulation-based powercalculations and
#' bias assessments the ability of different clinical trial
#' designs to answer personalized medicine (or precision medicine)
#' oriented hypotheses - specifically, the ability of a
#' biomarker measured at baseline to predict the biologic
#' response to a treatment, in the presence of other confounding
#' effects, including regression to the mean and expectancy
#' related effects.
#'
#' @section Further documentation:
#' See ___ publication or the two included vignettes, which
#' generate the analyses and figures from the publication.
#'
#' @docType package
#' @name pmsimstats
NULL
#' Generate Simulated Results
#'
#' \code{generateSimulatedResults} rotates through a parameter space, creating
#'   and analyizing simulated data and collating the results
#'
#' This is a wrapper for \link{generateData} and \link{lme_analysis} that
#' systematically works its way through a set of parameters and creates
#' a specified number of repetitions of simulated data and results of
#' analyizing that data for each set of parameter specifications.
#'
#' @param trialdesigns A list of trial designs of the form generated
#'   by \link{buildtrialdesigns}
#' @param respparamsets A list of options for \code{respparam} as
#'   described in \link{generateData}
#' @param blparamsets A list of options for \code{blparam} as
#'   described in \link{generateData}
#' @param censorparams A data.table where each row is a set of options for
#'   \code{censorparam} as described in \link{censordata}
#' @param modelparams A data.table where each row is a set of options for
#'   \code{modelparams} as described in \link{generateData}
#' @param simparam The options that control the logistics of the simulations:
#'   \itemize{
#'     \item{\code{Nreps}}{  Number of repititions for a given set of parameters}
#'     \item{\code{progressiveSave=TRUE}}{  Do you want to save chunks as you go?
#'       Helpful if the run is going to take a long time, and you don't want the
#'       run to be interrupted 8 hours in to a 9 hour run and lose everything.}
#'     \item{\code{basesavename}}{  If you are using progressivesave, what do you
#'       want the base of the filename to be?  Will have _save1, _save2, etc appended}
#'     \item{\code{nRep2save}}{  How many parameter sets to you want to run at
#'       a time before saving?}
#'     \item{\code{saveunit2start}}{  If you got through part of your parameter
#'       space but not all of it, you can start partway through. If starting
#'       from the beginning, set to 1}
#'     \item{\code{savedir}}{  What directory do you want to save to?}
#'   }
#' @param analysisparams The options used by \link{lme_analysis}. Has the
#'   following components:
#'   \itemize{
#'     \item{\code{useDE=TRUE}}{  Binary: should the LME model use expectnacy
#'       information if available?}
#'     \item{\code{t_random_slope=FALSE}}{  Binary: should the LME model use
#'       random slopes, as well as random intercepts, for participants?}
#'   }
#' @param rawdataout=FALSE Binary - output the raw data for all
#'   simulations?  This is memory intensive for large runs.
#' @return Returns a list with three named parts:
#'   \itemize{
#'     \item{\code{results}}{  A large data table that tells you the parameters used
#'       and the analysis results for every simulated trial, one row per simualated trial.}
#'     \item{\code{parameterselections}}{  A list desribing the parameterspace used.
#'       This is important because some of the information in \code{out$results} is
#'       the index of the parameter set selection, and you will need this additional
#'       information to interpret it. For example, \code{trialdesigns}, \code{respparamset},
#'       \code{blparamset}, and \code{modelparamset} are all indexed.}
#'     \item{\code{rawdataout}}{  If you have rawdataout set to \code{TRUE}, you will
#'       also get (1) \code{drawdataout$precensor}, a list of simulated trial data of
#'       a length corresponding to the number of parameter permulations, with the order
#'       parallel to that of the main output results. Repititions of a particular
#'       parameter set are mixed, identified by \code{irep}. And, \code{rawdataout$postcensor},
#'       which has the same format but with the psotcensoring data}
#'   }
#' @examples
#'   # See vignettes for examples
#' @export

generateSimulatedResults<-function(trialdesigns,respparamsets,blparamsets,
                                   censorparams,modelparams,simparam,analysisparams,
                                   rawdataout=FALSE){

  # defaults
  if(missing(analysisparams)) analysisparams<-list(useDE=TRUE,
                                                   t_random_slope=FALSE,
                                                   full_model_out=FALSE)

  # Hold original directory in case we change it
  initialdirectory<-getwd()

  # Setup our path through the variable parameters and
  # initialize counts used for tracking on the screen:
  tic("Total time:")
  VPGmaster<-expand.grid(
    trialdesign=1:length(trialdesigns),
    respparamset=1:length(respparamsets),
    blparamset=1:length(blparamsets),
    modelparamset=1:dim(modelparams)[1]
  )
  nV<-dim(VPGmaster)[1]
  # note that censor param not included because done after data generation
  irep<-1
  totrep<-nV*simparam$Nreps
  totparamsets<-nV

  # If we're doing a progressive save, have a "large loop" that controls the smaller inside loop
  if(simparam$progressiveSave==FALSE){
    nLargeLoops<-1
    LLstarts<-1
    LLstops<-nV
  }else{
    nLargeLoops<-ceiling(nV/simparam$nRep2save)
    if(nLargeLoops>1){
      LLstarts<-c(1,1+simparam$nRep2save*(1:(nLargeLoops-1)))
      LLstops<-c(LLstarts[2:nLargeLoops]-1,nV)
    }else{
      LLstarts<-1
      LLstops<-nV
    }
  }

  for(iLL in simparam$saveunit2start:nLargeLoops){
    tic(paste("***Progressive Save Unit ",(iLL+1-simparam$saveunit2start)," of ",(nLargeLoops+1-simparam$saveunit2start),
              " complete",sep=""))
    VPG<-VPGmaster[LLstarts[iLL]:LLstops[iLL],]
    iparamset<-LLstarts[iLL]

    # Create output structure:
    out<-cbind(VPG[0,],data.table(censorparamset=integer(),use_DE=logical(),t_random_slope=logical(),
                                  irep=integer(),frac_NA=numeric(),beta=numeric(),betaSE=numeric(),
                                  p=numeric(),issingular=logical(),warning=character()))
    if(rawdataout){d_out<-list(precensor=list(),postcensor=list())}

    # Loop through the different variable parameters
    for(iR in 1:dim(VPG)[1]){
      tic(paste("Parameter set ",iR," of ",dim(VPG)[1]," in this save unit now complete (on ",iparamset," of ",nV," total sets)",sep=""))
      td<-trialdesigns[[VPG[iR,"trialdesign"]]][[2]]
      p<-list()
      p$respparam<-respparamsets[[VPG[iR,"respparamset"]]]$param
      p$blparam<-blparamsets[[VPG[iR,"blparamset"]]]$param
      p$modelparam<-modelparams[VPG[iR,"modelparamset"],]

      # Run this set of params 1 time to create N*Nrep simuated participants then subset, for efficiency;
      # have to create the dat for each path and merge
      #
      nP<-length(td)
      Ns<-p$modelparam$N%/%nP # How many in each path run?
      Ns<-Ns+c(rep(1,p$modelparam$N%%nP),rep(0,nP-p$modelparam$N%%nP)) # distribute the remainder
      NNs<-Ns*simparam$Nreps # here adjust so doing all with this paramset at once
      p$modelparam$N<-NNs[[1]]
      dat<-generateData(p$modelparam,p$respparam,p$blparam,td[[1]],FALSE,TRUE)
      dat[,path:=1]
      dat[,replicate:=rep(1:simparam$Nreps,Ns[1])]
      if(nP>1){
        for(iP in 2:nP){
          p$modelparam$N<-NNs[[iP]]
          dat2<-generateData(p$modelparam,p$respparam,p$blparam,td[[iP]],FALSE,TRUE)
          dat2[,path:=iP]
          dat2[,replicate:=rep(1:simparam$Nreps,Ns[iP])]
          dat<-rbind(dat,dat2)
        }
      }
      p$modelparam$N<-sum(Ns)

      # Analysis chunk repeat x nAnalysisParametersets:
      for(iAP in 1:dim(analysisparams)[1]){
        # Analyze entire population to get "empircal true beta" (ETbeta)
        #tic("analysis-whole population")
        ETanalysisout<-lme_analysis(td,dat,analysisparams[iAP,])
        #toc()
        # Now, for each replicate: analyize and save output
        #tic("analysis-replicates")
        for(iS in 1:simparam$Nreps){
          # tryCatch(
          #   expr = {
          #     analysisout<-lme_analysis(td,dat[replicate==iS],analysisparams[iAP,])
          #     analysisout$warning<-NA
          #   },
          #   error=function(e){
          #     message('Error in call to lme_analysis')
          #     print(e)
          #     print(paste("Error occurred on VPG line ",iR," on analysis parameterset ",AP," on replicate ",iS,sep=""))
          #   },
          #   warning=function(w){
          #     message('Error in call to lme_analysis')
          #     print(w)
          #     print(paste("Error occurred on VPG line ",iR," on analysis parameterset ",AP," on replicate ",iS,sep=""))
          #     analysisout<-data.table(beta=NA,betaSE=NA,p=NA,issingular=analysisout$issingular,warning=w)
          #   }
          # )
          analysisout<-lme_analysis(td,dat[replicate==iS],analysisparams[iAP,])
          o<-cbind(VPG[iR,],as.data.table(p$modelparam),
                   data.table(censorparamset=0,use_DE=analysisparams[iAP,]$useDE,t_random_slope=analysisparams[iAP,]$t_random_slope,
                              irep=((iR-1)*simparam$Nreps+iS),frac_NA=0,ETbeta=ETanalysisout$beta,
                              ETbetaSE=ETanalysisout$betaSE,beta=analysisout$beta,betaSE=analysisout$betaSE,
                              p=analysisout$p,issingular=analysisout$issingular,warning=analysisout$warning))
          out<-rbind(out,o)
        }
        # Repeat for each set of censor parameters
        nocensoringflag<-FALSE
        if(length(censorparams)<2){
          if(is.na(censorparams)){
            nocensoringflag<-TRUE
          }
        }
        if(!nocensoringflag){
          h_datc<-vector(mode="list",length=(dim(censorparams)[1]))
          for(iC in 1:dim(censorparams)[1]){
            h_datc[[iC]]<-censordata(dat,td[[1]],censorparams[iC,])
            for(iS in 1:simparam$Nreps){
              frac_NA<-sum(is.na(h_datc[[iC]][replicate==iS]))/(p$modelparam$N*length(td[[1]]$t_wk))
              analysisout<-lme_analysis(td,h_datc[[iC]][replicate==iS],analysisparams[iAP,])
              o<-cbind(VPG[iR,],as.data.table(p$modelparam),
                       data.table(censorparamset=iC,use_DE=analysisparams[iAP,]$useDE,t_random_slope=analysisparams[iAP,]$t_random_slope,
                                  irep=((iR-1)*simparam$Nreps+iS),frac_NA=frac_NA,ETbeta=ETanalysisout$beta,
                                  ETbetaSE=ETanalysisout$betaSE,beta=analysisout$beta,betaSE=analysisout$betaSE,
                                  p=analysisout$p,issingular=analysisout$issingular,warning=analysisout$warning))
              out<-rbind(out,o)
            }
          }
          #toc()
        }else{
          h_datc<-NA
        }
      }

      if(rawdataout){
        d_out$precensor[[iparamset]]<-dat
        d_out$postcensor[[iparamset]]<-h_datc
      }

      iparamset<-iparamset+1
      toc()
      #progress(iparamset,max.value=totparamsets)
    } # end iR

    # Organize the output and return it
    if(rawdataout){
      outpt<-list(results=as.data.table(out),
                  parameterselections=list(trialdesigns=trialdesigns,
                                           respparamsets=respparamsets,
                                           blparamsets=blparamsets,
                                           censorparams=censorparams,
                                           modelparams=modelparams,
                                           analysisparams=analysisparams,
                                           simparam=simparam),
                  rawdata=d_out)
    }else{
      outpt<-list(results=as.data.table(out),
                  parameterselections=list(trialdesigns=trialdesigns,
                                           respparamsets=respparamsets,
                                           blparamsets=blparamsets,
                                           censorparams=censorparams,
                                           modelparams=modelparams,
                                           analysisparams=analysisparams,
                                           simparam=simparam))
    }

    if(simparam$progressiveSave){
      setwd(simparam$savedir)
      saveRDS(outpt,paste(simparam$basesavename,iLL,sep="_save"))
    }
    toc()
  } # end iLL
  toc()
  setwd(initialdirectory)
  if(!simparam$progressiveSave) return(outpt)
}
#' Generate simulated data
#'
#' \code{generateData} generates a set of simulated clinical trials' data
#'
#' This is the core data simulation code. See vignettes for additional details.
#' This function is primarily called by \link{generateSimulatedResults}.
#'
#' @param modelparam a datatable with a single entry per named column:
#' \itemize{
#'  \item{\code{N}}{  Number of simulated participants}
#'  \item{\code{c.bm}}{  Correlation between the biomarker and the biologic response}
#'  \item{\code{carryover_t1half}}{  Halflife of the carryover effect}
#'  \item{\code{c.tv}}{  Autocorrelation for the tv factor across timepoints}
#'  \item{\code{c.pb}}{ Autocorrelation for the pb factor across timepoints}
#'  \item{\code{c.br}}{  Autocorrelation for the br factor across timepoints}
#'  \item{\code{c.cf1t}}{  Correlation between different factors at a single timepoint}
#'  \item{\code{c.cfct}}{  Correlation between different factors at different timepoints}
#' }
#' @param respparam Data table with 5 columns and 3 rows. The first row lists the 3 factors
#'   defining the response trajectories (tv, pb, and br). The rest of the columns are the
#'   input parameters for the \link{modgompertz} function:
#' \itemize{
#'  \item{\code{max}}{  Maximum response value}
#'  \item{\code{disp}}{  Displacement}
#'  \item{\code{rate}}{  Rate}
#'  \item{\code{sd}}{  Standard deviation}
#' }
#' @param blparam Data table with 3 colums and 3 rows. First row lists the varaibles being
#'   defined, which consist of \code{BL} for the baseline value of the outcome measure, and
#'   \code{bm} for the biomarker values. Remaining columns:
#'   \itemize{
#'     \item{\code{m}}{  Mean}
#'     \item{\code{sd}}{  Standard deviation}
#'   }
#' @param trialdesign The information defining a single path through the clinical trial
#'   design, as defined by the \code{output$trialpaths} output of \link{buildtrialdesign}
#' @param empirical Should the correlations be empirical? (Usually \code{FALSE} unless
#'   you're doing something odd for testing purposes)
#' @param makePositiveDefinite Should the covariance matrix be forced to be positive definite?
#'   (Usually yes, although you may want to turn this off at times to test the impact of
#'   this step on your covariance sturcture)
#' @param seed Randomization seed (defaults to NA)
#' @param scalefactor TODO update when understand what this does?
#' @param verbose Set to \code{TRUE} if you want chatty outputs; defaults to \code{FALSE}
#' @returns A \code{dat} file that contains both the total symptom scores at each timepoint
#'   and also all the individual factors that were used to generate those total scores
#' @examples
#'   #See vignettes for examples of how to use this
#' @export


generateData<-function(modelparam,respparam,blparam,trialdesign,empirical,makePositiveDefinite,seed=NA,scalefactor=2,verbose=FALSE){

  # I. Turn the trial design information into something easier to use
  d<-data.table(trialdesign)
  d$t_wk_cumulative<-cumulative(d$t_wk)
  d[,onDrug:=(tod>0)]
  nP<-dim(trialdesign)[1]

  # II. Now set up a whole host of variables that we're going to want to track - essentially our baseline
  #     parameters for each subject ("bm","BL"), and the three modeled factors for each stage of the trial.

  # Set up the variable names
  cl<-c("tv","pb","br")
  labels<-c(c("bm","BL"),
            paste(trialdesign$timeptname,cl[1],sep="."),
            paste(trialdesign$timeptname,cl[2],sep="."),
            paste(trialdesign$timeptname,cl[3],sep="."))

  # Set up vectors with the standard deviations and means
  sds<-c(blparam[cat=="bm"]$sd,blparam[cat=="BL"]$sd)
  sds<-c(sds,rep(respparam[cat=="tv"]$sd,nP))
  sds<-c(sds,rep(respparam[cat=="pb"]$sd,nP)*trialdesign$e)
  sds<-c(sds,rep(respparam[cat=="br"]$sd,nP))
  means<-c(blparam[cat=="bm"]$m,blparam[cat=="BL"]$m)
  for (c in cl){
    rp<-respparam[cat==c]
    if(c=="tv"){
      means<-c(means,modgompertz(d$t_wk_cumulative,rp$max,rp$disp,rp$rate))
    }
    if(c=="pb"){
      means<-c(means,modgompertz(d$tpb,rp$max,rp$disp,rp$rate)*trialdesign$e)
    }
    if(c=="br"){
      brmeans<-modgompertz(d$tod,rp$max,rp$disp,rp$rate)
      ## Code added in by Ron Thomas:
      brtest <- brmeans == 0
      rawbrmeans <- brmeans
      names(brtest) <- labels[19:26]
      names(rawbrmeans) <- labels[19:26]
      ## End new code
      if(nP>1){
        for(p in 2:nP){
          if(!d[p]$onDrug){
            if(d[p]$tsd>0){
              brmeans[p]<-brmeans[p]+brmeans[p-1]*(1/2)^(scalefactor * d$tsd[p]/modelparam$carryover_t1half)
            }
          }
        }
      }
      means<-c(means,brmeans)
    }
  }

  # Turn this into a matrix of correlations
  correlations<-diag(length(labels))
  rownames(correlations)<-labels
  colnames(correlations)<-labels
  # Give some output if in verbose mode:
  if(verbose==TRUE){
    aa <- data.frame(modelparam$carryover_t1half, rawbrmeans, brmeans, diff = brmeans - rawbrmeans)
    cat("brmeans before and after adj:\n ")
    print(aa)
  }
  for(c in cl){
    # build in the autocorrlations across time
    if(nP>1){
      ac<-modelparam[(paste("c",c,sep="."))][,]
      for(p in 1:(nP-1)){
        for(p2 in (1+p):nP){
          n1<-paste(trialdesign$timeptname[p],c,sep=".")
          n2<-paste(trialdesign$timeptname[p2],c,sep=".")
          correlations[n1,n2]<-ac
          correlations[n2,n1]<-ac
        }
      }
    }
    # build in the autocorrelations across factors
    for(c2 in setdiff(cl,c)){
      for(p in 1:nP){
        n1<-paste(trialdesign$timeptname[p],c,sep=".")
        n2<-paste(trialdesign$timeptname[p],c2,sep=".")
        correlations[n1,n2]<-modelparam$c.cf1t
        correlations[n2,n1] <- modelparam$c.cf1t  ##### <--------FIXING TYPO, this was [n1,n2] again
      }
      for(p in 1:(nP-1)){
        for(p2 in (1+p):nP){
          n1<-paste(trialdesign$timeptname[p],c,sep=".")
          n2<-paste(trialdesign$timeptname[p2],c2,sep=".")
          correlations[n1,n2]<-modelparam$c.cfct
          correlations[n2,n1]<-modelparam$c.cfct
        }
      }
    }
    # correlation with biomarker
    for(p in 1:nP){
      n1<-paste(trialdesign$timeptname[p],"br",sep=".")
      ## RON THOMAS VERSION:
      if (p > 1) {
        n0 <- paste(trialdesign$timeptname[p - 1], "br", sep = ".")
        mm1 <- means[which(n1 == labels)]
        mm0 <- means[which(n0 == labels)]
        correlations["bm", n1] <- correlations[n1, "bm"] <- ifelse(brtest[p], ifelse(brmeans[p] == 0, 0, (mm1 / mm0) * modelparam$c.bm), modelparam$c.bm)
      }
      ##
      ## Following commented out in Ron Thomas version:
      #if(means[which(n1==labels)]!=0){
      #  correlations[n1,'bm']<-modelparam$c.bm
      #  correlations['bm',n1]<-modelparam$c.bm
      #}
      ## End commented out by Ron
    }
  }
  # Again, some output if in verbose mode:
  if(verbose==TRUE){
    cat("carryover: ")
    print(modelparam$carryover_t1half)
    cat("br correlations: \n")
    print(correlations[19:26, 1])
  }
  # Turn correlation matrix into covariance matrix
  sigma<-outer(sds,sds)*correlations
  #  should be faster: outer(S,S)*R where S is SDs and R is correlation matrix
  #  previous: sigma<-correlations*sds%o%sds

  # If turned on, force to be positive definite:
  if(makePositiveDefinite){
    if(!is.positive.definite(sigma)){
      sigma<-make.positive.definite(sigma, tol=1e-3)
    }
  }

  # Set seed:
  #if(is.na(seed)){seed<-round(rnorm(1,10000,10000))}
  #set.seed(seed)

  # Make the componant data!
  dat<-mvrnorm(n=modelparam$N,mu=means,Sigma=sigma,empirical=empirical)
  dat<-data.table(dat)
  setnames(dat,names(dat),labels)

  # TESTING
  #tdat1<-dat[, lapply(.SD, mean)]
  #plot(c(d$t_wk_cumulative),tdat1[,.(OL.br,BD.br,UD.br,COd.br,COp.br)])


  #tdat2<-dat[, lapply(.SD, mean)]
  #plot(c(d$t_wk_cumulative),tdat2[,.(OL.br,BD.br,UD.br,COd.br,COp.br)])

  # Turn it into actual fake data... calc intervals separately, because will use
  dat[,ptID:=1:modelparam$N]
  for(p in 1:nP){
    n<-trialdesign$timeptname[p]
    evalstring<-paste(
      "dat[,D_",n,":=sum(",paste(paste(n,cl,sep='.',collapse=','),sep=','),"),by='ptID']",
      sep="")
    eval(parse(text=evalstring))
  }
  for(p in 1:nP){
    n<-trialdesign$timeptname[p]
    evalstring<-paste(
      "dat[,",n,":=sum(BL,-",paste(paste(n,cl,sep='.',collapse=',-'),sep=','),"),by='ptID']",
      sep="")
    eval(parse(text=evalstring))
  }

  # TESTING
  #tdat3<-dat[, lapply(.SD, mean)]
  #plot(c(0,d$t_wk_cumulative),tdat3[,.(BL,OL,BD,UD,COd,COp)])
  #plot(c(d$t_wk_cumulative),tdat3[,.(D_OL,D_BD,D_UD,D_COd,D_COp)])

  return(dat)
}
#' Extracted baseline parameters for the vignette that reproduces the publication results.
#'
#' This contains the mean and standard deviation of (a) the standing systolic blood
#' pressures taken during orthostatic vital signs, and (b) the CAPS-IV PTSD assessment
#' total severity score.
#'
#' @docType data
#'
#' @usage data(extracted_bp)
"extracted_bp"

#' Extracted response parameters for the vignette that reproduces the publication results.
#'
#' This contains a set of maximum, displacement, rate and standard deviation values that, when
#' fed into the function \link{modgompertz} produces a combination of response factors
#' (biologic response to the drug, br here or BR in the publication; expectancy related response,
#' pb here or ER in the publication; and the non-treatment related time-varying factor, tv here
#' or TR in the publication) that together could plausibly describe our existing RCT data set.
#' This set of parameters was designed to make no other particular assumptions about how these
#' factors would perform - to be a 'tabula rasa' parameter set - where e.g. the timecourse and
#' maximum values of the expectancy related and tv/TR factors were assumed to be equal.
#'
#' @docType data
#'
#' @usage data(extracted_rp)
"extracted_rp"

#' The results of the vigenette Part 1 used to plot the trajectories
#'
#' @docType data
#'
#' @usage data(results_trajectories)
"results_trajectories"

#' The results of the vigenette Part 1 used to plot basic paramater space
#'
#' @docType data
#'
#' @usage data(results_core)
"results_core"

#' The results of the vigenette Part 1 used to plot rate response paramater space
#'
#' @docType data
#'
#' @usage data(results_rates)
"results_rates"

#' The results of the vigenette Part 1 used to plot maxes respone paramater space
#'
#' @docType data
#'
#' @usage data(results_maxes)
"results_maxes"

#' The results of existing clinical trial data organized as a data file, for use in Vignette 3
#'
#' @docType data
#'
#' @usage data(CTdata)
"CTdata"
#' Censor the data to simulate a particular pattern of dropouts
#'
#' \code{censordata} simulates a pattern of participant dropouts
#'
#' This function inputs a \code{dat} file as would be produced by
#' \link{generateData} and selects a subset to drop out (censor).
#' The probability of a particular timepoint being selected for
#' censoring in the initial pass is the sum of two factors: one
#' that censors at a set rate as a function of time, and one that
#' biases the rate of censoring based on the trajectory of symptoms,
#' with a large improvement in symptoms from baseline decreasing the
#' probability of dropout, while a small change or a worsening
#' increases the probability of dropout. In a separate, second pass
#' all dropouts are carried forward, such that once a datapoint
#' has been censored, all future datapoints for that particular
#' participant are also censored
#'
#' @param dat dat file as produced by \code{generateData}
#' @param trialdesign a trial design as defined by \code{buildtrialdesign}
#' @param censorparam a data.table provides 3 named parameters:
#'   (1) beta0 - the total rate (approximately) of censoring (0-1)
#'   (2) beta1 - the proportion of the total dropout that should be biased (0-1)
#'   (3) eb1 - the exponent used to define the shape of the curve defining biased dropout.
#'   See example, below.
#' @return A new data file, with the censored data points replaced by \code{NA},
#'   will be returned
#' @examples
#' # Create a set of censoring params to
#' # feed to censordata one at a time:
#' censorparams<-data.table(
#'   names=c("balanced","more of flat","more of biased","high dropout"),
#'   beta0=c(.05,.05 ,.05,.15),
#'   beta1=c(.5,.2,.8 ,.5),
#'   eb1=  c(2,  2  ,2  ,2 )
#'   )
#' @export


censordata<-function(dat,trialdesign,censorparam){
  #Stuff that should eventually be streamlined in trial def:
  d<-data.table(trialdesign)
  d$t_wk_cumulative<-cumulative(d$t_wk)
#  d[,onDrug:=(tod>0)]
  nP<-dim(trialdesign)[1]
  labels<-d$timeptname

  #Stuff to set up here
  deltas<-paste("D",labels,sep="_")
  evalstring<-paste("cdt<-dat[,.(",paste(deltas, collapse=', ' ),")]",sep="")
  eval(parse(text=evalstring))

  #Implement:
  frac_NA<-censorparam$beta0 #.3
  frac_NA_biased<-censorparam$beta1 #.6
  fna1<-frac_NA*(1-frac_NA_biased)
  fna2<-frac_NA*frac_NA_biased

  cdt_ps1<-cdt
  cdt_ps1[]<-1
  cdt_ps2<-sapply(cdt,function(x) ((x+100)^(censorparam$eb1)))
  cdt_p1<-t(t(cdt_ps1)*d$t_wk)
  cdt_p2<-t(t(cdt_ps2)*d$t_wk)
  cdt_r1<-runif(dim(cdt_p1)[1]*dim(cdt_p1)[2],min=0,max=2*mean(cdt_p1)*(.5/fna1))
  cdt_r2<-runif(dim(cdt_p2)[1]*dim(cdt_p2)[2],min=0,max=2*mean(cdt_p2)*(.5/fna2))
  do1<-as.data.table(cdt_p1>cdt_r1) # not yet cumulative, do this in masking stage
  do2<-as.data.table(cdt_p2>cdt_r2) # not yet cumulative, do this in masking stage
  do<-as.data.table(do1|do2)
  #sum(do==TRUE)/(dim(do1)[1]*dim(do1)[2])

  # TESTING:
  #  sum(do1==TRUE)/(dim(do1)[1]*dim(do1)[2])
  #  sum(do2==TRUE)/(dim(do2)[1]*dim(do2)[2])
  #test1<-melt(cdt)
  #test2<-melt(cdt_ps2)
  #test3<-data.table(x=test1$value,y=test2$value)
  #plot(test3$x,test3$y)

  # Apply to dat:
  for(itp in 1:length(labels)){
    for(tp in labels[1:itp]){
      masking_tp<-paste("D",tp,sep="_")
      masked_tp<-labels[itp]
      evalstring<-paste("dat$",masked_tp,"[do$",masking_tp,"]<-NA",sep="")
      eval(parse(text=evalstring))
    }
  }

  return(dat)
}
#' Build your proposed clinical trial design
#'
#' \code{buildtrialdesign} inputs the details of a proposed clinical trial
#' design in an intuitive way and outputs those details in a structured form
#' that can be used by the pmsimstat package simulation tools
#'
#' @seealso
#' \link{ggplot2} for what it's using to plot
#'
#' @param name_longform A name that will clearly remind you what this trial
#'   design is. Can have spaces, etc. If not provided, will default to "Trial
#'   Design 1"
#' @param name_shortform An abbreviated version of the name that can be used
#'   on plots, in your code when you want to select a certain trial design, etc.
#' @param timepoints A numeric vector containing the timepoints (not including
#'   baseline) at which anything of relevance happens in your trial design. You
#'   must specify any timepoint where a measurement will be taken, the participant's
#'   expectancy about what intervention they are receiving changes, or the actual
#'   intervention itself (e.g. all randomizationpoints) can change.
#'   NOTE: The unit used is not specified - you must just ensure that the unit you
#'   use here is consistent with the unit used for e.g. the halflife of any
#'   carryover effect. Typical units would be weeks or days.
#' @param timeptnames A character vector containing brief labels for each of the timepoints.
#'   If your trial has different phases, it can be helpful to incorporate those into the
#'   timepoint names for later easy reference. It is helpful if they are short enough to
#'   use as x-axis labels on a plot. If not specified, will default to "V1, V2, V3..." for
#'   Visit 1, Visit 2, etc.
#' @param expectancies A numeric vector with all values ranging from 0 to 1, of the same length
#'   as timepoints. These values represent the "expectancy" a participant has at any given
#'   point that they are receiving active, effective treatment. It can match the probability
#'   that a participant is receiving active treatment (e.g. 0.5 for 1:1 randomization of
#'   active drug to placebo), but does not have to. It is used to scale the degree of the
#'   expectancy-related response factor.
#' @param ondrug A list containing binary vectors that describe when participants are on active
#'   treatment or not (whether they are on a placebo or not is irrelevant for this paramater).
#'   Each vector is same length as the number of timepoints, and specificies that someone
#'   is either on active treatment at that timepoint (1) or not on active treatment at that
#'   timepoint (0). Each possible path through the trial is described by a separate vector.
#'   E.g., an open label trial will only have one path (a list with one vector, which contains
#'   all 1's), a traditional parallel group RCT would have 2 paths (all 1's and all 0's), and
#'   a trial with two randomization points would have 4 paths.
#' @return \code{output$metadata} contains the input variables you used for future reference,
#'   named as above.
#' @return \code{output$trialpaths} contains a list whose length is defined by the number
#'   of paths through the clinical trial (ie, the length of the input variable ondrug, above),
#'   containing the information about the trial in the form required by the pmsimstat tools.
#' @examples
#' tdOL<-buildtrialdesign(
#'         name_longform="open label",
#'         name_shortform="OL",
#'         timepoints=cumulative(rep(2.5,8)),
#'         timeptname=paste("OL",1:8,sep=""),
#'         expectancies=rep(1,8),
#'         ondrug=list(
#'           pathA=rep(1,8)
#'           )
#'         )
#'   #Builds a 20 week entirely open-label trial
#'
#'tdCO<-buildtrialdesign(
#'         name_longform="traditional crossover",
#'         name_shortform="CO",
#'         timepoints=cumulative(rep(2.5,8)),
#'         timeptname=c(paste("COa",1:4,sep=""),paste("COb",1:4,sep="")),
#'         expectancies=rep(.5,8),
#'         ondrug=list(
#'           # Assumes on drug entire previous interval and this measurement point
#'           pathA=c(1,1,1,1,0,0,0,0),
#'           pathB=c(0,0,0,0,1,1,1,1)
#'           )
#'         )
#'    # Builds a traditional crossover trial, 20 weeks long, with no washout period
#' @export

buildtrialdesign<-function(name_longform,name_shortform,
                           timepoints,timeptnames,expectancies,ondrug){

  # Add defaults for unspecified parameters
  if(missing(name_longform)){name_longform="Trial Design 1"}
  if(missing(name_shortform)){name_shortform=name_longform}
  if(missing(timeptnames)){timeptnames=paste("V",1:(length(timepoints)),sep="")}

  # Build list of trial designs, one for each path through
  nTD<-length(ondrug)
  tds<-vector(mode="list",length=nTD)
  t_wk<-c(timepoints[1],
          timepoints[2:length(timepoints)]-timepoints[1:(length(timepoints)-1)])
  for(iTD in 1:nTD){
    od<-ondrug[[iTD]]
    od_intervals<-t_wk*od
    tod<-od_intervals
    tsd<-t_wk*(od!=1)
    tpb<-t_wk*(expectancies>0)
    for(iTP in 2:length(timepoints)){
      if(od_intervals[iTP]>0){
        tod[iTP]<-tod[iTP]+tod[iTP-1]
      }else{
        tsd[iTP]<-tsd[iTP]+tsd[iTP-1]
      }
      if(tpb[iTP]>0){
        tpb[iTP]<-tpb[iTP]+tpb[iTP-1]
      }
    }
    everondrug<-(cumulative(od)>0)
    tsd<-everondrug*tsd
    tds[[iTD]]<-data.table(timeptnames=timeptnames,t_wk=t_wk,e=expectancies,tod=tod,tsd=tsd,tpb=tpb)
  }

  # Retain the metadata in structured form in the output
  metadata<-list(name_longform=name_longform,
                 name_shortform=name_shortform,
                 timepoints=timepoints,
                 timeptnames=timeptnames,
                 expectancies=expectancies,
                 ondrug=ondrug)

  # Output
  out<-list(metadata=metadata,trialpaths=tds)
}