R/pipeline.R

Defines functions gtxpipe

Documented in gtxpipe

#' @import data.table

## FIXME Using contrasts as a variable name shadows a base function !!

## option namespace should all be
## gtx.* or gtxpipe.*

## pgx.eigenvec is actually used as a list of genotyped subjects plus any covariates desired

## FIXME add direct hook for "user-derived" endpoints

#' @export
gtxpipe <- function(gtxpipe.models = getOption("gtxpipe.models"),
                    gtxpipe.groups = getOption("gtxpipe.groups", data.frame(group = 'ITT', deps = 'pop.PNITT', fun = 'pop.PNITT', stringsAsFactors = FALSE)),
                    ## ugly to have this in the prototype (and hence verbatim in the man page)
                    ## Also R CMD CHECK thinks derivations.standard.IDSL is an unbound global variable
                    gtxpipe.derivations = getOption("gtxpipe.derivations", {data(derivations.standard.IDSL); derivations.standard.IDSL}),
                    gtxpipe.transformations = getOption("gtxpipe.transformations", data.frame(NULL)),
                    gtxpipe.eigenvec,
                    stop.before.make = FALSE) {
  ## arguments for project-specific

  data(gtx.version)
  message(Sys.time(), " gtxpipe() from package gtx version ", as.character(packageVersion("gtx")), " on ", R.version.string)
  message(Sys.time(), " gtx package build: ", gtx.version[1])
  # R.version$os %in% c("linux-gnu", "cygwin")

  ## Load any packages specified by gtxpipe.packages option, clean up ready to pass to pipeslave
  packages <- intersect(getOption('gtxpipe.packages', NULL), rownames(installed.packages()))
  for (package in packages) {
    message(Sys.time(), " gtxpipe: loading package '", package, "'")
    if (!require(package, character.only = TRUE, quietly = TRUE)) {
      stop("gtxpipe: fatal error, unable to load package '", package, "'")
    }
  }
  ## Note, any packages not available are silently ignored
  options(gtxpipe.packages = packages)
  
  usubjid <- as.character(getOption("gtx.usubjid", "USUBJID"))[1] # variable name for unique subject identifier
  ## Replace this with a function getusubjid that prints warning messages, applies make.names() etc
  
  ## options where read repeatedly in code, constant (in C const sense)
  ## and likely constant over different projects on same IT
  ## overwrite with failsafe defaults now
  options(gtxpipe.genotypes = as.character(getOption("gtxpipe.genotypes", "genotypes"))[1]) # directory containing genotype data
  options(gtxpipe.clinical = as.character(getOption("gtxpipe.clinical", "clinical"))[1]) # directory containing clinical data
  options(gtxpipe.analyses = as.character(getOption("gtxpipe.analyses", "analyses"))[1]) # top level directory for analyses
  dir.create(getOption("gtxpipe.analyses"), recursive = TRUE, showWarnings = FALSE) # FIXME throw error if fails?
  options(gtxpipe.outputs = as.character(getOption("gtxpipe.outputs", "outputs"))[1]) # target directory for outputs
  dir.create(getOption("gtxpipe.outputs"), recursive = TRUE, showWarnings = FALSE) # FIXME throw error if fails?

  ## If user identity not specified, determine from USER environment variable
  options(gtxpipe.user = as.character(getOption("gtxpipe.user", paste("USER", Sys.getenv("USER", unset = "unknown"), sep = ":")))[1])

  ## If date not specified, determine from system date
  options(gtxpipe.date = as.character(getOption("gtxpipe.date", format(Sys.Date(), "%Y-%b-%d")))[1])
  
  ## Check genotypes directory contains at least one dose/info pair and enumerate for check against done files later
  doses = gsub('\\.dose.gz$','',list.files(path = getOption("gtxpipe.genotypes"), pattern = '\\.dose.gz$'))
  infos = gsub('\\.info.gz$','',list.files(path = getOption("gtxpipe.genotypes"), pattern = '\\.info.gz$'))
  chunks = intersect(doses,infos)
  if (length(chunks) < 1) stop("Needs to be at least one *.dose.gz/*.info.gz file pair in the genotypes directory [", 
                                getOption("gtxpipe.genotypes"), "] you can set this directory as the gtxpipe.genotypes option")
  ##
  ## Check gtxpipe.models
  ##
  if (missing(gtxpipe.models) || is.null(gtxpipe.models)) stop("No models specified.  You need to set gtxpipe.models")
  gtxpipe.models <- as.data.frame(gtxpipe.models)
  if (nrow(gtxpipe.models) < 1) stop("No models specified.  You need to have at least one row in gtxpipe.models")
  with(list(mn = setdiff(c("model", "deps", "fun"), names(gtxpipe.models))), 
       if (length(mn) > 0) stop("Models not correctly specified.  You need to have columns ",
                                paste(mn, collapse = ", "), " in gtxpipe.models"))
  gtxpipe.models$model <- gsub("\\s+", "", gtxpipe.models$model)
  ## Enforce alphanumeric names because they will be used as directory names
  with(list(bm = !grepl("^[A-Za-z0-9]+$", gtxpipe.models$model)),
       if (any(bm)) stop("Models ", paste(gtxpipe.models[bm, "model"], collapse = ", "),
                         " have non-alphanumeric names.  You need to fix this in gtxpipe.models"))
  if (is.na(match("groups", names(gtxpipe.models)))) {
    warning("Models do not have analysis group(s) specified.  Defaulting to ITT")
    gtxpipe.models$groups <- "ITT"
    ## Note this group is always defined.  Should not always be called "ITT" though.  FIXME: Generalize all subjects concept.
  }
  if (is.na(match("contrasts", names(gtxpipe.models)))) {
    warning("Models do not have analysis contrast(s) specified.  Defaulting to none")
    gtxpipe.models$contrasts <- ""
  }
  if (is.na(match("cvlist", names(gtxpipe.models)))) {
    warning("Models do not have candidate variant(s) specified.  Defaulting to none")
    gtxpipe.models$cvlist <- ""
  }
  # FIXME: check column classes are all character?
  
  ##
  ## Check gtxpipe.groups
  ##
  if (missing(gtxpipe.groups) || is.null(gtxpipe.groups)) {
    ## This would not be needed if default in the function arg default
    warning("No groups specified.  Defaulting to a single (ITT) group.")
    gtxpipe.groups <- data.frame(group = 'ITT', deps = 'pop.PNITT', fun = 'pop.PNITT', stringsAsFactors = FALSE)
  }
  gtxpipe.groups <- as.data.frame(gtxpipe.groups)
  if (nrow(gtxpipe.groups) < 1) stop("No groups specified.  You need to have at least one row in gtxpipe.groups")
  with(list(mn = setdiff(c("group", "deps", "fun"), names(gtxpipe.groups))), 
       if (length(mn) > 0) stop("Groups not correctly specified.  You need to have columns ",
                                paste(mn, collapse = ", "), " in gtxpipe.groups"))
  gtxpipe.groups$group <- gsub("\\s+", "", gtxpipe.groups$group)
  ## Enforce alphanumeric names because they will be used as directory names
  with(list(bg = !grepl("^[A-Za-z0-9]+$", gtxpipe.groups$group)),
       if (any(bg)) stop("Groups ", paste(groups[bg, "group"], collapse = ", "),
                         "have non-alphanumeric names.  You need to fix this in gtxpipe.groups"))
  
  ##
  ## Check gtxpipe.derivations
  ##
  if (missing(gtxpipe.derivations) || is.null(gtxpipe.derivations)) stop("No derivations specified.  You need to set gtxpipe.derivations")
  gtxpipe.derivations <- as.data.frame(gtxpipe.derivations)  
  with(list(mn = setdiff(c("targets", "types", "deps", "data", "fun"), names(gtxpipe.derivations))), 
       if (length(mn) > 0) stop("Derivations not correctly specified.  You need to have columns ",
                                paste(mn, collapse = ", "), " in gtxpipe.derivations"))

  ##
  ## Check gtxpipe.transformations
  ##
  if (missing(gtxpipe.transformations)) gtxpipe.transformations <- NULL
  gtxpipe.transformations <- as.data.frame(gtxpipe.transformations)
  ## FIXME check code runs with no transformations
  
  ##
  ## Check groups and contrasts specified in gtxpipe.models are defined in gtxpipe.groups
  ## Compute modelwise multiple testing adjustment, alpha1=1/(total number of groups and contrasts)
  ## Compute analysis groups (agroups), the set of groups that require within-group analyses
  ##  e.g. a group not specified for a primary analysis but required for a contrast
  gtxpipe.models <- cbind(gtxpipe.models, 
                      do.call(rbind, lapply(1:nrow(gtxpipe.models), function(modelid) {
                        contrasts1 <- tokenise.whitespace(gtxpipe.models[modelid, "contrasts"])
                        ## make these local variables?
                        contrasts1.bad <- sapply(contrasts1, function(contrast1) return(length(unlist(strsplit(contrast1, "/"))) != 2))
                        if (any(contrasts1.bad)) {
                          stop('Model "', gtxpipe.models[modelid, "model"], '" has contrasts(s) ',
                               paste(contrasts1[contrasts1.bad], collapse = ", "),
                               ' not in format group1/group2')
                        }
                        groups1 <- tokenise.whitespace(gtxpipe.models[modelid, "groups"])
                        agroups <- unique(c(groups1, unlist(strsplit(contrasts1, "/"))))
                        agroups.missing <- is.na(match(agroups, gtxpipe.groups$group))
                        if (any(agroups.missing)) {
                          stop('Model "', gtxpipe.models[modelid, "model"], '" has undefined group(s) ',
                               paste('"', agroups[agroups.missing], '"', sep = "", collapse = ", "))
                        }
                        return(data.frame(alpha1 = 1/(length(contrasts1) + length(groups1)), 
                                          agroups = paste(agroups, collapse = " "), 
                                          stringsAsFactors = FALSE))
                      })))

  ## FIXME would be nice to use groupby to detect whether any contrasts use overlapping groups
  ## In theory, GC should fix this (albeit not optimally)
  
  ## The set of all variables we need for actual analysis
  ## Transformations -> new deps

  ## Compute model dependencies in terms of untransformed variables
  gtxpipe.models$depsu <- sapply(gtxpipe.models$deps, function(deps1) {
    paste(sapply(tokenise.whitespace(deps1), function(dep1) {
      mm <- match(dep1, gtxpipe.transformations$targets)
      if (is.na(mm)) return(dep1)
      return(gtxpipe.transformations$deps[mm])
    }), collapse = " ")})

  ## Should we do the same for group dependencies?
 
  deps <- unique(tokenise.whitespace(c(gtxpipe.derivations$targets[!gtxpipe.derivations$deps %in% c("pop", "demo")],  ##Add all variables back to be derived.
                                       gtxpipe.models$depsu, gtxpipe.groups$deps,
                                       "pop.PNITT", "pop.TRTGRP", "demo.SEX", "demo.AGE", "demo.RACE", "demo.ETHNIC")))
  
   ## force in pop.PNITT even though this is not a mandated variable per dsm ?
  ## Code below for groupall assumes pop.PNITT exists - there will be an error if not FIXME
  ## allow a force in list.
  ## sort columns by unique(forcelist, deps) before computing demographics tables

  with(list(bd = setdiff(deps, tokenise.whitespace(gtxpipe.derivations$targets))),
       if (length(bd) > 0) stop("Required variable(s) ", paste(bd, collapse = ", "),
                                " have no derivation rule(s) defined.  You need to add rules to gtxpipe.derivations"))
  message(Sys.time(), " gtxpipe: Required variables: ", paste(deps, collapse = ", "))
  
  ## which clinical datasets do we actually need?  clinical.derive *always* requires a pop dataset
  ddeps <- unique(c("pop", tokenise.whitespace(gtxpipe.derivations[sapply(1:nrow(gtxpipe.derivations), function(idx) {
    any(tokenise.whitespace(gtxpipe.derivations[idx, "targets"]) %in% deps)
  }), "deps"])))

  message(Sys.time(), " gtxpipe: Required datasets: ", paste(ddeps, collapse = ", "))

  message(Sys.time(), ' gtxpipe Looking for datasets in gtxpipe.clinical="', getOption("gtxpipe.clinical"), '"')
  clindata <- clinical.import(getOption("gtxpipe.clinical"), only = ddeps)
  
  with(list(bd = setdiff(ddeps, names(clindata))),
       if (length(bd) > 0) stop("Required dataset(s) not found"))
  ## FIXME would be nicer if clinical.import had its own error checking; should it have "only" and "require" options??
  
  #message(Sys.time(), " gtxpipe: Read clinical datasets OK")
  anal1 <- clinical.derive(clindata, gtxpipe.derivations, only = deps)
  anal1$pop.PNITT[is.na(anal1$pop.PNITT)]<- F
  ## FIXME why is this so slow?
  message(Sys.time(), " gtxpipe: Computed derived variables OK")

  ## FIXME ancestry PCs are not written here, note if merge, should merge all=TRUE since
  ## we will want clinical data for non-PGx subjects
  write.csv(anal1, 
            file = file.path(getOption("gtxpipe.outputs"), "subject_analysis_dataset.csv"))
            
  ## FIXME hook for user derived variables needed here (in case used in group defs etc)

  ## Order columns as follows:
  ##   USUBJID is first
  ##   if variable has a descriptor, include next, in the order appearing in clinical.descriptors
  ##   the rest
  anal1 <- anal1[ ,
                 order(ifelse(names(anal1) == usubjid, 0, 1),
                       match(names(anal1), names(getOption("clinical.descriptors", NULL))))]


#  inc1 <- with(anal1, eval(parse(text = getOption("clinical.subset", 'pop.PNITT'))))

  if (!missing(gtxpipe.eigenvec) && file.exists(gtxpipe.eigenvec)) {
    ancestrypcs <- read.table(gtxpipe.eigenvec, header = FALSE, sep = " ", as.is = TRUE)[ , -1] # drop first (FID) column
    names(ancestrypcs) <- c(usubjid, paste("PC", 1:(ncol(ancestrypcs) - 1), sep = ""))
  } else {
    warning("PGx eigenvectors not available.  Continuing imperfectly")
    ancestrypcs <- anal1[ , usubjid, drop = FALSE]
  }
  npcs <- min(getOption("gtxpipe.no.PCs"), ncol(ancestrypcs)-1)
  if (npcs == 0) pcs.cov <- names(ancestrypcs)[-1][npcs]
  else pcs.cov <- names(ancestrypcs)[-1][1:npcs]  # list of pcs as default covs
  
  
  ## Even though we will re-apply transformations on subsets, they all should work on the complete analysis dataset
 if (nrow(gtxpipe.transformations) > 0) {
    anal2<- merge(anal1, ancestrypcs, all = T )
    options(na.action="na.exclude")
    for (idx in 1:nrow(gtxpipe.transformations)) {
      target <- gtxpipe.transformations$targets[idx]
      if (target %in% names(anal1)) stop("Transformation overwrites existing variable ", target)
      tryCatch(anal2[[target]] <- eval(parse(text = gtxpipe.transformations$fun[idx]), envir = anal2),
               error = function(e) 
                 stop("Error in transformation '", target, " <- ", gtxpipe.transformations$fun[idx], "':\n", e$message))
    }
  }
  
   
  ## is.PGx indicates PGx only if gtxpipe.eigenvec is one-to-one list of PGx subjects
  anal1$is.PGx <- anal1[[usubjid]] %in% ancestrypcs[[usubjid]]
  
  ## gtxpipe needs to know number enrolled for the automagic report.
  ## Making an additional "ITT" group would have meant that number enrolled/PGx
  ## would automatically appear as a row in gtxpipe.groups, but then would
  ## have to program around unwanted columns appearing in the demographics table,
  ## and messy programming in the calculation of whether any groups overlap.
  ## Hence, storing this information separately.  Percent PGx can be calculated downstream
  ##
  ## Conceptually what we want is all subjects enrolled, which may not always
  ## be the same as the ITT population.  Could perhaps more robustly assume that
  ## enrolment corresponds to *some* flag pop.XXXX and use an option to specify?
  ## The definition used here should be linked to the "Population: ITT" header in displays
  groupall <- with(anal1, list('All enrolled, ITT' = pop.PNITT,
                               'All enrolled, PGx' = pop.PNITT & is.PGx))

  ## FIXME tryCatch inside here
  groupby <- do.call(c, lapply(1:nrow(gtxpipe.groups), function(idx) {
    return(with(anal1, list(eval(parse(text = gtxpipe.groups$fun[idx])),
                            eval(parse(text = gtxpipe.groups$fun[idx])) & is.PGx)))
  }))
  
  names(groupby) <- paste(rep(gtxpipe.groups$group, each = 2),
                          rep(c("ITT", "PGx"), times = nrow(gtxpipe.groups)),
                          sep = ", ")

  gtxpipe.groups$N.ITT <- sapply(groupby[paste(gtxpipe.groups$group, "ITT", sep = ", ")], sum)
  gtxpipe.groups$N.PGx <- sapply(groupby[paste(gtxpipe.groups$group, "PGx", sep = ", ")], sum)

  ## Automagic report what arms (pop.TRTGRP groups) are actually included 
  ## NOTE we are assuming the user plays nice and if they define an overwriting transform this still works
  ## FIXME maybe we should not allow overwriting transforms?
  ## FIXME instead a clinical.ordering list of preferred orders // sort by count otherwise?
  gtxpipe.groups <- cbind(gtxpipe.groups, 
    do.call(rbind, lapply(groupby[paste(gtxpipe.groups$group, "ITT", sep = ", ")], function(gl) {
      tmp <- table(anal1$pop.TRTGRP[gl])
      tmp <- tmp[tmp > 0]
      if (length(tmp) == 0) return(data.frame(arms = "None", adjust.arm = NA, stringsAsFactors = FALSE))
      if (length(tmp) == 1) return(data.frame(arms = names(tmp), adjust.arm = FALSE, stringsAsFactors = FALSE))
      return(data.frame(arms = paste(names(tmp), " (N=", tmp, ")", sep = "", collapse = ", "),
                        adjust.arm = TRUE, stringsAsFactors = FALSE))
    })))

  snippets <- rbind(data.frame(value = c(
                                 getOption("gtxpipe.protocol", "NA"),
                                 getOption("gtxpipe.project", "NA"),
                                 getOption("gtxpipe.user", "NA"),
                                 getOption("gtxpipe.email", "NA"),
                                 getOption("gtxpipe.date", "NA"),
                                 R.version.string,
                                 as.character(packageVersion("gtx")),
                                 gtx.version[1]),
                               row.names = c("Protocol", "Project", "User", "Email", "DataAsOf", "R.version", "gtx.package.version", "gtx.package.build"),
                               stringsAsFactors = FALSE),
                    data.frame(value = format(sapply(groupall, sum)), # automatic row names
                               stringsAsFactors = FALSE),
                    data.frame(value = c(round(100*sum(groupall[[2]])/sum(groupall[[1]])),
                                         if (any(rowSums(as.data.frame(groupby)) > 1)) "overlapping" else "non-overlapping",
                                         if (any(gtxpipe.groups$adjust.arm, na.rm = TRUE)) "Yes" else "No"),
                               ## rowSums all 0 or 1, no overlapping groups.  Any >1 implies overlapping groups
                               ## Using Yes/No because output all text so downstream would have to parse even if logical
                               row.names = c("Overall PGx percent", "PGx group overlap", "PGx combines arms")))
  ## Should add the study name, groups, models, whether "efficacy" or "efficacy and safety" etc
 
  gtxpipe.groups$N.notPGx <-   gtxpipe.groups$N.ITT - gtxpipe.groups$N.PGx

  ## source item 2, can we call this disposition?
  ## write into source subdir
  rownames(gtxpipe.groups) <- gtxpipe.groups$group
#  print(gtxpipe.groups[ , c("group", "arms", "N.ITT", "N.PGx")])

  ## first call, no metadata
  metadata <- pipetable(gtxpipe.groups[ , c("group", "arms", "N.ITT", "N.PGx")],
                        "subject_disposition",
                        "Subject disposition by PGx analysis group",
                        number = 2) # specifying number because first 4 tables are generated out-of-order
#  message(Sys.time(), ' gtxpipe Writing subject disposition to "', file.path(getOption("gtxpipe.outputs"), "02_subject_disposition.csv"), '"')
#  write.csv(gtxpipe.groups[ , c("group", "arms", "N.ITT", "N.PGx")],
#            file = file.path(getOption("gtxpipe.outputs"), "02_subject_disposition.csv"),
#            row.names = TRUE)
  
             
  # apply(gtxpipe.groups[ , c("N.notPGx", "N.PGx")], 1, prettypc)[2, , drop = TRUE]




  
  ## allow a demographics sort list
  ## write into source subdir
  metadata <- pipetable(demographics(anal1, by = groupby), 
                        "subject_demographics",
                        "Demographics of subjects included in PGx analyses",
                        metadata, number = 3) # specifying number because first 4 tables are generated out-of-order
#  message(Sys.time(), ' gtxpipe Writing subject demographics to "', file.path(getOption("gtxpipe.outputs"), "03_subject_demographics.csv"), '"')
#  write.csv(demographics(anal1, by = groupby),
#            file = file.path(getOption("gtxpipe.outputs"), "03_subject_demographics.csv"),
#            row.names = TRUE)

  ## There's a dataframe (here) and an option with the same name - FIXME
  gtxpipe.analyses <- do.call(rbind, lapply(1:nrow(gtxpipe.models), function(modelid) {
    agroups <- tokenise.whitespace(gtxpipe.models[modelid, "agroups"])
    acontrasts <- tokenise.whitespace(gtxpipe.models[modelid, "contrasts"])
    aprimary <- c(agroups %in% tokenise.whitespace(gtxpipe.models[modelid, "groups"]),
                  rep(TRUE, length(acontrasts)))
    return(data.frame(model = rep(gtxpipe.models[modelid, "model"], length(agroups) + length(acontrasts)),
                      group = c(agroups, acontrasts),
                      primary = aprimary,
                      stringsAsFactors = FALSE))
  }))

  ## Currently we compute analysis N's below.  We need to do that earlier
  ## because trying to analyse N=0 groups causes lots of problems.
  ## Need to solve circular argument that adjust.arm depends on whether multiple arms
  ## in the analysis dataset and size of analysis dataset depends on number of subjects
  ## with non-missing covariates (which may include arm).  (And note we may have studies with
  ## missing TRTGRP or ATRTGRP)
  
  adir0 <- getOption("gtxpipe.analyses")

  thisR <- getOption("gtxpipe.slaveR", file.path(R.home(component = "bin"), "R"))
  
  sink("Makefile") # name of this file should be an option

  cat("## Makefile generated by gtxpipe()\n")
  cat(".PHONY:\tall\nall:\t\n\n") # check having all twice is okay
  
  cat("GENOTYPES := $(wildcard ", getOption("gtxpipe.genotypes"), "/*.dose.gz)\n\n", sep = "")

  ## Loop over analyses not nested loops, set N in gtxpipe.analyses
  ## Write analysis datasets and Makefile as side effects
  analN <- do.call(rbind, lapply(1:nrow(gtxpipe.models), function(modelid) {
    return(do.call(rbind, lapply(tokenise.whitespace(gtxpipe.models[modelid, "agroups"]), function(agroup1) {
      groupid <- match(agroup1, gtxpipe.groups$group)
      ## FIXME next line throws an error if adjust.arm is NA (implies N=0 in ITT)
      trtgrp.cov <- if (gtxpipe.groups$adjust.arm[groupid]) "pop.TRTGRP" else NULL
      adir <- file.path(adir0, gtxpipe.models[modelid, "model"], gtxpipe.groups[groupid, "group"])
      dir.create(adir, recursive = TRUE, showWarnings = FALSE)

      ## Ensure that relevant options set in this gtxpipe() call are available
      ## to slave calls
      sink(file.path(adir, "options.R")) # sink inside sink
      cat('options(gtx.usubjid = "', usubjid, '")\n', sep = '')
      cat('options(gtxpipe.genotypes = "', getOption("gtxpipe.genotypes"), '")\n', sep = '') # will break if not scalar!
      cat('options(gtxpipe.threshold.MAF = ', getOption("gtxpipe.threshold.MAF", 0.01), ')\n', sep = '')
      cat('options(gtxpipe.threshold.Rsq = ', getOption("gtxpipe.threshold.Rsq", 0.01), ')\n', sep = '')
      cat('options(gtxpipe.extended.output = ', getOption("gtxpipe.extended.output", TRUE), ')\n', sep = '')
      if (length(getOption("gtxpipe.packages")) > 0) {
        cat('options(gtxpipe.packages = c(',
            paste('\'', getOption("gtxpipe.packages"), '\'', sep = '', collapse = ', '),
            '))\n', sep = '')
      }
      sink() # options.R

      adata <- merge(anal1[groupby[[paste(agroup1, ", PGx", sep = "")]],
                           c(usubjid,  
                             tokenise.whitespace(gtxpipe.models[modelid, "depsu"]),
                             trtgrp.cov)],
                     ancestrypcs,
                     all = FALSE)
      ## Remove subjects with any missing data on any dependency variable
      adata <- adata[which(apply(!is.na(adata), 1, all)), ]
      ## Transformations must be applied to *the analysis dataset*
      ## ONLY APPLY RELEVANT TRANSFORMATIONS depsu -> deps
      ## Although transformations are applied by gtxpipeslave
      ##   (so that transformations can create R classes such as factor, Surv etc)
      ##   transformations also applied here to remove subjects missing-after-transformation
      ##   and to fit null model
      for (idx in which(gtxpipe.transformations$targets %in% tokenise.whitespace(gtxpipe.models[modelid, "deps"]))) {
        target <- gtxpipe.transformations$targets[idx]
        if (target %in% names(adata)) stop("Transformation overwrites existing variable")
        message(target, " <- ", gtxpipe.transformations$fun[idx])
        adata[[target]] <- eval(parse(text = gtxpipe.transformations$fun[idx]), envir = adata)
      }
      ## Remove any subjects with missing model deps
      ##   (note, missing depsu might be okay e.g. derivation returns missing which is converted to
      ##    numeric by transformation e.g. Surv2(t1, t2))
      sel <- apply(!is.na(adata[ , unique(c(usubjid, 
                                            tokenise.whitespace(gtxpipe.models[modelid, "deps"]), 
                                            trtgrp.cov,
                                            names(ancestrypcs)))]), 1, all)
      adata <- adata[sel, , drop = FALSE]

      ## Everything except usubjid will be a factor

      ## Everything as paths relative to getwd() at runtime

      message(Sys.time(), " gtxpipe: Fitting model ", gtxpipe.models[modelid, "model"], " in group ", agroup1)
      mtmp <- eval(parse(text = gtxpipe.models[modelid, "fun"]), envir = adata)
      if (length(c(trtgrp.cov, pcs.cov)) > 0) {
        m0 <- update(mtmp, formula = as.formula(paste("~ . +", paste(c(trtgrp.cov, pcs.cov), collapse = "+"))))
      } else {
        m0 <- mtmp
      }
      
      ## Print model summary in Gx report.
      ## NOTE THAT WITH EMPTY MODELS drop1() throws an unhelpful error message
      ## drop1(m0, test="Chisq")

      ## Should test and not overwrite, or clear out analyses if updating

      sink(file.path(adir, "analysis-dataset.csv")) # sink inside sink
      cat('# Analysis dataset for model "', gtxpipe.models[modelid, "model"], '" in group "', gtxpipe.groups[groupid, "group"], '"\n', sep = '')
      ## first 8 characters MUST be '# call : ' followed by call to fit null model
      cat('# call: ', as.character(as.expression(m0$call)), '\n', sep = '')

      ## optional lines for transformations, e.g. '# where: osMonths <- Surv(SRVMO, SRVCFLCD)'
      for (idx in which(gtxpipe.transformations$targets %in% tokenise.whitespace(gtxpipe.models[modelid, "deps"]))) {
        cat('# where: ', gtxpipe.transformations$targets[idx], ' <- ', gtxpipe.transformations$fun[idx], '\n', sep = '')
      }
      
      ## including candidate variant list to force analysis regardless of MAF and RSQR filters
      ## Note that ideally this should be passed through the options.R file, not in the model/data spec
      cat('# cvlist: ', gtxpipe.models[modelid, "cvlist"], '\n', sep = '')

      sink()
      ## Only write untransformed data since pipeslave will reapply transformations
      suppressWarnings(write.table(adata[ , unique(c(usubjid, 
                                                     tokenise.whitespace(gtxpipe.models[modelid, "depsu"]), 
                                                     trtgrp.cov,
                                                     names(ancestrypcs)))], 
                                   sep = ",", row.names = FALSE, 
                                   file = file.path(adir, "analysis-dataset.csv"),
                                   append = TRUE))

      #If cvlist specified, convert to bed file of regions +/- 500 kb for tabix extract of full genome results
      if (!is.na(gtxpipe.models[modelid, "cvlist"]) && gtxpipe.models[modelid, "cvlist"] != '') {
        pipebed(snplist = tokenise.whitespace(gtxpipe.models[modelid, "cvlist"]),
                flank = 500000, outfile = file.path(adir, "CV.bed"))
      }

      cat('# Analysis for model "', gtxpipe.models[modelid, "model"], '" in group "', gtxpipe.groups[groupid, "group"], '"\n', sep = '')
      cat('MODEL', modelid, 'GROUP', groupid, ' := $(patsubst ', getOption("gtxpipe.genotypes"), '/%.dose.gz,', adir, '/%.done,$(GENOTYPES))\n', sep = '')
      cat('$(MODEL', modelid, 'GROUP', groupid, '):\t', adir, '/%.done:\t', getOption("gtxpipe.genotypes"), '/%.info.gz ', getOption("gtxpipe.genotypes"), '/%.dose.gz\n', sep = '')
      cat('\t@(echo "library(gtx); gtx:::pipeslave(target = \\"$@\\")" | ', thisR, ' --quiet --vanilla && uname -a >$@)\n\n', sep = '')
      ## delete output, delete done file, run R, touch done file
      ## 		mkdir -p a1; rm -f $@; sleep 60; uname -a >$@; date >>$@
      ## Once all chunks "done", combine into a single tabix'd file
      ## Per qmake man page, need to combine multiple commands into single rule by separating with ;
      cat(adir, '/ALL.out.txt.gz: $(MODEL', modelid, 'GROUP', groupid, ')\n', sep='')
      cat("\tzgrep -h '^SNP' ", adir, "/*out.gz | head -n 1 | awk 'BEGIN{FS=", '"\\t";OFS="\\t";} {print "#CHROM","POS",$$0;}', "' | gzip >", 
          adir, '/ALL.out.txt.gz0 ; \\\n', sep='')
      cat('\tif [ -e "', adir, '/ALL.out.txt.gz" ]; then zgrep -v ', "'^#CHROM' ", adir, '/ALL.out.txt.gz | gzip >>', adir, '/ALL.out.txt.gz0 ; fi ; \\\n', sep='')
      cat("\tzgrep -h -v '^SNP' ", adir, "/*out.gz | awk 'BEGIN{FS=", '"\\t";OFS="\\t";} {split($$1,coord,"[:_]"); print coord[1],coord[2],$$0;}', 
          "' | gzip >>", adir, '/ALL.out.txt.gz0 ; \\\n', sep='')
      cat('\tzcat ', adir, '/ALL.out.txt.gz0 | sort -T . -k 1,1 -k 2,2n | uniq | bgzip -f >', adir, '/ALL.out.txt.gz ; \\\n', sep='')
      cat('\trm ', adir, '/ALL.out.txt.gz0\n\n', sep='')
      cat(adir, '/ALL.out.txt.gz.tbi: ', adir, '/ALL.out.txt.gz\n', sep='')
      cat('\ttabix -f -b 2 -e 2 ', adir, '/ALL.out.txt.gz ; \\\n', sep='')
      cat('\trm ', adir, '/*out.gz\n\n', sep='')
      #If cvlist defined, extract results
      if (!is.na(gtxpipe.models[modelid, "cvlist"]) && gtxpipe.models[modelid, "cvlist"] != '') {
        cat(adir, '/CV.flanking.out.txt.gz: ', adir, '/ALL.out.txt.gz.tbi\n',sep='')
        cat('\ttabix -hB ', adir, '/ALL.out.txt.gz ', adir, '/CV.bed | gzip > ', adir, '/CV.flanking.out.txt.gz\n\n', sep='')
        return(data.frame(model = gtxpipe.models[modelid, "model"], group = agroup1, N = nrow(adata),
                        makevar = paste(adir, '/CV.flanking.out.txt.gz', sep = ''),
                        modelCall= gsub("formula = ", "", as.character(as.expression(m0$call))),
                        stringsAsFactors = FALSE))
      }
      else {
        return(data.frame(model = gtxpipe.models[modelid, "model"], group = agroup1, N = nrow(adata),
                        makevar = paste(adir, '/ALL.out.txt.gz.tbi', sep = ''),
                        modelCall= gsub("formula = ", "", as.character(as.expression(m0$call))),
                        stringsAsFactors = FALSE))
      }
    })))
  }))

  cat("all:\t", paste(analN$makevar, collapse = " "), "\n", sep = "") 
  sink() # Makefile

  gtxpipe.analyses$index <- 1:nrow(gtxpipe.analyses)
  gtxpipe.analyses <- merge(gtxpipe.analyses, analN, all.x = TRUE, all.y = FALSE)
  
  ## FIXME Would be nice to automagically compute N or N1/N2 for contrasts.  Needs to be done within levels of model.

  metadata <- pipetable(snippets,
                        "study_summary", "PGx study summary",
                        metadata,
                        number = 1) # specifying number because first 4 tables are generated out-of-order
#  write.csv(snippets, 
#            file = file.path(getOption("gtxpipe.outputs"), "01_study_summary.csv"),
#            row.names = TRUE)
  
  if (stop.before.make) return(invisible(NULL))
  
  ## Call "make all" using option for make command
  ## SGE_ARCH=lx24-amd64 qmake -v PATH -cwd -l qname=dl580 -- --jobs=4
  ## SGE_ARCH=lx24-amd64 nohup qmake -v PATH -cwd -l qname=dl580 -- --jobs=256 &
  ## consider adding -l h_data=4G
  ## consider a series of make comments run in series
  message(Sys.time(), " Running make now")
  makesuccess <- system(paste(getOption("gtxpipe.make", "make"), " 1>Makefile.out 2>Makefile.err", sep = ""))
  if (makesuccess != 0) {
    #stop("make failed, check Makefile.out and Makefile.err")
  }


  gtxpipe.results <- do.call(rbind, lapply(1:nrow(gtxpipe.models), function(modelid) {
    alpha <- .05 # this should be an option
    cvlist <- if (!is.na(gtxpipe.models[modelid, "cvlist"])) tokenise.whitespace(gtxpipe.models[modelid, "cvlist"]) else NULL
    if (length(cvlist) > 0) {
      ## alpha spend 0.5 for GWAS, 0.5 for CV
      thresh1 <- alpha*0.5*gtxpipe.models$alpha1[modelid]*1e-6
      thresh2 <- alpha*0.5*gtxpipe.models$alpha1[modelid]/length(cvlist)
    } else {
      thresh1 <- alpha*gtxpipe.models$alpha1[modelid]*1e-6
      thresh2 <- 0
    }
    #user specified p value threshold
     if("P_threshold_gwas" %in% names(gtxpipe.models)) 
      if(!is.na(gtxpipe.models[modelid, "P_threshold_gwas"]))
        thresh1 <- as.numeric(as.character(gtxpipe.models[modelid, "P_threshold_gwas"]))
    if("P_threshold_cv" %in% names(gtxpipe.models))
      if (!is.na(gtxpipe.models[modelid, "P_threshold_cv"]))
        thresh2 <- as.numeric(as.character(gtxpipe.models[modelid, "P_threshold_cv"]))
   
    
    
    out.signif <- 6
    agroups <- tokenise.whitespace(gtxpipe.models[modelid, "agroups"])
    res <- lapply(agroups, function(agroup1) {
      message(Sys.time(), " gtxpipe: Collating results for model ", gtxpipe.models[modelid, "model"], " in group ", agroup1)
      adir <- file.path(adir0, gtxpipe.models[modelid, "model"], agroup1)

      ## make can silently fail, need to check for .done files
      dones = gsub('\\.done$','',list.files(path = adir,pattern = '\\.done$'))
      if (length(dones) < length(chunks)) stop("Not all chunks analysed for model [", gtxpipe.models[modelid, "model"], 
                                               "] in group [", agroup1, "]. Try re-running to capture the missing chunks [", 
                                               paste(setdiff(chunks,dones), collapse=", "), "]")
      
      file.out <- file.path(adir, "ALL.out.txt.gz")
      file.out.gc<- file.path(adir, "ALL.out.gc.txt.gz")
      file.out.gc.sig <- file.path(adir, "GenomeWideSignif.out.gc.txt")
      file.out.gc.sig.flanking <- file.path(adir, "GenomeWideSignif.flanking.out.gc.txt.gz")
      file.out.gc.cv<- file.path(adir, "CV.out.gc.txt")
      file.out.gc.cv.flanking<- file.path(adir, "CV.flanking.out.gc.txt.gz")
      file.bed.cv<-file.path(adir, "CV.bed")
      file.bed.sig<- file.path(adir, "GenomeWideSignif.bed")
      
      ## Reading results which were compiled across chunks during make call
      res1 <- read.table(gzfile(file.out), quote = "", comment.char = "", header = TRUE, stringsAsFactors = FALSE, check.names=F)
      ## Confirm expected columns present
      stopifnot(all(c("#CHROM", "SNP", "pvalue", "beta", "SE") %in% names(res1)))
      #names(res1) <- gsub("X.", "", names(res1), fixed = T)
      ## Convert to data table to improve efficiency retaining only needed columns and rows with non-missing pvalue
      res1 <- data.table::data.table(res1[!is.na(res1$pvalue), ])
      ## Sort by SNP
      setkey(res1, SNP)
      ## Makes sense to apply GC and not store redundant (non-GCed) results here
      ## pvalue from 'best' method, LRT or F test
      lambda <- res1[ , gclambda(pvalue)]
      setattr(res1, "lambda", lambda)
      invisible(if (lambda <= 1.) res1[ , pvalue.GC := pvalue])
      invisible(if (lambda > 1.) {
        res1[ , pvalue.GC := signif(pchisq(qchisq(pvalue, df = 1, lower.tail = FALSE)/lambda, 
                                           df = 1, lower.tail = FALSE), digits = out.signif  )]
      })
      
      ## Apply GC to SEs using pvalues from Wald test
      lambdaWald <- res1[ , gclambda(pchisq((beta/SE)^2, df = 1, lower.tail = FALSE))]
      setattr(res1, "lambdaWald", lambdaWald)
      invisible(if (lambdaWald <= 1.) res1[ , SE.GC := SE])
      invisible(if (lambdaWald > 1.) {
        res1[ , SE.GC := signif(SE*sqrt(lambdaWald), digits = out.signif  )] 
      })
      
      ## Set SE.GC to NA when mis-calibrated,
      ## working definition being Wald chisquare statistic > 2. times the LRT chisquare statistic
      res1[(beta/SE.GC)^2 / qchisq(pvalue.GC, df = 1, lower.tail = FALSE) > 2., SE.GC := NA]
      
      #output gc results
      message(Sys.time(), ": Writing GWAS results to ", file.out.gc)
      write.table(res1, file=gzfile(file.out.gc), sep = "\t", row.names = F, quote = F)
      if(system(paste("zcat", file.out.gc,  "| sort -T . -k 1,1 -k 2,2n | uniq | bgzip -f >", paste(file.out.gc, "0", sep =""))) ==0) {
        system(paste("mv", "-f", paste(file.out.gc, "0", sep =""), file.out.gc))
        if(system(paste("tabix -f -b 2 -e 2", file.out.gc))!= 0)
          warning("gtxpipe: tabix ", file.out.gc, " failed")
      }
      else warning("gtxpipe: bgzip ", file.out.gc, " failed")
      
      #Create bed file to tabix out genome-wide significant results
      if (sum(res1$pvalue.GC <= thresh1, na.rm = TRUE) > 0) {
        snplist<- res1[res1$pvalue.GC <= thresh1,SNP]
        pipebed(snplist = snplist, flank = 500000, outfile = file.bed.sig)
        #Run tabix extraction
        message(Sys.time(), ": Writing GWAS significant results to ", file.out.gc.sig)
        write.table(res1[snplist,names(res1), with = FALSE], file=file.out.gc.sig, sep = "\t", row.names = F, quote = F)
        message(Sys.time(), ": Writing GWAS significant results plus flanking region to ", file.out.gc.sig.flanking)
        tabixsuccess <- system(paste("tabix -hB", file.out.gc, file.bed.sig, "| gzip >", file.out.gc.sig.flanking))
        if (tabixsuccess != 0) {
          warning("gtxpipe: tabix extraction of genome-wide significant results failed for ", file.out.gc.sig.flanking)
        }
      }
      
      if (length(cvlist) > 0) {
        snplist<- intersect(cvlist, res1$SNP)
        message(Sys.time(), ": Writing candiate gene results to ",file.out.gc.cv)
        write.table(res1[snplist, names(res1), with = FALSE], file=file.out.gc.cv, sep = "\t", row.names = F, quote = F)
        message(Sys.time(), ": Writing candidate gene results plus flanking region to ", file.out.gc.cv.flanking)
        tabixsuccess <- system(paste("tabix -hB", file.out.gc, file.bed.cv, "| gzip >", file.out.gc.cv.flanking))
        if (tabixsuccess != 0) {
          warning("gtxpipe: tabix extraction of candidate gene gc results failed for ", file.out.gc.cv.flanking)
        }
      }
      
      setnames(res1, "SE.GC", paste("SE.GC", agroup1, sep = "."))
      setnames(res1, "beta", paste("beta", agroup1, sep = ".")) # named by groups, used in clever join in constrasts
      
      call.curr<- gsub(" ", "", analN[analN$model == gtxpipe.models[modelid, "model"] & analN$group==agroup1, "modelCall"])
      if(length(names(ancestrypcs)[-1]) > 1)
        call.curr<- gsub(paste(names(ancestrypcs)[-1], collapse = "+"), paste("PC1-", length(names(ancestrypcs)[-1]), sep=""), call.curr, fixed = T)
      plotdata <- rbind(snippets["Project", , drop = FALSE],
                        data.frame(value = c(lambda > 1., round(lambda, 4), 
                                     gtxpipe.models[modelid, "model"],
                                     call.curr,
                                     agroup1,
                                     res1[ , sum(!is.na(res1$pvalue.GC))]), 
                                   row.names = c("GenomicControl", "Lambda",
                                     "Model", "Call","Subgroup","PValues"),
                                   stringsAsFactors = FALSE))
      
      ## Note, QQ and Manhattan plots are drawn *after* genomic control
      assign("metadata", pipeplot('res1[ , qq10(pvalue.GC, pch = 20)]',
                                  filename = paste("QQ", gtxpipe.models[modelid,"model"], agroup1, sep = "_"),
                                  title = paste("QQ plot for", gtxpipe.models[modelid,"model"], "in group", agroup1),
                                  metadata,
                                  number = 5, # *start* at 5 to leave space for 04_summary_results
                                  plotdata = plotdata,
                                  plotpar = list(mar = c(4, 4, 0, 0) + 0.1)),
             pos = parent.frame(n = 4))
      ## Have to use assign(..., pos = ) to update metadata from inside two levels of nested anonymous function
      assign("metadata", pipeplot('res1[ , manhattan(pvalue.GC, SNP, pch = 20, cex = 0.5)]', 
                                  filename = paste("Manhattan", gtxpipe.models[modelid,"model"], agroup1, sep = "_"),
                                  title = paste("Manhattan plot for", gtxpipe.models[modelid,"model"], "in group", agroup1),
                                  metadata,
                                  number = 5, # *start* at 5 to leave space for 04_summary_results
                                  plotdata = plotdata,
                                  plotpar = list(mar = c(4, 4, 0, 0) + 0.1)),
             pos = parent.frame(n = 4))
      ## Have to use assign(..., pos = ) to update metadata from inside two levels of nested anonymous function

        return(res1)
    })
    names(res) <- agroups
    
    contrasts1 <- tokenise.whitespace(gtxpipe.models[modelid, "contrasts"])
    resc <- lapply(contrasts1, function(contrast1) {
      message(Sys.time(), " gtxpipe: Collating results for model ", gtxpipe.models[modelid, "model"], " for contrast ", contrast1)

      ## Are we better to do each contrast as a join, or lam the whole lot up and delete the unwanted columns later?
      
      groups <- unlist(strsplit(contrast1, "/"))
      stopifnot(identical(length(groups), 2L)) # internal error because checked this had length 2 previously
      group1 <- groups[1]
      group2 <- groups[2]
      
      ## More efficient to work with chi2 statistics, monotonic in P and using bespoke GC calculation
      ## ce = contrast expression
      ce <- parse(text = paste("list(chi2 = (beta.", group1, " - beta.", group2, ")^2/",
                    "(SE.GC.", group1, "^2 + SE.GC.", group2, "^2))", sep = ""))
      res1 <- res[[group1]][res[[group2]], eval(ce)]
      lambda <- res1[ , median(chi2, na.rm = TRUE)/qchisq(0.5, df = 1, lower.tail = FALSE)]
      setattr(res1, "lambda", lambda) # This is a Wald-like test but it's the primary one for the contrast
      invisible(if (lambda > 1.) {
        res1[ , pvalue.GC := signif(pchisq(chi2/lambda, df = 1, lower.tail = FALSE), digits = out.signif  )]
      } else {
        res1[ , pvalue.GC := signif(pchisq(chi2, df = 1, lower.tail = FALSE), digits = out.signif  )]
      })
      res1[ , chi2 := NULL]
      setkey(res1, SNP)
      
      adir <- file.path(adir0, gtxpipe.models[modelid, "model"])
      file.out.contrast<- file.path(adir, paste("ALL.", group1, "_vs_", group2, ".out.gc.txt.gz", sep= ""))
      file.out.contrast.sig<- file.path(adir, paste("GenomeWideSignif.", group1, "_vs_", group2, ".out.gc.txt", sep= ""))
      file.out.contrast.cv<- file.path(adir, paste("CV.", group1, "_vs_", group2, ".out.gc.txt", sep= ""))
      message(Sys.time(), ": Writing contrast result to ", file.out.contrast)
      write.table(res1, file=gzfile(file.out.contrast),  sep = "\t", row.names = F, quote = F)
      if (sum(res1$pvalue.GC <= thresh1, na.rm = TRUE) > 0) {
        snplist<- res1[res1$pvalue.GC <= thresh1,SNP]
        message(Sys.time(), ": Writing GWAS significant contrast results to ", file.out.contrast.sig)
        write.table(res1[snplist,], file=file.out.contrast.sig, sep = "\t", row.names = F, quote = F)
      }    
      if (length(snplist<- intersect(cvlist, res1$SNP)) > 0) {
        message(Sys.time(), ": Writing candiate gene contrast results to ",file.out.contrast.cv)
        write.table(res1[snplist,], file=file.out.contrast.cv, sep = "\t", row.names = F, quote = F)
      }
      

      ## Note we can't use '/' separated contrasts in filenames
      plotdata <- rbind(snippets["Project", , drop = FALSE],
                        data.frame(value = c(lambda > 1., round(lambda, 4), 
                                     gtxpipe.models[modelid, "model"],
                                     paste(group1, group2, sep = " vs "), 
                                     res1[ , sum(!is.na(res1$pvalue.GC))]), 
                                   row.names = c("GenomicControl", "Lambda",
                                     "Model", "Contrast","PValues"),
                                   stringsAsFactors = FALSE))
      
      assign("metadata", pipeplot('res1[ , qq10(pvalue.GC, pch = 20)]',
                                  filename = paste("QQ", gtxpipe.models[modelid, "model"], group1, "vs", group2, sep = "_"),
                                  title = paste("QQ plot for", gtxpipe.models[modelid, "model"], "contrasting", group1, "vs", group2),
                                  metadata,
                                  number = 5, # *start* at 5 to leave space for 04_summary_results
                                  plotdata = plotdata,
                                  plotpar = list(mar = c(4, 4, 0, 0) + 0.1)),
             pos = parent.frame(n = 4))
      ## Have to use assign(..., pos = ) to update metadata from inside two levels of nested anonymous function
      assign("metadata", pipeplot('res1[ , manhattan(pvalue.GC, SNP, pch = 20, cex = 0.5)]',
                                  ## Note manhattan() must cope with missing pvalue.GC from SNPs in group1 but not in group2
                                  filename = paste("Manhattan", gtxpipe.models[modelid,"model"], group1, "vs", group2, sep = "_"),
                                  title = paste("Manhattan plot for", gtxpipe.models[modelid,"model"], "contrasting", group1, "vs", group2),
                                  metadata,
                                  number = 5, # *start* at 5 to leave space for 04_summary_results
                                  plotdata = plotdata,
                                  plotpar = list(mar = c(4, 4, 0, 0) + 0.1)),
             pos = parent.frame(n = 4))
      ## Have to use assign(..., pos = ) to update metadata from inside two levels of nested anonymous function

      return(res1)
    })
    names(resc) <- contrasts1
    
    ## Note that ...
    resa <- c(res, resc) # ... is not fast
    
    ## Using the contrast names as part of column labels DOESN'T WORK
    return(do.call(rbind, lapply(names(resa), function(nn) {
      pcv <- resa[[nn]][cvlist , pvalue.GC]$pvalue.GC
      data.frame(model = gtxpipe.models[modelid, "model"],
                 group = nn,
                 lambda = attr(resa[[nn]], "lambda"),
                 thresh1 = thresh1,
                 hits1 = resa[[nn]][ , paste(sum(pvalue.GC <= thresh1, na.rm = TRUE), sum(!is.na(pvalue.GC)), sep = "/")],
                 ## hits1 !is.na() for denomin as constrasts have NA pvalues
                 hits2 = paste(sum(pcv <= thresh2, na.rm = TRUE), sum(!is.na(pcv)), sep = "/"),
                 thresh2 = thresh2,
                 stringsAsFactors = FALSE)
    })))
    ## Using primary threshold for non-primary analyses (specifically, groups used for contrasts but not of interest themselves
  }))
  
  ## Best place to compute per-analysis N???

  gtxpipe.results <- merge(gtxpipe.analyses, gtxpipe.results)
  gtxpipe.results <- gtxpipe.results[order(gtxpipe.results$index),]
  rownames(gtxpipe.results) <- 1:nrow(gtxpipe.results)
  gtxpipe.results$index <- NULL
  gtxpipe.results$makevar <- NULL
  gtxpipe.results$lambda <- round(gtxpipe.results$lambda, 4)
  
  ## set row names to something meaningful?
  metadata <- pipetable(gtxpipe.results,
                        "summary_results", "PGx analysis summary",
                        metadata,
                        number = 4) # specifying number because first 4 tables are generated out-of-order

  if (all(c("PC1", "PC2", "PC3") %in% names(ancestrypcs))) {
    ## Hard coding the options here; we want RACE; ETHNIC if available otherwise RACE,
    if (all(c("demo.RACE", "demo.ETHNIC") %in% names(anal1))) {
      adata <- merge(anal1, ancestrypcs, all = FALSE)
      metadata <- pipeplot('with(adata, pcaplot(cbind(PC1, PC2, PC3), paste(as.character(demo.RACE), as.character(demo.ETHNIC), sep = "; ")))', 
                           filename = "Ancestry_PC_RACE_ETHNIC", 
                           title = "Ancestry principal components by race and ethnicity",
                           metadata,
                           plotdata = rbind(snippets["Project", , drop = FALSE],
                             data.frame(value = "demo.RACE; demo.ETHNIC", 
                                        row.names = c("GroupedBy"),
                                        stringsAsFactors = FALSE)))
      ## plotpar not needed since pcaplot calls par within each (sub)screen
    } else if ("demo.RACE" %in% names(anal1)) {
      adata <- merge(anal1, ancestrypcs, all = FALSE)
      metadata <- pipeplot('with(adata, pcaplot(cbind(PC1, PC2, PC3), as.character(demo.RACE)))', 
                           filename = "Ancestry_PC_RACE", 
                           title = "Ancestry principal components by race",
                           metadata,
                           plotdata = rbind(snippets["Project", , drop = FALSE],
                             data.frame(value = "demo.RACE", 
                                        row.names = c("GroupedBy"),
                                        stringsAsFactors = FALSE)))
    }
    if ("demo.COUNTRY" %in% names(anal1)) {
      adata <- merge(anal1, ancestrypcs, all = FALSE)
      metadata <- pipeplot('with(adata, pcaplot(cbind(PC1, PC2, PC3), as.character(demo.COUNTRY)))', 
                           filename = "Ancestry_PC_COUNTRY", 
                           title = "Ancestry principal components by country",
                           metadata,
                           plotdata = rbind(snippets["Project", , drop = FALSE],
                             data.frame(value = "demo.COUNTRY", 
                                        row.names = c("GroupedBy"),
                                        stringsAsFactors = FALSE)))
    }
    if ("demo.REGION" %in% names(anal1)) {
      adata <- merge(anal1, ancestrypcs, all = FALSE)
      metadata <- pipeplot('with(adata, pcaplot(cbind(PC1, PC2, PC3), as.character(demo.REGION)))', 
                           filename = "Ancestry_PC_REGION", 
                           title = "Ancestry principal components by region",
                           metadata,
                           plotdata = rbind(snippets["Project", , drop = FALSE],
                             data.frame(value = "demo.REGION", 
                                        row.names = c("GroupedBy"),
                                        stringsAsFactors = FALSE)))
    }
  }
    
  message(Sys.time(), ' gtxpipe Writing source display metadata')
  metadata <- metadata[order(as.integer(metadata$number)), ]
  rownames(metadata) <- metadata$number
  metadata$number <- NULL
  ## FIXME, Should warn or truncate if titles >100 characters
  write.csv(metadata,
            file.path(getOption("gtxpipe.outputs"), "lela_metadata"))
  
  ## contrasts, assume independent (user responsibility) but GC will roughly control for overlapping groups
  ## Note the GC coefficient is computed after groupwise GC

  message(Sys.time(), ' gtxpipe Generating report')
  ### 
  ### Make "short" report, should switch on presence/absence of positive results
  ###
  if (file.exists(file.path(getOption("gtxpipe.outputs"), "report-short.Rmd"))) {
    # do nothing
  } else {
    file.copy(system.file("templates/gtxpipe-report-negative.Rmd",
                          package = "gtx", mustWork = TRUE),
              file.path(getOption("gtxpipe.outputs"), "report-short.Rmd"))
  }
  tryCatch({
    suppressMessages(requireNamespace("knitr", quietly = TRUE)) || stop("knitr package not available")
    oldwd <- getwd()
    setwd(getOption("gtxpipe.outputs"))
    knitr::knit2html("report-short.Rmd")
    setwd(oldwd)
  },
           error = function(e) {
             message(Sys.time(), " knitr::knit2html failed")
           })
  
  ## Note, no need to read info files if no hits...

  ## Default filtering on Rsq (at e.g. 0.05) would eliminate invariant dosages 0.3,...,0.3,...
  return(invisible(NULL))
}




## pipeslave, not exported but called from makefile as gtx:::pipeslave
pipeslave <- function(target) {
  target <- tryCatch(as.character(target)[1], error = function(e) "") # sanitize
  adir <- dirname(target) # analysis directory
  ofile <- file.path(adir, "options.R") # options file to read
  if (file.exists(ofile)) {
    source(ofile)
    ## should guarantee to set options gtx.usubjid, gtxpipe.genotypes,
    ## gtxpipe.threshold.MAF, and gtxpipe.threshold.Rsq
    ## but both have failsafe(?) fallbacks
  } else {
    warning('Options file "', ofile, '" does not exist')
  }

  ## gtxpipe.packages should have been cleaned up to match installed.packages by gtxpipe, but
  ## check here in case pipeslave is running in a different environment
  ## Note, any packages not available are silently ignored
  for (package in intersect(getOption('gtxpipe.packages', NULL), rownames(installed.packages()))) {
    if (!require(package, character.only = TRUE, quietly = TRUE)) {
      stop("gtxpipe: fatal error, unable to load package '", package, "'")
    }
  }

  usubjid <- getOption("gtx.usubjid", "USUBJID")
  tmp <- unlist(strsplit(basename(target), ".", fixed = TRUE))
  if (length(tmp) < 2 || !identical(tmp[length(tmp)], "done")) stop ('Bad target "', target, '"')
  job <- paste(tmp[-length(tmp)], collapse = ".") # job is basename with trailing .done stripped off
  rm(tmp)
    
  adataf <- file.path(adir, "analysis-dataset.csv") # analysis data filename
  if (!file.exists(adataf)) stop('Analysis dataset "', adataf, '" does not exist')
  adata0 <- scan(file.path(adir, "analysis-dataset.csv"), sep = "\n", character(0), quiet = TRUE)
  ## match call, error if more than 1
  mc <- which(substr(adata0, 1, 8) == '# call: ')
  stopifnot(identical(length(mc), 1L))
  qcall <- parse(text = substring(adata0[mc], 9)) # quoted call
  ## match transformations (can be any number) and build into dataframe
  mt <- which(substr(adata0, 1, 9) == '# where: ')
  if (length(mt) > 0) {
    gtxpipe.transformations <- do.call(rbind, lapply(strsplit(substring(adata0[mt], 10), ' <- ', fixed = TRUE), function(t1) {
      if (!identical(length(t1), 2L)) {
        stop('fatal error: "', t1, '" is not a transformation like "target <- fun"')
      }
      data.frame(targets = t1[1], fun = t1[2], stringsAsFactors = FALSE)
    }))
  } else {
    gtxpipe.transformations <- data.frame(targets = NULL, fun = NULL)
  }
  ## match cvlist, error if more than 1  FIXME should it be an error if there is no cvlist?
  cvlist <- which(substr(adata0, 1, 10) == '# cvlist: ')
  stopifnot(identical(length(cvlist), 1L))
  cvlist = tokenise.whitespace(substring(adata0[cvlist], 11))
  ## in read.csv should force some settings (stringsAsFactors = TRUE) just in case user options
  ## try to override
  adata <- read.csv(textConnection(adata0[substr(adata0, 1, 1) != "#"]),
                    stringsAsFactors = TRUE)
  adata[[usubjid]] <- as.character(adata[[usubjid]])
  ## apply transformations
  if (nrow(gtxpipe.transformations) > 0) {
    for (idx in 1:nrow(gtxpipe.transformations)) {
      target <- gtxpipe.transformations$targets[idx]
      if (target %in% names(adata)) stop("Transformation overwrites existing variable")
      message(target, " <- ", gtxpipe.transformations$fun[idx]) # debugging message
      adata[[target]] <- eval(parse(text = gtxpipe.transformations$fun[idx]), envir = adata)
    }
  }

  ## ? could sink to .done file, blockassoc should use message() not cat()
  res <- blockassoc(qcall = qcall, data = adata, 
                    minimac = file.path(getOption("gtxpipe.genotypes", "genotypes"), job),
                    threshold.MAF = getOption("gtxpipe.threshold.MAF", 0.01),
                    threshold.Rsq = getOption("gtxpipe.threshold.Rsq", 0.01),
                    threshold.pass = cvlist,
                    ## message specific to intended output file in subsequent write.table
                    message.begin = paste("blockassoc(", file.path(adir, basename(job)), ")", sep = ""))
  ## Require pvalue always, beta and SE if being used for interaction contrasts
  ## Consider writing some comments to this file (blockassoc should return character vector instead of printing messages)
  
  ## Determine which values to write to results file:
  if (getOption("gtxpipe.extended.output", TRUE)) {
    out.columns <- c("SNP", "beta", "SE", "pvalue","Al1","Al2","analysed.Freq1","analysed.Rsq")
  } else {
    out.columns <- c("SNP", "beta", "SE", "pvalue")
  }
  write.table(res[ , out.columns], 
              row.names = FALSE, quote = FALSE, sep = "\t",
              file = gzfile(file.path(adir, paste(job, "out.gz", sep = "."))))
  return(invisible(NULL))
}
        
pipetable <- function(data, filename, title,
                      mdata = data.frame(NULL), number,
                      width = 11.69, height = 8.27) {
  ## FIXME allow option for row names to be suppressed in pdf, requires pass through option to textgrid()

  
  ## number argument is the smallest allowable display number
  number.used <- c(0, as.integer(mdata$number))
  if (missing(number) || number %in% number.used) number <- max(number.used) + 1
  number <- gsub("^ ", "0", format(number, width = getOption("gtxpipe.display.numwidth", 2)))
  number2 <- paste(c(getOption("gtxpipe.display.section", NULL), number), collapse = ".")
  ## Updates to code above should be made here and in pipeplot()

  path <- file.path(getOption("gtxpipe.outputs", "."), paste(number, filename, sep = "_"))
  message(Sys.time(), ' gtxpipe Writing ', title, ' to "', path, '.[csv|pdf]"')
  dir.create(getOption("gtxpipe.outputs", "."), recursive = TRUE, showWarnings = FALSE) # FIXME throw error if fails

  write.csv(data,
            file = paste(path, "csv", sep = "."),
            row.names = TRUE) # can fail silently if directory does not exist

  pdf(width = width, height = height,
      file = paste(path, "pdf", sep = "."))
  scs <- split.screen(matrix(c(1/11.69, # left margin will be 1" *if* a4 portrait
                               1-1/11.69, # right margin
                               1/8.27, # top margin
                               1-1/8.27 # bottom margin
                               ), nrow = 1))
  screen(scs[1])
  par(family="mono", mar = c(0, 0, 4, 0) + 0.1)
  plot.new()
  plot.window(c(0, 1), c(0, 1))
  textgrid(data)
  mtext(paste("Protocol:", getOption("gtxpipe.protocol", "NA")), side = 3, line = 3, adj = 0)
  mtext("Page 1 of 1", side = 3, line = 3, adj = 1)
  mtext(paste("Population:", "ITT"), side = 3, line = 2, adj = 0) # hard coded to ITT
  mtext(paste("Data as of:", getOption("gtxpipe.date", "NA")), side = 3, line = 2, adj = 1)
  mtext(paste("Source table", number2), side = 3, line = 1, adj = 0.5)
  mtext(title, side = 3, line = 0, adj = 0.5)
  close.screen(scs); dev.off()

  return(rbind(mdata,
               data.frame(Source_File_Name = paste(paste(number, filename, sep = "_"), "pdf", sep = "."), 
                          Display_Category = "PHARMACOGENETIC", 
                          Display_Type = "TABLE", 
                          Display_Number = number2,
                          Title = title,
                          number = number, 
                          stringsAsFactors = FALSE)))
}

pipeplot <- function(plotfun, filename, title,
                     mdata = data.frame(NULL), number,
                     plotdata, plotpar, 
                     width = 8.27, height = 11.69) {

  ## number argument is the smallest allowable display number
  number.used <- c(0, as.integer(mdata$number))
  if (missing(number) || number %in% number.used) number <- max(number.used) + 1
  number <- gsub("^ ", "0", format(number, width = getOption("gtxpipe.display.numwidth", 2)))
  number2 <- paste(c(getOption("gtxpipe.display.section", NULL), number), collapse = ".")
  ## Updates to code above should be made here and in pipetable()
  
  path <- file.path(getOption("gtxpipe.outputs", "."), paste(number, filename, sep = "_"))
  dir.create(getOption("gtxpipe.outputs", "."), recursive = TRUE, showWarnings = FALSE) # FIXME throw error if fails
  
  if (all(capabilities(c("png", "cairo")))) {
    png(type = "cairo", filename = paste(path, "png", sep = "."),
        width = width*300, height = height*300, res = 300)
    message(Sys.time(), ' gtxpipe Plotting ', title, ' to "', path, '.png"')
  } else if (suppressMessages(requireNamespace("Cairo", quietly = TRUE))) {
    ## This generates an undeclared import warning with R CMD CHECK, not sure why.  FIXME
    Cairo::CairoPNG(file = paste(path, "png", sep = "."),
             width = width*300, height = height*300, res = 300)
    message(Sys.time(), ' gtxpipe Plotting ', title, ' to "', path, '.png"')
  } else {
    pdf(file = paste(path, "pdf", sep = "."),
        width = width, height = height)
    message(Sys.time(), ' gtxpipe Plotting ', title, ' to "', path, '.pdf"')
  }

  ## By default, all figures are A4 portrait with the TOP half devoted to plot metadata
  ## Should (?) work even if plotfun subsequently alters e.g. par(mfrow)
  scs <- split.screen(matrix(c(1/8.27, 1/8.27, # left margin
                               1-1/8.27, 1-1/8.27, # right margin
                               0.5+0.25/11.69, 1/11.69, # top margin
                               1-1/11.69, 0.5-0.25/11.69 # bottom margin
                               ), nrow = 2))
  
  screen(scs[1])
  oldpar <- par(family="mono", mar = c(0, 0, 4, 0) + 0.1)
  plot.new()
  plot.window(c(0, 1), c(0, 1))
  if (!missing(plotdata)) {
    textgrid(plotdata)
  } else {
    text(0.5, 0.5, "MISSING PLOT METADATA")
  }
  mtext(paste("Protocol:", getOption("gtxpipe.protocol", "NA")), side = 3, line = 3, adj = 0)
  mtext("Page 1 of 1", side = 3, line = 3, adj = 1)
  mtext(paste("Population:", "ITT"), side = 3, line = 2, adj = 0) # hard coded to ITT
  mtext(paste("Data as of:", getOption("gtxpipe.date", "NA")), side = 3, line = 2, adj = 1)
  mtext(paste("Source figure", number2), side = 3, line = 1, adj = 0.5)
  mtext(title, side = 3, line = 0, adj = 0.5)
  par(oldpar)
  
  screen(scs[2])
  if (!missing(plotpar)) oldpar <- do.call(par, as.list(plotpar)) else oldpar <- par()
  eval.parent(parse(text = plotfun)) # was: eval(..., envir = parent.frame())
  par(oldpar)
  close.screen(scs); dev.off()
  
  return(rbind(mdata,
               data.frame(Source_File_Name = paste(paste(number, filename, sep = "_"), "png", sep = "."), 
                          Display_Category = "PHARMACOGENETIC", 
                          Display_Type = "FIGURE", 
                          Display_Number = number2,
                          Title = title,
                          number = number,
                          stringsAsFactors = FALSE)))
}

pipebed <- function(snplist, flank, outfile) {
  #parse chromosomes and coordinates from snplist
  chr <- vapply(strsplit(snplist, ":"), function(ss) return(ss[1]), character(1))
  pos <- as.integer(vapply(strsplit(snplist, "[:_]"), function(ss) return(ss[2]), character(1)))
  #create dataframe with 4 standard bed columns
  cvbed <- data.frame(chr = chr, s = pos - flank - 1, e = pos + flank,SNP = snplist, stringsAsFactors=FALSE)
  #sort by chromosome and start (since all single base coordinates, no need to sort on end)
  cvbed <- cvbed[order(cvbed$chr,cvbed$s),]
  #re-label columns with standard bed headers
  colnames(cvbed) <- c("#chrom","start","end","SNP")
  #confirmed ok for replicate records in bed file, no need to uniquify for tabix
  suppressWarnings(write.table(cvbed, quote=FALSE, sep = "\t", row.names = FALSE, 
                               file = file.path(outfile),
                               append = FALSE))
}
  

  
tobyjohnson/gtx documentation built on Aug. 30, 2019, 8:07 p.m.