pippeline: Pipeline for processing high-dimensional multi-omics biobank datasets

# Helper functions

# Determine the name of the data objects.  @return A character vector
# with object names, or NULL in case of errors
getDataObjs <- function() {
  row <- which(options[, "Design"] == input$dsg &
               options[, "Location"] == input$loc &
               options[, "Material"] == input$mat &
               options[, "Analysis"] == input$ana)
  if (row < 1) {
    showNotification("No matching or unique data found. (Check file \"",
                     basics$optionsFile, "\"?) Error code #1.", type = "error",
                     duration = basics$msgDuration)
    return(NULL)
  }
  jObjs <- as.list(t(options[row, c("Obj1", "Obj2", "Obj3")]))  # joint objects and remove empty elements
  jObjs <- jObjs[jObjs != ""]  # remove empty elements
  jObjs <- lapply(jObjs, function(e) {
    l <- as.list(strsplit(e, ",")[[1]])
    names(l) <- c("ge", "nc")  # gene expressions and negative controls
    l
  })
  # check existance
  allObjsExist <- TRUE
  sapply(jObjs, function(e) {
    sapply(e, function(ee) {
      if (!exists(ee)) {
        showNotification(paste0("Object \"", ee, "\" does not exist. (Check file \"",
                                basics$optionsFile, "\" or load object.) Error code #2."),
                         type = "error", duration = NULL, id = "data")
        allObjsExist <<- FALSE
      }
    })
  })
  if (!allObjsExist)
    return(NULL)
  removeNotification("data")
  jObjs
}  # function getDataObjs

# Format a code line as a comment.  @param string: Code line @return
# string
cmt <- function(line = "") {
  paste("#'", line)
}

# Create a list object to store the attributes of a processing step.
# @param string: step label @param character vector: step explanation
# @param boolean: true if step is enabled, false otherwise @param
# function which generates a vector with code line strings @param
# arguments to the function call @return mixed: list if enabled, NULL
# otherwise
createStep <- function(label, explanation = c(), enabled = FALSE, generateCode = function() {
  c()
}, args = list()) {
  if (enabled)
    list(label = label,
         expl = explanation,
         enabled = enabled,
         code = do.call(generateCode, args)) else NULL
}

# Generate the documentation for all processing steps.  @param list
# object with processing pipeline attributes @return character vector:
# code lines
documentSteps <- function(pipeline) {
  doc <- c()
  for (step in pipeline) {
    if (!is.null(step) && step$enabled) {
      doc <- c(doc, "", cmt(paste("#", step$label)), cmt(step$expl),
               step$code)
    }
  }
  return(doc)
}  # function documentSteps

# Write the code that instantiates both the processing and documentation of a processing pipeline.
# @param list object with pipeline attributes
# @param string: filename to write the code to
writeScript <- function(pipeline, scriptFile) {
  pkgInfo = read.dcf(file.path('..', 'DESCRIPTION'), fields = c('Package', 'Version'))
  doc <- c(
    # YAML
    cmt('---'),
    cmt(paste0('title: "', basics$title, '"')),
    cmt(paste0('author: "', input$author, '"')),
    cmt(paste0('project name: "', input$projname, '"')),
    cmt(paste0('date: "', ts, '"')),
    cmt('---'),
    # header
    cmt(),
    cmt('This document describes how the associated biobank dataset was prepared.'),
    cmt(paste0('It was generated by ', basics$appName, ', an application included in the R package ', pkgInfo[ 1], ', version ', pkgInfo[ 2], '.')),
    cmt('See the R directory for documentation and implementation of statistical functions.'),
    cmt(),
    cmt('***'),
    cmt(),
    cmt('# Basic choices'),
    cmt(paste('* Processing description:', input$descr)),
    cmt(paste('* Design:', input$dsg)),
    cmt(paste('* Probe location:', input$loc)),
    cmt(paste('* Biological material:', input$mat)),
    cmt(paste('* Genomic analysis:', input$ana)),
    cmt(),
    cmt(paste0('Data source file names are read from file: ', basics$optionsFile, '.')),
    cmt(),
    cmt(paste0('Questionnaire object names are read from file: ', basics$questsFile, '.')),
    '',
    '# requirements',
    # suppressMessages(library())
    sprintf('library(%s)', pkgInfo[1] ),
    sprintf('library(arrayQualityMetrics)'),
    sprintf('library(limma)'),
    sprintf('library(lumi)'),
    sprintf('library(nlme)'),
    sprintf('library(sva)'),
    sprintf('library(illuminaHumanv3.db)'),
    sprintf('library(illuminaHumanv4.db)'),
    sprintf('library(lumiHumanIDMapping)'),
    sprintf('library(genefilter)'),
    sprintf('library(Biobase)'),
    # steps, incl. read and write
    documentSteps(pipeline),
    # footer
    cmt(),
    cmt('***')
  )
  writeLines( doc, scriptFile)
} # function writeScript

# Build a list object which keeps all pipeline details stored.
# @param list: parameters
# @return list: pipeline attributes
generatePipeline <- function(params) {
  numberOfRuns <- length(params$sourceObjs)
  idxSeq <- 1:numberOfRuns

  # details for mandatory processing steps step: reading
  generateCode <- function(gExprs) {
    code <- c(sprintf("data <- vector(\"list\",length=%d)", numberOfRuns),
              sprintf("data[[%d]] <- vector(\"list\",length=2)", idxSeq))
    if (sum(sapply(gExprs, exists)) == numberOfRuns)
      code <- c(code, sprintf("data[[%d]]$lumi <- get(\"%s\")", idxSeq, gExprs))
    else code <- c(code, "stop(\"Source dataset is non-existant. Error code #7.\")")
      code
  }
  readStep <- createStep("Datasets", "Reading in datasets", TRUE, generateCode,
                         list(unlist(lapply(params$sourceObjs, "[", "ge"))))

  # step: combination
  generateCode <- function() {
    if (numberOfRuns > 1) {
      args <- paste(sprintf("data[[%d]]", idxSeq), collapse = ",")
      c(sprintf("data <- BiocGenerics::combine(%s)", args))
    } else {
      c(sprintf("data <- data[[1]]$lumi"))
    }
  }
  combStep <- createStep("Combination", "Combining all runs", TRUE, generateCode)

  # step: anonymization
  generateCode <- function() {
    c("# simply overwrite all sample/person IDs",
      "colnames(data) <- 1:ncol(data)",
      "# remove all other identification traces",
      sprintf("m <- match(c(\"%s\"), rownames(data))",
      paste(as.character(basics$ids), collapse = "\",\"")),
      "m <- m[!is.na(m)]",
      "if(length(m)) data <- data[-m,]", "rm(m)")
  }
  anoStep <- createStep("Anonymization", "Overwriting all identifying labels and numbers",
                        TRUE, generateCode)

  # step: storage
  generateCode <- function(file) {
    c(sprintf("saveRDS(data,file=\"%s\")", file),
      cmt("# Cleaning environment"))
  }
  writeStep <- createStep("Storage", "Writing processed datasets", TRUE,
                          generateCode, list(params$targetFile))

  # step: outliers
  generateCode <- function() {
    code <- c(cmt(),
              cmt(paste("Description of outliers:",
              ifelse(input$outlierDescr != "", input$outlierDescr, "Not available"))))

    outlierFile <- file.path(nowacleanOutliers, input$outlierFile)
    if (!file.exists(outlierFile) || as.integer(file.access(outlierFile,  mode = 4)) < 0) {
      code <- c(code, "stop(\"Could not read outliers file. Error code #4.\")")
    } else {
      code <- c(code, sprintf("# filename: %s (using original file)", input$outlierFile),
                sprintf("outliers <- readRDS(\"%s\")", outlierFile),
                sprintf("m <- vector(\"list\",length=%d)", numberOfRuns),
                sprintf("m[[%1$d]] <- match(outliers,colnames(exprs(data[[%1$d]]$lumi)))",idxSeq),
                sprintf("m[[%1$d]] <- m[[%1$d]][!is.na(m[[%1$d]])]", idxSeq),
                sprintf("if(length(m[[%1$d]])) data[[%1$d]]$lumi <- data[[%1$d]]$lumi[,-m[[%1$d]]]", idxSeq),
                "rm(m,outliers)")
    }
    code
  }

  outlStep <- createStep("Outliers", "Removal of outliers", as.logical(input$outlierEnabled),
                         generateCode)

  # details for non-mandatory processing steps step: control transitions
  generateCode <- function() {
    cctFile <- file.path(cctransExcl, input$cctFile)
    if (!file.exists(cctFile) || as.integer(file.access(cctFile, mode = 4)) < 0) {
      c("stop(\"Could not read transitions file. Error code #4_1.\")")
    } else {
      c(sprintf("cctrans <- readRDS(\"%s\")", cctFile),
        sprintf("m <- vector(\"list\",length=%d)", numberOfRuns), 
        sprintf("m[[%1$d]] <- match(cctrans,colnames(exprs(data[[%1$d]]$lumi)))", idxSeq),
        sprintf("m[[%1$d]] <- m[[%1$d]][!is.na(m[[%1$d]])]", idxSeq),
        sprintf("if(length(m[[%1$d]])) data[[%1$d]]$lumi <- data[[%1$d]]$lumi[,-m[[%1$d]]]", idxSeq),
        "rm(m,cctrans)")
    }
  }
  exclStep <- createStep("Transitions", "Exclusion of control-case transitions",
                         as.logical(input$transEnabled), generateCode)

  # step: background correction
  generateCode <- function(nCtrls) {
    if (sum(sapply(nCtrls, exists)) == numberOfRuns)
      code <- c("# preparing the negative control probes",
                sprintf("data[[%d]]$negCtrl <- get(\"%s\")", idxSeq, nCtrls),
                sprintf("pIDs <- vector(\"list\",length=%d)", numberOfRuns),
                sprintf("pIDs[[%1$d]] <- data[[%1$d]]$negCtrl$ProbeID", idxSeq),
                sprintf("data[[%1$d]]$negCtrl <- t(data[[%1$d]]$negCtrl[,-c(1,2)])", idxSeq),
                sprintf("colnames(data[[%1$d]]$negCtrl) <- pIDs[[%1$d]]", idxSeq),
                "rm(pIDs)")
    else code <- c("stop(\"Negative control probes are non-existant. Error code #8.\")")
    code <- c(code,
              "# now correct",
              sprintf("data[[%d]]$lumi <- pippeline::performBackgroundCorrection(data[[%1$d]]$lumi,data[[%1$d]]$negCtrl)", idxSeq))
    code
  }
  bcorrStep <- createStep("Background correction", "Perform background correction and remove bad probes",
                          as.logical(input$corrEnabled), generateCode,
                          list(unlist(lapply(params$sourceObjs, "[", "nc"))))

  # step: probe filtering
  generateCode <- function() {
    c(sprintf("pValue <- %1.2f", input$pval),
      sprintf("pLimit <- %1.2f", input$plimit),
      "graph_data <- data",
      "data <- pippeline::filterData(data,pValue,pLimit)",
      "rm(pValue, pLimit)")
  }
  filtStep <- createStep("Probe filtering", "Filtering based on on pValue and presentLimit",
                         as.logical(input$filtEnabled), generateCode)

  # step: normalization
  generateCode <- function() {
    if (input$nmeth == "vstQuantileNorm") {
      code <- c("data <- pippeline::normalizeDataVstQ(data)")
    }
    if (input$nmeth == "ComBat") {
      code <- c(sprintf("data <- pippeline::normalizeDataComBat(data, \"%s\", \"%s\", \"%s\")",
                        input$batchTab, input$batchSampleID, input$batchVar))
    }
    code
  }
  normStep <- createStep("Normalization", "log2 transformation and quantile normalization or ComBat",
                         as.logical(input$normEnabled), generateCode)

  # step: extraction
  generateCode <- function() {
    if (as.logical(input$normEnabled))
      c("# have gene expressions matrix due to normalization: no need for extraction")
    else
      c("data <- exprs(data)")
  }
  extrStep <- createStep("Extraction", "Extraction of gene expression matrix from lumi object",
                         TRUE, generateCode)

  # step: conversion
  generateCode <- function() {
    if (as.logical(input$wantGenes)) {
      code <- c("# Note: The target data file contains genes.")
      if (!as.logical(input$normEnabled))
        code <- c(code,
                  cmt("Also, as the normalization is currently disabled, consider
                      that the mapping relies on taking the average of multiple probes.
                      You may thus want to enable normalization."))
      c(code, "data <- pippeline::mapToGenes(data)")
    } else
      c("# Note: The target data file contains probes.")
  }
  convStep <- createStep("Conversion", "Optional mapping of probes to genes",
                         TRUE, generateCode)

  # step: questionnaires
  generateCode <- function() {
    code <- c()
    if (exists(input$questObj)) {
      code <- c(code, "# ",
                sprintf("quest <- get(\"%s\")", input$questObj),
                "# determine sample ID matches",
                "m <- match(colnames(data), quest$labnr)",
                "# reduce variable set if necessary",
                sprintf("qvars <- c(\"%s\")", paste(as.character(input$questVars), collapse = "\",\"")),
                "# ad-hoc for column naming",
                "questmatr <- as.matrix(quest)",
                "if (length(qvars) > 1) {",
                "extrquest <- as.matrix(t(questmatr)[qvars,m])",
                "} else {",
                "extrquest <- as.matrix(t(t(questmatr)[qvars,m]))",
                "rownames(extrquest) <- qvars",
                "}",
                "",
                "# sew together matches",
                "data <- rbind(data,extrquest)",
                "questData <- as.data.frame(extrquest)",
                "rm(qvars,m,quest,questmatr,extrquest)")
    } else
      code <- c("stop(\"Questionnaire object non-existant. Error code #10.\")")
    if (as.logical(input$deathEnabled)) {
      code <- c(code,
                "# ",
                "# Adding death information",
                sprintf("deathInfo <- read.csv(file=\"%s\")", deathInfoFile),
                "md <- match(colnames(data), deathInfo$labnr)",
                "deathVars <- c(\"DODDT\",\"DC\")",
                "deathInfoMatr <- as.matrix(deathInfo)",
                "extr <- as.matrix(t(deathInfoMatr)[deathVars,md])",
                "data <- rbind(data,extr)",
                "rm(deathInfo,md,deathVars,deathInfoMatr,extr)")
    }
    return(code)
  }
  questStep <- createStep("Questionnaires", "Selecting variables from associated questionnaires",
                          (as.logical(input$questEnabled)), generateCode)
  # now concatenate all steps
  list(
    readStep,
    outlStep,
    exclStep,
    bcorrStep,
    combStep,
    filtStep,
    normStep,
    extrStep,
    convStep,
    questStep,
    anoStep,
    writeStep
  )
} # function generatePipeline


performInterSteps <- function(tempDataFile, tempScriptFile) {
  showNotification("Processing...Please wait a moment.", duration = NULL,
                   id = "waitInter")

  tryCatch({
    file.remove(tempDataFile)
    file.remove(tempScriptFile)
  }, warning = function(wrn) {
    NULL
  })

  pipeline <- generatePipeline(list(sourceObjs = sourceObjs(), targetFile = tempDataFile))

  tryCatch({
    writeScript(pipeline, tempScriptFile)
    # Hide cat messages from libs (check source params(echo, verbose))
    invisible(capture.output(source(tempScriptFile)))
    removeNotification("waitInter")
    file.remove(tempScriptFile)

    # showNotification('Successfull step execution', type = 'message',
    # duration = 4)
  }, error = function(err) {
    removeNotification("waitInter")
    showNotification("Could not perform step. Error code #11.", type = "error",
                     duration = NULL)
    showNotification("Error info: ", toString(err), type = "error",
                     duration = NULL)
  })
  rm(pipeline)
}


getTempDataset <- function(tempDataFile) {
  tryCatch({
    ds <- readRDS(tempDataFile)
    return(ds)
  }, error = function(err) {
    showNotification("Could not produce data/documentation. (Error while sourcing script.) Error code #14.",
                     type = "error", duration = NULL)
    showNotification("Error info: ", toString(err), type = "error",
                     duration = NULL)
    return(NULL)
  })
}

interStepAndUpdate <- function(tmpDSVec) {
  performInterSteps(tmpDSVec[1], tmpDSVec[2])
  ds <- getTempDataset(tmpDSVec[1])
  return(c(as.numeric(dim(ds)[1]), as.numeric(dim(ds)[2])))
}

setBtnInitState <- function() {
  shinyjs::hide("outlierDown")
  shinyjs::hide("outlierApply")
  shinyjs::hide("corrDown")
  shinyjs::hide("corrApply")
  shinyjs::hide("filtDown")
  shinyjs::hide("filtApply")
  shinyjs::hide("normDown")
  shinyjs::hide("normApply")
  shinyjs::hide("questApply")

  shinyjs::show("outlierNext")
  shinyjs::show("outlierSkip")
  shinyjs::show("corrNext")
  shinyjs::show("corrSkip")
  shinyjs::show("corrBack")
  shinyjs::show("filtNext")
  shinyjs::show("filtSkip")
  shinyjs::show("filtBack")
  shinyjs::show("normNext")
  shinyjs::show("normSkip")
  shinyjs::show("normBack")
  shinyjs::show("questNext")
  shinyjs::show("questSkip")
  shinyjs::show("questBack")
}

resetChosenValues <- function() {
  # reset all checkboxes
  reset("outlierEnabled")
  reset("outlierFile")
  reset("outlierFileReport")
  reset("transEnabled")
  reset("cctFile")
  reset("transFileReport")
  reset("corrEnabled")
  reset("filtEnabled")
  reset("normEnabled")
  reset("batchTab")
  reset("batchSampleID")
  reset("batchVar")
  reset("questEnabled")
  reset("deathEnabled")
  reset("wantGenes")
}

resetStepsAndInfo <- function() {
  # reset choices and btns
  resetChosenValues()
  setBtnInitState()

  updateTextAreaInput(session, "outlierDescr", value = "")

  # reset reactive values
  piplInfo$origInfoStr <<- notProcMsg
  piplInfo$currFeatures <<- notProcMsg
  piplInfo$currSamples <<- notProcMsg
  origPairs <<- NULL
}

# setWorkingTabs <- function(nextId) {
# for (id in tabsId) {
#   if (id != nextId || !(id %in% nextId)) js$disableTab(id) else js$enableTab(id)
#   }
# }

setTabInitialState <- function(tbId) {
  for (id in tbId) {
    js$disableTab(id)
  }
}

# Using for pkg='nowac'
getPkgDataSetNames <- function(pkg, prefix = "") {
  result <- c()
  ds_vector <- c(data(package = pkg))
  df_info <- as.data.frame(ds_vector$results)
  raw_ds_names <- as.character(df_info$Item)
  splitted_ds_names <- strsplit(raw_ds_names, " ")
  for (i in splitted_ds_names) {
    result <- c(result, i[1])
  }
  return(result)
}

getDSColNames <- function(dataset) {
  result <- c()
  ds <- get(dataset)
  result <- colnames(ds)
  return(result)
}

# For ad-hoc
removeLine <- function(line, file) {
  print("Removing")
  system(sprintf("sed -i '/%s/d' %s", line, file))
}

buildGraph <- function(dataset) {
  xvalues <- seq(from = 0, to = 1, by = 0.05)
  datacopy <- dataset
  prLimitVariate_Features <- c()
  pValVariate_Features <- c()

  for (i in xvalues) {
    tempd <- pippeline::filterData(datacopy, i, input$plimit)
    pValVariate_Features <- c(pValVariate_Features, dim(tempd)[1])
  }
  for (i in xvalues) {
    tempd <- pippeline::filterData(datacopy, input$pval, i)
    prLimitVariate_Features <- c(prLimitVariate_Features, dim(tempd)[1])
  }

  plot_data <- data.frame(pValVariate_Features, prLimitVariate_Features,
                          xvalues)

  require(ggplot2)
  result <- ggplot(plot_data, aes(x = xvalues)) +
            geom_line(aes(y = pValVariate_Features, colour = sprintf("const presentLimit = %1.2f, p-Value variate", input$plimit))) +
            geom_point(y = pValVariate_Features) +
            geom_line(aes(y = prLimitVariate_Features, colour = sprintf("const p-Value = %1.2f, presentLimit variate", input$pval))) +
            geom_point(y = prLimitVariate_Features) +
            labs(title = "p-Value/presentLimit vs feature number", x = "p-Value/presentLimit, %", y = "Number of features", color = "Legend")
  return(result)
}
uit-bdps/pippeline documentation built on May 22, 2019, 5:35 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
uit-bdps/pippeline
Pipeline for processing high-dimensional multi-omics biobank datasets

shiny/server/functions.R
In uit-bdps/pippeline: Pipeline for processing high-dimensional multi-omics biobank datasets

R Package Documentation

Browse R Packages

We want your feedback!

uit-bdps/pippeline Pipeline for processing high-dimensional multi-omics biobank datasets

shiny/server/functions.R In uit-bdps/pippeline: Pipeline for processing high-dimensional multi-omics biobank datasets

R Package Documentation

Browse R Packages

We want your feedback!

uit-bdps/pippeline
Pipeline for processing high-dimensional multi-omics biobank datasets

shiny/server/functions.R
In uit-bdps/pippeline: Pipeline for processing high-dimensional multi-omics biobank datasets