#' Description: Profiling preprocessing script
#'
#' Arguments:
#' @param dbuser MySQL username
#' @param dbpwd MySQL password
#' @param dbname MySQL database name
#' @param verbose monitor processing through intermediate messages
#' @param host host; needed with FTP connections
#' @param port port; needed with FTP connections
#' @param use.precalculated.phylogeny use precalculated phylogeny?
#' @param summarization.methods List summarization methods to be included in output. With HITChip frpa always used; with other chips rpa always used. Other options: "sum", "ave"
#' @param which.projects Optionally specify the projects to extract. All samples from these projects will be included.
#' @param all.samples Use all samples from the selected project by default? TRUE / FALSE
#' @param save.dir Output data folder
#' @param use.default.parameters use.default.parameters
#' @return Preprocessed data and parameters
#'
#' @export
#' @importFrom DBI dbDriver
#' @references See citation("microbiome")
#' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com}
#' @keywords utilities
preprocess.chipdata <- function (dbuser, dbpwd, dbname, verbose = TRUE, host = NULL, port = NULL, use.precalculated.phylogeny = NULL, summarization.methods = c("frpa", "sum"), which.projects = NULL, all.samples = TRUE, save.dir = ".", use.default.parameters = FALSE) {
## ask parameters or read from R-file
drv <- dbDriver("MySQL")
if (!(is.null(host) && is.null(port))) {
con <- dbConnect(drv, username = dbuser, password = dbpwd, dbname = dbname, host = host, port = port)
} else {
con <- dbConnect(drv, username = dbuser, password = dbpwd, dbname = dbname)
}
chip <- detect.chip(dbname)
params <- ReadParameters(con, which.projects = which.projects, all.samples = all.samples, chip = chip, save.dir = save.dir, use.default.parameters = use.default.parameters)
if (params$chip == "HITChip" && "rpa" %in% summarization.methods) {
warning("Frozen-RPA (fRPA) used instead of RPA for HITChip.")
summarization.methods <- unique(gsub("^rpa", "frpa", summarization.methods))
} else if (!params$chip == "HITChip" && "frpa" %in% summarization.methods) {
warning(paste("RPA used instead of frozen-RPA (fRPA) for", params$chip))
summarization.methods <- unique(gsub("frpa", "rpa", summarization.methods))
}
message("Get sample information matrix for the selected projects")
project.info <- fetch.sample.info(params$prj$projectName, chiptype = NULL,
dbuser = dbuser,
dbpwd = dbpwd,
dbname = dbname,
host = host,
port = port,
selected.samples = params$samples$sampleID)
# Let us require that all data is from a unique chip design; otherwise stop
if (length(unique(project.info$designID)) > 1) {
# message(table(project.info$designID, project.info$projectName))
stop("The selected projects are from different array versions or missing the array design info! Combining these is not allowed.")
}
if (nrow(project.info) == 0 || is.null(project.info)) {
warning("project.info is empty")
return(NULL)
}
message("Get probe-level data for the selected hybridisations")
tmp <- get.probedata(unique(project.info[["hybridisationID"]]), params$rm.phylotypes$oligos, dbuser, dbpwd, dbname, host = host, port = port)
fdat.orig <- tmp$data # features x hybs, original non-log scale
fdat.oligoinfo <- tmp$info # oligoinfo
# Annotations for selected hybridisations
fdat.hybinfo <- project.info[match(colnames(fdat.orig), project.info$hybridisationID), ]
rownames(fdat.hybinfo) <- colnames(fdat.orig)
message("Discard the hybs that contain only NAs")
onlyNA <- colMeans(is.na(fdat.orig)) == 1
naHybs <- colnames(fdat.orig)[onlyNA]
if(sum(onlyNA) > 0){
message("Removing the following hybs, because they contain only NAs:\n")
message(naHybs,"\n")
fdat.orig <- fdat.orig[, !onlyNA]
fdat.hybinfo <- fdat.hybinfo[!onlyNA, ]
}
##############################
## Between-array normalization
##############################
message("Minmax parameters hard-coded to standardize normalization")
# Using the parameters from HITChip atlas
# params$minmax.points <- c(30.02459, 132616.91371)
params$minmax.points <- c(30, 133000)
# "selected scaling for featurelevel data"
# Background correction after this step, if any.
# Order of normalization / bg correction was validated empirically.
# bg.adjust intentionally set to NULL
# bg correction done _after_ oligo summarization, if any (see next steps)
d.scaled <- ScaleProfile(fdat.orig, params$normalization, bg.adjust = NULL, minmax.points = params$minmax.points)
##################################
## GET OLIGO-PHYLOTYPE MAPPINGS
##################################
message("ReadPhylogeny")
ph <- ReadPhylogeny(params$phylogeny, params$rm.phylotypes,
params$remove.nonspecific.oligos,
dbuser = dbuser,
dbpwd = dbpwd,
dbname = dbname,
host = host,
port = port,
verbose = verbose,
chip = params$chip)
taxonomy.filtered <- ph$filtered
taxonomy.full <- ph$full
####################
## COMPUTE SUMMARIES
####################
message("Summarize probes into oligos and hybridisations into samples")
oligo.log10 <- summarize.rawdata(log10(d.scaled),
fdat.hybinfo,
fdat.oligoinfo = fdat.oligoinfo)
# Return to the original scale
oligo.abs <- matrix(10^oligo.log10, nrow = nrow(oligo.log10))
rownames( oligo.abs ) <- rownames( oligo.log10 )
colnames( oligo.abs ) <- colnames( oligo.log10 )
levels <- c("species", "L2", "L1")
if (params$chip %in% c("MITChip", "PITChip", "ChickChip")) {
levels <- c(levels, "L0")
if ("frpa" %in% summarization.methods) {
warning("fRPA summarization not implemented for PITChip or MITChip - using RPA instead!")
summarization.methods[[which(summarization.methods == "frpa")]] <- "rpa"
summarization.methods <- unique(summarization.methods)
}
}
params$summarization.methods <- summarization.methods
list(probedata = oligo.abs, taxonomy = taxonomy.filtered, taxonomy.full = taxonomy.full, naHybs = naHybs, params = params)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.