#' @include methods_MseekGraph.R
#' @title analyzeTable
#' @name analyzeTable
#'
#' @description Run a series of analyses on a feature table data.frame
#'
#' @param df a data.frame with numeric (intensity) values, rt and mz values
#' @param intensities the intensity column names, before normalization
#' (without __norm suffix), will be automatically renamed if useNormalized.
#' @param groups named list of non-normalized intensity columns listed by group
#' (as supplied by $anagroupnames of MseekFT objects), will be automatically
#' renamed if useNormalized.
#' @param analyze character vector to select the analyses to be run:
#' "Basic analysis", "clara_cluster", "t-test", "Peak shapes"
#' @param normalize normalze intensity columns
#' @param useNormalized use normalized values for analyses; will trigger
#' normalize if there is no normalized data available for all selected
#' intensity columns
#' @param logNormalized if TRUE, applies a log10 to intensity values after normalization
#' @param MSData a (named) list of xcmsRaw objects, e.g. generated by
#' Mseek::loadRawM, OR an MSnbase OnDiskMSnExp object required for peak
#' shape analysis
#' @param ppm ppm range for peak shape analysis
#' @param controlGroup control group for foldChange (part of Basic analysis)
#' analysis (optional)
#' @param numClusters number of clusters for clara_clusters analysis
#' @param mzMatchParam list of parameters passed to mass
#' @param workers number of workers to use for multithreaded analyses
#'
#' @return \code{df} with additional columns (or overridden columns) as generated by the selected analyses
#'
#' @importFrom stats prcomp
analyzeTable <- function(df, intensities,
groups,
analyze = c("Basic analysis", "clara_cluster",
"t-test", "Peak shapes",
"Fast peak shapes", "PCA features",
"PCA samples", "mzMatch"),
normalize = T,
useNormalized = T,
logNormalized = F,
MSData = NULL,
ppm = 5,
controlGroup = NULL,
numClusters = 2,
mzMatchParam = list("smid-db_pos.csv",
ppm = 5,
mzdiff = 0.001),
workers = 1){
out <- list(errmsg = list(),
history = list())
if(normalize
|| (useNormalized && !identical(grep("__norm",colnames(df), value = T), paste0(intensities,"__norm")))
){
tryCatch({
temp <- .withHistory(fun = "FTNormalize",
args = list(logNormalized = logNormalized),
longArgs = list(object = df),
addHistory = FALSE,
continueWithErrors = TRUE,
returnIfError = df)
df <- temp$df
out$history <- c(out$history, temp$history)
if(length(temp$history@error)){
out$errMsg[["normalize"]] <- temp$history@error[[1]]
}
},
error = function(e){out$errMsg[["normalize"]] <- paste(e)})
}
if(useNormalized){
tryCatch({
intensities <- paste0(intensities,"__norm")
groups <- lapply(groups, paste0, "__norm")
},
error = function(e){out$errMsg[["useNormalized"]] <- paste(e)})
}
if("Peak shapes" %in% analyze){
tryCatch({
if(is.null(MSData)){
out$errmsg[["Peak shapes"]] <- "Peak shapes analysis was not performed because no MS data is loaded."
}else if (!"mz" %in% colnames(df) || !"rt" %in% colnames(df)){
out$errmsg[["Peak shapes"]] <- "Peak shapes analysis was not performed because table does not contain 'rt' and 'mz' columns."
}else{
inp <- bestgauss(
rawdata= MSData,
mz = data.frame(mzmin = df$mz-ppm*1e-6*df$mz, mzmax=df$mz+ppm*1e-6*df$mz),
rt = data.frame(rtmin = df$rt-10, rtmax=df$rt+10),
workers = workers,
rnames = row.names(df)
)
df <- updateDF (inp, df)
}
},
error = function(e){out$errMsg[["Peak shapes"]] <- paste(e)})
}
if("Fast peak shapes" %in% analyze){
tryCatch({
if(is.null(MSData)){
out$errmsg[["Fast peak shapes"]] <- "Peak shapes analysis was not performed because no MS data is loaded."
}else if (!"mz" %in% colnames(df) || !"rt" %in% colnames(df)){
out$errmsg[["Fast peak shapes"]] <- "Peak shapes analysis was not performed because table does not contain 'rt' and 'mz' columns."
}else{
inp <- data.frame("Fast_Peak_Quality" = fastPeakShapes(
rawdata= MSData,
mz = df$mz,
ppm = ppm,
rtw = data.frame(rtmin = df$rt-10, rtmax=df$rt+10),
workers = workers
))
df <- updateDF (inp, df)
}
},
error = function(e){out$errMsg[["Fast peak shapes"]] <- paste(e)})
}
if("Basic analysis" %in% analyze){
tryCatch({
df <- updateDF(featureCalcs(df),
df)
# if(is.null(groups)){
# out$errmsg[["Basic analysis"]] <- "Fold change analysis was not performed because no grouping information is loaded."
#
# }else if (length(groups) == 1){
# out$errmsg[["Basic analysis"]] <- "Fold change analysis was not performed because there is only one sample group."
# }else{
df <- updateDF(foldChange(as.matrix(df[,intensities]),
groups, ctrl = controlGroup),
df)
# }
},
error = function(e){out$errMsg[["Basic analysis"]] <- paste(e)})
}
if("mzMatch" %in% analyze){
tryCatch({
#remove previous mzMatch results
remcols <- grepl("mzMatch_", colnames(df)) | colnames(df) %in% c("mzMatches", "mzMatchError")
df <- updateDF(do.call(mzMatch, c(list(df[,!remcols, drop = F]),mzMatchParam)),
df[,!remcols, drop = F])
},
error = function(e){out$errMsg[["mzMatch"]] <- paste(e)})
}
if("t-test" %in% analyze){
tryCatch({
df <- updateDF(multittest(df = df[,intensities],
groups,
ttest = T,
adjmethod = "bonferroni"),
df)
},
error = function(e){out$errMsg[["t-test"]] <- paste(e)})
}
if("anova" %in% analyze){
tryCatch({
df <- updateDF(MseekAnova(df = df[,intensities],
groups),
df)
},
error = function(e){out$errMsg[["anova"]] <- paste(e)})
}
if("clara_cluster" %in% analyze){
tryCatch({
if(length(intensities) <2 ){
out$errmsg[["clara_cluster"]] <- "Clara cluster analysis was not performed because there is only one sample."
}else{
mx <- sqrt(as.matrix(df[,intensities])) #using sqrt here to condense data values which may contain 0s
df <- updateDF(MosCluster(x = mx / rowMeans(mx),
k = numClusters,
samples = 100),
df)
}
},
error = function(e){out$errMsg[["clara_cluster"]] <- paste(e)})
}
if("PCA features" %in% analyze){
tryCatch({
pcamemx <- as.matrix(df[,intensities])
pcamemx <- scale(pcamemx, center = T, scale = T)
#PCA to separate features
prin_comp <- prcomp(pcamemx)
f <- function(x){sqrt(sum(x^2))}
dists <- apply(prin_comp$x,1,f)
prin_comp <- as.data.frame(prin_comp$x[,1:min(ncol(prin_comp$x),15)])
prin_comp$vlength <- dists
colnames(prin_comp) <- paste0("PCA__", colnames(prin_comp))
df <-updateDF(prin_comp, df)
},
error = function(e){out$errMsg[["PCA features"]] <- paste(e)})
}
if("PCA samples" %in% analyze){
tryCatch({
pcamemx <- t(as.matrix(df[,intensities]))
pcamemx <- scale(pcamemx, center = T, scale = T)
#PCA to separate features
prin_comp <- prcomp(pcamemx)
f <- function(x){sqrt(sum(x^2))}
dists <- apply(prin_comp$x,1,f)
prin_comp <- as.data.frame(prin_comp$x[,1:min(ncol(prin_comp$x),15)])
prin_comp$vlength <- dists
colnames(prin_comp) <- paste0("PCA__", colnames(prin_comp))
out$PCA_samples <- prin_comp
},
error = function(e){out$errMsg[["PCA samples"]] <- paste(e)})
}
out$df <- df
return(out)
}
#' @title featureTableNormalize
#'
#' @description Function to normalize data in a matrix.
#'
#'
#' @param mx a matrix of numeric (intensity) values
#' @param raiseZeros if not NULL, values of 0 will be raised to a level
#' defined by a character string. "min" will raise all zeros to the lowest
#' non-zero value in mx - normalization is done after this step so that the
#' zero-replacement values will ary across samples (which is needed for t-tests, etc.)
#' @param log if not NULL, log10 will be applied to values in mx
#' @param normalize if not NULL, column values will be normalized by
#' column averages
#' @param normalizationFactors vector of length(ncol(mx)) with factors to apply to each column for normalization.
#' @param threshold numeric(1).
#' @param thresholdMethod if not NULL, removes all rows in mx in which
#' no value is above threshold
#'
#' @importFrom Biobase rowMax
#'
#' @return \code{mx}, normalized so that all columns have the same mean value
#' which is also the mean of all values in \code{mx}
#'
#' @rdname featureTableNormalize
#'
#' @export
featureTableNormalize <- function (mx,
raiseZeros = NULL,#"min",
log = NULL,#"log10",
normalize = FALSE,#"columnMean",
normalizationFactors = NULL,
threshold = NULL,#0.1,
thresholdMethod = NULL#"rowMax"
){
if(!is.null(mx)){
if(!is.null(raiseZeros)){
#add switch for dirrerent options later
mx[which(mx==0, arr.ind=T)] <- raiseZeros
}
if(!is.null(log)){
mx <- log10(mx)
}
if(!is.null(normalize) && normalize){
if(length(normalizationFactors)){
if(length(normalizationFactors) != ncol(mx)){stop("Length of normalizationFactors does not match number of intensity columns!")}
mx <- t(t(mx) * normalizationFactors)
}else{
#calculate correction factor for each column
mxmeans <- colMeans(mx)
mxmeans <- mxmeans/mean(mxmeans)
#apply it
for (i in 1:ncol(mx)){
mx[,i] <- mx[,i]/mxmeans[i]}
}
}
if(!is.null(threshold) & !is.null(thresholdMethod)){
mx <- mx[which(rowMax(mx)>threshold),]
}
return(mx)
}
}
#' @param intensityCols selected columns (with intensity values)
#' @param logNormalized if TRUE, applies a log10 to intensity values after normalization
#' @rdname featureTableNormalize
#' @param object a data.frame
#' @export
setMethod("FTNormalize", "data.frame",
function(object, intensityCols, logNormalized = FALSE){
#normalize data and save it in matrix
mx <- as.matrix(object[,intensityCols])
mx <- featureTableNormalize(mx,
raiseZeros = min(mx[which(!mx==0, arr.ind=T)]))
#
mx <- featureTableNormalize(mx, normalize = "colMeans")
if(!is.null(logNormalized) && logNormalized){
mx <- featureTableNormalize(mx, log = "log10")
}
#make copy of normalized intensities in active table df
mx <- as.data.frame(mx)
colnames(mx) <- paste0(colnames(mx),"__norm")
object <- updateDF (mx,object)
return(object)
})
#' featureCalcs
#'
#'
#' Calculate simple parameters from feature table data.frame.
#' Currently ONLY calculates the mass defect in ppm.
#'
#' @param df a feature table data.frame with a column "mz"
#' @param massdef if true, calculate the mass defect for each entry in df from their mz values
#'
#' @return \code{df} with an additional column, \code{massdefppm}
#'
featureCalcs <- function(df,
massdef = T# calculate mass defect for each feature
){
if(massdef){
df$massdefppm <- ((df$mz-floor(df$mz))/df$mz)*1e6
return(data.frame(massdefppm = df$massdefppm))
}
}
#' foldChange
#'
#' calculate fold changes between grouped columns of a matrix
#'
#' @param mx a matrix of positive numeric (intensity) values
#' @param groups named list of intensity columns listed by group (as supplied
#' by $anagroupnames or $anagroupnames_norm of MseekFT objects)
#' @param ctrl character() naming the control group
#' @param calc currently has to be "mean", compare rowMeans of one group vs.
#' rowMeans of other groups (and optionally rowMeans of controls only)
#' @param topgroup if TRUE, return group with highest intensity for each feature
#' @param maxFold if TRUE, make a column with maximum fold change between
#' any two groups for each feature
#' @param foldMaxK if not NULL, make column with fold change of highest
#' group value over foldMaxK largest group value.
#' @param foldmode if "complex", gives ratios between all groups (not recommended
#' nor documented)
#'
#' @importFrom Biobase rowMax rowMin rowQ
#'
#' @return data.frame with columns representing fold change information for
#' data in \code{mx}, see \code{Details}
#'
#' @details
#' \code{GX} in the following table means that this column is generated for each
#' group in \code{groups}, with the group name as prefix.
#' Columns in the returned data.frame:
#' \itemize{
#' \item \code{maxint} maximum intensity value across all samples
#' \item \code{topgroup} the group that has the highes mean intensity
#' \item \code{maxfold} maximum fold change between any two group mean intensities
#' \item \code{maxfoldoverK} (where K is an integer) fold change of Max
#' intensity over kth largest intensity across all samples
#' \item \code{GX__minInt} minimum intensity value within a group
#' \item \code{GX__meanInt} mean intensity value within a group
#' \item \code{GX__foldOverRest} fold change of mean of intensity values in
#' this group over mean of intensities in all other groups
#' \item \code{GX__minFold} fold change of MINIMUM intensity value in
#' this group over MAXIMUM intensity value across all other samples outside
#' of this group
#' \item \code{GX__minFoldMean} fold change of MEAN intensity value in
#' this group over MAXIMUM intensity value across all other samples outside
#' of this group
#' \item \code{GX__foldOverCtrl} fold change of mean of intensity values in
#' this group over mean of intensities in control group
#' \item \code{GX__minFoldOverCtrl} fold change of MINIMUM intensity value in
#' this group over MAXIMUM intensity value in control group
#' \item \code{best_minFold} Highest minFold value found across all groups
#' \item \code{best_minFoldMean} Highest minFoldMean value found across all groups
#' \item \code{best_minFoldCtrl} Highest minFoldOverCtrl value found across all groups
#' }
#'
foldChange <- function(mx,
groups, #
ctrl = NULL, #control group
calc = "mean", #use rowMeans (vs. rowMean), rowMedians (vs. row), or rowMax to calculate upingroup and maxFold
topgroup = T, #return group with highest intensity for each feature
maxFold = T, #make a column with maximum fold change between any two groups for each feature
foldMaxK = 2, #make column with fold change of Max over kth largest
foldmode = "simple" #or "complex" (which gives ratios between all groups)
){
#needed to fix a problem where 0/0 results in NaN, which causes problems with rowQ functions
removeNaNs <- function(mat, replacement = 1){
mat[is.na(mat)] <- replacement
return(mat)
}
if(length(ctrl) > 1){simpleError("Cannot use multiple control groups")}
out <- data.frame(pholder=integer(nrow(mx)))
out$maxint <- if(ncol(mx)==1){mx}else{rowMax(mx)}
if(length(groups) >1){
if(calc == "mean"){
#make rowMeans for each group
rme <- function(cols,mx){return(if(length(cols)==1){mx[,cols]}else{rowMeans(mx[,cols])})}
rmeans <- sapply(groups,rme, mx)
rmax <- function(cols,mx){return(if(length(cols)==1){mx[,cols]}else{rowMax(mx[,cols])})}
rmaxes <- sapply(groups,rmax, mx)
rmin <- function(cols,mx){return(if(length(cols)==1){mx[,cols]}else{rowMin(mx[,cols])})}
rmins <- sapply(groups,rmin, mx)
out$topgroup <- colnames(rmeans)[apply(rmeans,1,which.max)]
out$maxfold <- rowMax(rmeans)/rowMin(rmeans)
if(!is.null(foldMaxK)){
k <- if(foldMaxK >= ncol(rmeans)){1}else{ncol(rmeans) - foldMaxK + 1}
out[[paste0("maxfoldover",foldMaxK)]] <- rowMax(rmeans)/rowQ(rmeans,k)
}
#make columns for each group, fold over all the other groups, and fold over ctrl if ctrl is defined
for(i in colnames(rmeans)){
#remove everything after double underscore (new default notation)
#remove the XIC tag if it has only one underscore (old notation)
#remove trailing underscores if any
barename <- gsub("_$","",gsub("_XIC","",gsub("__(.*)","",i)))
out[[paste0(barename,"__minInt")]] <- rmins[,i]
out[[paste0(barename,"__meanInt")]] <- rmeans[,i]
if(!is.null(foldmode) && foldmode=="complex"){
out[[paste0(barename,"__foldOver_")]] <- rmeans[,i]/rmeans[,which(colnames(rmeans)!=i)]
out[[paste0(barename,"__minFoldOver_")]] <- rmins[,i]/rmaxes[,which(colnames(rmaxes)!=i)]
}else{
out[[paste0(barename,"__foldOverRest")]] <- if(length(colnames(rmeans)) >2){
unname(removeNaNs(rmeans[,i]/rowMeans(rmeans[,which(colnames(rmeans)!=i)])))
}else{
unname(rmeans[,i]/rmeans[,which(colnames(rmeans)!=i)])
}
out[[paste0(barename,"__minFold")]] <- if(length(colnames(rmeans)) >2){
rowMin(removeNaNs(rmins[,i]/rmaxes[,which(colnames(rmaxes)!=i)]))
}else{
rmins[,i]/rmaxes[,which(colnames(rmaxes)!=i)]
}
out[[paste0(barename,"__minFoldMean")]] <- if(length(colnames(rmeans)) >2){
rowMin(removeNaNs(rmeans[,i]/rmaxes[,which(colnames(rmeans)!=i)]))
}else{
rmeans[,i]/rmaxes[,which(colnames(rmeans)!=i)]
}
}
if(!is.null(ctrl)){
out[[paste0(barename,"__foldOverCtrl")]] <- rmeans[,i]/rmeans[,ctrl]
out[[paste0(barename,"__minFoldOverCtrl")]] <- rmins[,i]/rmaxes[,ctrl]
}
}
}
if(is.null(foldmode) || !foldmode=="complex"){
barenames <- gsub("_$","",gsub("_XIC","",gsub("__(.*)","",colnames(rmeans))))
minFoldCols <- paste0(barenames,"__minFold")
minFoldMeansCols <- paste0(barenames,"__minFoldMean")
minFoldCtrlCols <- paste0(barenames,"__minFoldOverCtrl")
out$best_minFold <- rowMax(as.matrix(removeNaNs(out[,minFoldCols])))
out$best_minFoldMean <- rowMax(as.matrix(removeNaNs(out[,minFoldMeansCols])))
if(!is.null(ctrl)){
out$best_minFoldCtrl <- rowMax(as.matrix(removeNaNs(out[,minFoldCtrlCols])))
}
}
}
return(out[,which(colnames(out)!="pholder")])
}
#' multittest
#'
#'
#' Calculate per-row t-test between grouped columns of a data.frame.
#'
#'
#' @param df a data.frame with numeric (intensity) values
#' @param groups named list of intensity columns listed by group (as supplied by
#' \code{$anagroupnames} or \code{$anagroupnames_norm} of \code{MseekFT}
#' objects)
#' @param ttest if TRUE, ttest will be calculated
#' @param adjmethod method to adjust p values (passed on to stats::p.adjust)
#' @param controlGroup name of the control group in \code{groups}. If NULL, all
#' groups will be compared against all samples outside the group.
#'
#' @importFrom stats p.adjust t.test
#'
#' @return a data.frame with same number of rows as df, with coluns as described in \code{details}
#'
#' @details columns in the export data.frame. All columns are generated for
#' each group defined in \code{groups}, where GX is replaced by the group name:
#' \itemize{
#' \item\code{GX__CV} Coefficient of variation (relative standard deviation (sd/mean)) within the group
#' \item\code{GX__pval} p value between this group and all samples in all other
#' groups, as calculated by \code{\link[stats]{t.test}()}
#' \item\code{GX__pval_adj} p values, adjusted by the selected \code{adjmethod}
#' using \code{\link[stats]{p.adjust}()}
#' }
#'
#' @export
multittest <- function (df,
groups,
ttest=TRUE,
adjmethod='bonferroni',
controlGroup = NULL){
# withProgress(message = 'Please wait!', detail = "calculating pvalues", value = 0.03,{
out = data.frame(pholder= numeric(nrow(df)))
#calculate parameters for each group
for (n in c(1:length(groups))){
i <- groups[[n]]
#sdev <- apply(df[,i],1,sd)/apply(df[,i],1,mean)
m <- as.matrix(df[,i])
rowVars <- rowSums((m - rowMeans(m, na.rm=FALSE))^2, na.rm=FALSE) / (ncol(m) - 1) #https://stackoverflow.com/questions/55327096/r-tidyverse-calculating-standard-deviation-across-rows/61891777#61891777
cv <- sqrt(rowVars) / rowMeans(m, na.rm=FALSE)
cv[which(is.na(cv))] <-0 #if all data point in a line are 0, sd returns 0
if(ttest && (!length(controlGroup) || !names(groups)[n] %in% controlGroup)){
if(length(groups) >1){
if(!length(controlGroup)){
noni <- unlist(groups[-n])
}else{
noni <- unlist(groups[which(names(groups) %in% controlGroup)])
if(!length(noni)){stop("The specified Control Group does not exist")}
}
pval <- apply(df[,c(i,noni)],1,function(x, i , noni){
tryCatch({
return(
t.test(as.numeric(x[i]), as.numeric(x[noni]))[["p.value"]]
)
},
error = function(e){
print(e)
return(NA)})
}, i = i, noni = noni)
padj <- p.adjust(pval, method = adjmethod)
out[[paste0(names(groups)[n],"__pval")]] <- pval
out[[paste0(names(groups)[n],"__pval_adj")]] <- padj
}else{
simpleError("Did not calculate p-value because only a single sample group is defined")
}
}
out[[paste0(names(groups)[n],"__CV")]] <- cv
}
return(out[,which(colnames(out) !="pholder")])
}
#' MosCluster
#'
#' Cluster a matrix
#'
#' @param method name of a function, defaults to "clara" (from the cluster package)
#' @param ... additional arguments to the clustering method
#'
#' @return a data.frame with one column, called \code{cluster__METHODNAME},
#' where METHODNAME is the applied \code{method}
#'
#' @importFrom cluster clara
#'
#' @export
MosCluster <- function(method = "clara",
...){
#requireNamespace("cluster")
res <- do.call(paste0(method), list(...))
res <- data.frame(col1 = res$clustering)
colnames(res) <- paste0("cluster__",method)
return(res)
}
#' MseekAnova
#'
#' Calculate per-row one-way ANOVA between grouped columns of a data.frame.
#' NOTE: Equal variance is not assumed (uses stats::oneway.test)
#' Returns NaN in cases where one group has all equal values (no variance),
#'
#' @param df a data.frame with numeric (intensity) values
#' @param groups named list of intensity columns listed by group
#' (as supplied by $anagroupnames or $anagroupnames_norm of MseekFT objects)
#' @param adjmethod method to adjust p values (passed on to stats::p.adjust)
#'
#' @return a data.frame with one column, called \code{ANOVA__pvalue}
#'
#' @importFrom stats oneway.test
#'
MseekAnova <- function(df, groups, adjmethod = "bonferroni"){
ints <- df[,unlist(groups)]
assigngroups <- lapply(groups, match, colnames(ints))
grp <- character(ncol(ints))
for(i in seq(length(assigngroups))){
grp[assigngroups[[i]]] <- names(groups)[i]
}
grp <- as.factor(grp)
pvals <- apply(ints,1,function(x,grp){
oneway.test(x ~ grp, var.equal = FALSE)$p.value
}, grp)
return(data.frame(ANOVA_pvalue = pvals,
ANOVA_pvalue_adj = p.adjust(pvals, method = adjmethod)))
}
#' mzMatch
#'
#' Find m/z matches for each item in a Feature Table in a database.
#'
#' @param df data.frame that contains a mz column
#' @param db data.frame that contains the compound database and at least
#' columns mz and id. Can also be a character vector of .csv file paths
#' (will be combined into a single list for the search, with duplicate
#' lines removed)
#' @param ppm ppm mz tolerance
#' @param mzdiff maximum mz difference. NOTE: either ppm OR mzdiff
#' condition has to be met
#'
#' @return a data.frame with the same number of rows as \code{df} and columns
#' as described in \code{details}
#'
#' @details columns in the resulting data.frame contain hits for each entry
#' (row) in \code{df} :
#' \itemize{
#' \item\code{mzMatches} identity of the seach hits in \code{db}, taken from
#' the \code{id} column of each \code{db}
#' \item\code{mzMatchError} ppm error, based on difference between db$mz and df$mz for each search hit.
#'}
#' All other columns defined in any \code{db} file are added as well with
#' the prefix \code{mzMatch_}. If there are multiple hits across the \code{db}
#' files, they will be separated by "|" within each column (including
#' \code{mzMatches} and \code{mzMatchError})
#'
#' @importFrom data.table rbindlist
#'
#' @examples
#' testdb <- read.csv(system.file("extdata", "examples", "example_projectfolder",
#' "mini_example_features.csv", package = "Metaboseek"))
#' testdb
#'
#' matches <- mzMatch(df = testdb, db = c(system.file("db", "smid-db_pos.csv", package = "Metaboseek")))
#'
#' updateDF(matches,testdb)
#'
#' @export
mzMatch <- function(df, db, ppm = 5, mzdiff = 0.001){
if(!is.data.frame(db) && !is.null(db) && all(file.exists(db))){
db <- as.data.frame(rbindlist(lapply(db, data.table::fread, sep = ","), fill = T))
}
db <- db[!duplicated.data.frame(db),]
db <- db[db$mz > min(df$mz) & db$mz < max(df$mz),]
resdf <- data.frame(mzMatches = character(nrow(df)),
mzMatchError = character(nrow(df)),
stringsAsFactors = F)
for(n in colnames(db)[!colnames(db) %in% c("id", "mz")]){
resdf[[paste0("mzMatch_",n)]] <- character(nrow(df))
}
if(nrow(db) ==0){ return(resdf)}
selmx <- abs(outer(df$mz, db$mz, "-"))
sel <- which(selmx < mzdiff | selmx/df$mz < ppm*1e-6, arr.ind = T)
for(n in colnames(db)[!colnames(db) %in% c("id", "mz")]){
resdf[[paste0("mzMatch_",n)]][unique(sel[,1])] <- sapply(unique(sel[,1]), function(i, n, db, sel){
return(paste(db[[n]][sel[,2][sel[,1]==i]], collapse = "|"))
},n, db, sel)
}
resdf[["mzMatches"]][unique(sel[,1])] <- sapply(unique(sel[,1]), function(i, n, db, sel){
return(paste(db[[n]][sel[,2][sel[,1]==i]], collapse = "|"))
},n = "id", db, sel)
resdf[["mzMatchError"]][unique(sel[,1])] <- sapply(unique(sel[,1]), function(i, n, db, sel){
return(paste(round(df$mz[i] - db$mz[sel[,2][sel[,1]==i]], 5), collapse = "|" ))
},n = "mz", db, sel)
return(resdf)
}
#' .calculateM
#'
#' Calculates M value as detailed by Vandesompele et al. (2002)
#'
#'
#' @author Aiden Kolodziej, Maximilian Helf
#' @param x a matrix which is expected to contain imputed, log2-transformed intensity values only
#' @param na.rm remove NA values for sd calculation?
#' @param ... Arguments passed to \code{bplapply()}
#'
#' @references
#' \enumerate{
#' \item Vandesompele J. et al (2002) Accurate normalization of real-time
#' quantitative RT-PCR data by geometric averaging of multiple internal control genes.
#' Genome Biol. 3(7):research0034.1, doi: \href{https://dx.doi.org/10.1186%2Fgb-2002-3-7-research0034}{10.1186/gb-2002-3-7-research0034}
#' }
#'
.calculateM <- function(x, na.rm = FALSE, ...){
if(isRunning()){
try({
setProgress(value = 0, message = 'Calculating M...')
})
}
withCallingHandlers({
res <- bplapply(seq_len(nrow(x)), function(i, m = x, rm.na = na.rm){ #assignment changes to avoid recursive default argument reference error in bplapply
m <- t(t(m) - m[i,])
rowVars <- rowSums((m - rowMeans(m, na.rm=rm.na))^2, na.rm=rm.na) / (ncol(m) - 1) #https://stackoverflow.com/questions/55327096/r-tidyverse-calculating-standard-deviation-across-rows/61891777#61891777
if(!i%%1000){message(1000)}
sum(sqrt(rowVars))/ (nrow(m)-1)
}, ...)
},
message = function(m){if(isRunning()){incProgress(amount = sum(as.numeric(unlist(strsplit(as.character(m$message), split = "\n"))))/nrow(x))} })
return(unlist(res))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.