R/Functions_FeatureTable_analysis.R

Defines functions .calculateM mzMatch MseekAnova MosCluster foldChange featureCalcs analyzeTable

Documented in analyzeTable .calculateM featureCalcs foldChange MosCluster MseekAnova mzMatch

#' @include methods_MseekGraph.R

#' @title analyzeTable
#' @name analyzeTable
#'
#' @description Run a series of analyses on a feature table data.frame
#'
#' @param df a data.frame with numeric (intensity) values, rt and mz values
#' @param intensities the intensity column names, before normalization 
#' (without __norm suffix), will be automatically renamed if useNormalized.
#' @param groups named list of non-normalized intensity columns listed by group 
#' (as supplied by $anagroupnames of MseekFT objects), will be automatically 
#' renamed if useNormalized.
#' @param analyze character vector to select the analyses to be run: 
#' "Basic analysis", "clara_cluster", "t-test", "Peak shapes"
#' @param normalize normalze intensity columns
#' @param useNormalized use normalized values for analyses; will trigger 
#' normalize if there is no normalized data available for all selected 
#' intensity columns
#' @param logNormalized if TRUE, applies a log10 to intensity values after normalization
#' @param MSData a (named) list of xcmsRaw objects, e.g. generated by 
#' Mseek::loadRawM, OR an MSnbase OnDiskMSnExp object required for peak 
#' shape analysis
#' @param ppm ppm range for peak shape analysis
#' @param controlGroup control group for foldChange (part of Basic analysis) 
#' analysis (optional) 
#' @param numClusters number of clusters for clara_clusters analysis
#' @param mzMatchParam list of parameters passed to mass
#' @param workers number of workers to use for multithreaded analyses
#' 
#' @return \code{df} with additional columns (or overridden columns) as generated by the selected analyses
#'
#' @importFrom stats prcomp 
analyzeTable <- function(df, intensities,
                         groups,
                         analyze = c("Basic analysis", "clara_cluster",
                                     "t-test", "Peak shapes",
                                     "Fast peak shapes", "PCA features",
                                     "PCA samples", "mzMatch"), 
                         normalize = T,
                         useNormalized = T,
                         logNormalized = F,
                         MSData = NULL,
                         ppm = 5,
                         controlGroup = NULL,
                         numClusters = 2,
                         mzMatchParam = list("smid-db_pos.csv",
                                               ppm = 5,
                                               mzdiff = 0.001),
                         workers = 1){
  out <- list(errmsg = list(),
              history = list())
  
  if(normalize 
     || (useNormalized && !identical(grep("__norm",colnames(df), value = T), paste0(intensities,"__norm")))
  ){
    
    tryCatch({
    temp <- .withHistory(fun = "FTNormalize",
                 args = list(logNormalized = logNormalized),
                 longArgs = list(object = df),
                 addHistory = FALSE,
                 continueWithErrors = TRUE,
                 returnIfError = df)
    
    df <- temp$df
    out$history <- c(out$history, temp$history)
    if(length(temp$history@error)){
    out$errMsg[["normalize"]] <- temp$history@error[[1]]
    }

    },
    error = function(e){out$errMsg[["normalize"]] <- paste(e)})
    
  }
  
  if(useNormalized){
    tryCatch({
    intensities <- paste0(intensities,"__norm")
    groups <- lapply(groups, paste0, "__norm")
  },
  error = function(e){out$errMsg[["useNormalized"]] <- paste(e)})
  }
  
  if("Peak shapes" %in% analyze){
    tryCatch({
    if(is.null(MSData)){
      out$errmsg[["Peak shapes"]] <- "Peak shapes analysis was not performed because no MS data is loaded."
      
    }else if (!"mz" %in% colnames(df) || !"rt" %in% colnames(df)){
      out$errmsg[["Peak shapes"]] <- "Peak shapes analysis was not performed because table does not contain 'rt' and 'mz' columns."
    }else{
        
        
        
      inp <- bestgauss(
        rawdata= MSData,
        mz = data.frame(mzmin = df$mz-ppm*1e-6*df$mz, mzmax=df$mz+ppm*1e-6*df$mz),
        rt = data.frame(rtmin = df$rt-10, rtmax=df$rt+10),
        workers = workers,
        rnames = row.names(df)
      )
      
      df <- updateDF (inp, df)
      
    }
  },
error = function(e){out$errMsg[["Peak shapes"]] <- paste(e)})
    
  }
  
  if("Fast peak shapes" %in% analyze){
    tryCatch({
      if(is.null(MSData)){
        out$errmsg[["Fast peak shapes"]] <- "Peak shapes analysis was not performed because no MS data is loaded."
        
      }else if (!"mz" %in% colnames(df) || !"rt" %in% colnames(df)){
        out$errmsg[["Fast peak shapes"]] <- "Peak shapes analysis was not performed because table does not contain 'rt' and 'mz' columns."
      }else{
        inp <- data.frame("Fast_Peak_Quality" = fastPeakShapes(
          rawdata= MSData,
          mz = df$mz,
          ppm = ppm,
          rtw = data.frame(rtmin = df$rt-10, rtmax=df$rt+10),
          workers = workers
        ))
        
        df <- updateDF (inp, df)
        
      }
    },
    error = function(e){out$errMsg[["Fast peak shapes"]] <- paste(e)})
    
      
      
  }
  
  if("Basic analysis" %in% analyze){
    tryCatch({
      
    df <- updateDF(featureCalcs(df),
                   df)
    
    # if(is.null(groups)){
    #   out$errmsg[["Basic analysis"]] <- "Fold change analysis was not performed because no grouping information is loaded."
    #   
    # }else if (length(groups) == 1){
    #   out$errmsg[["Basic analysis"]] <- "Fold change analysis was not performed because there is only one sample group."
    # }else{
    df <- updateDF(foldChange(as.matrix(df[,intensities]),
                                 groups, ctrl = controlGroup),
                   df)
    # }
    
  },
error = function(e){out$errMsg[["Basic analysis"]] <- paste(e)})
  }
  
  if("mzMatch" %in% analyze){
    tryCatch({
      
      #remove previous mzMatch results
      remcols <-  grepl("mzMatch_", colnames(df)) | colnames(df) %in% c("mzMatches", "mzMatchError")
      
     
      df <- updateDF(do.call(mzMatch, c(list(df[,!remcols, drop = F]),mzMatchParam)),
                               
                     df[,!remcols, drop = F])
      
      
    },
    error = function(e){out$errMsg[["mzMatch"]] <- paste(e)})
  }
  
  if("t-test" %in% analyze){
    tryCatch({
    df <- updateDF(multittest(df = df[,intensities],
                                 groups,
                                 ttest = T,
                                 adjmethod = "bonferroni"),
                   df)
  },
error = function(e){out$errMsg[["t-test"]] <- paste(e)})
  }
  
  if("anova" %in% analyze){
    tryCatch({
      df <- updateDF(MseekAnova(df = df[,intensities],
                                groups),
                     df)
    },
    error = function(e){out$errMsg[["anova"]] <- paste(e)})
  }
  
  
  if("clara_cluster" %in% analyze){
    tryCatch({
      
    if(length(intensities) <2 ){
      out$errmsg[["clara_cluster"]] <- "Clara cluster analysis was not performed because there is only one sample."
    }else{
      
      mx <- sqrt(as.matrix(df[,intensities])) #using sqrt here to condense data values which may contain 0s
      
      
      df <- updateDF(MosCluster(x = mx / rowMeans(mx),
                                   k = numClusters,
                                   samples = 100), 
                     df)
    }
  },
error = function(e){out$errMsg[["clara_cluster"]] <- paste(e)})
  }
  
  if("PCA features" %in% analyze){
    tryCatch({
      
    pcamemx <- as.matrix(df[,intensities])
    
    pcamemx <- scale(pcamemx, center = T, scale = T)
    
    #PCA to separate features
    prin_comp <- prcomp(pcamemx)
    
    f <- function(x){sqrt(sum(x^2))}
    
    dists <- apply(prin_comp$x,1,f)
    prin_comp <- as.data.frame(prin_comp$x[,1:min(ncol(prin_comp$x),15)])
    prin_comp$vlength <- dists
    colnames(prin_comp) <- paste0("PCA__", colnames(prin_comp))
    
    df <-updateDF(prin_comp, df)
    
  },
error = function(e){out$errMsg[["PCA features"]] <- paste(e)})
  }
  
  if("PCA samples" %in% analyze){
    tryCatch({
      
    pcamemx <- t(as.matrix(df[,intensities]))
    
    pcamemx <- scale(pcamemx, center = T, scale = T)
    
    #PCA to separate features
    prin_comp <- prcomp(pcamemx)
    
    f <- function(x){sqrt(sum(x^2))}
    
    dists <- apply(prin_comp$x,1,f)
    
    prin_comp <- as.data.frame(prin_comp$x[,1:min(ncol(prin_comp$x),15)])
    prin_comp$vlength <- dists
    colnames(prin_comp) <- paste0("PCA__", colnames(prin_comp))
    
    out$PCA_samples <- prin_comp
    
  },
error = function(e){out$errMsg[["PCA samples"]] <- paste(e)})
  }
  
  out$df <- df
  return(out)
  
}



#' @title featureTableNormalize
#' 
#' @description Function to normalize data in a matrix.
#' 
#' 
#' @param mx a matrix of numeric (intensity) values
#' @param raiseZeros if not NULL, values of 0 will be raised to a level 
#' defined by a character string. "min" will raise all zeros to the lowest
#'  non-zero value in mx - normalization is done after this step so that the 
#'  zero-replacement values will ary across samples (which is needed for t-tests, etc.) 
#' @param log if not NULL, log10 will be applied to values in mx
#' @param normalize if not NULL, column values will be normalized by 
#' column averages
#' @param normalizationFactors vector of length(ncol(mx)) with factors to apply to each column for normalization.
#' @param threshold numeric(1). 
#' @param thresholdMethod if not NULL, removes all rows in mx in which 
#' no value is above threshold
#' 
#' @importFrom Biobase rowMax
#' 
#' @return \code{mx}, normalized so that all columns have the same mean value 
#' which is also the mean of all values in \code{mx} 
#' 
#' @rdname featureTableNormalize
#' 
#' @export
featureTableNormalize <- function (mx,
                      raiseZeros = NULL,#"min",
                      log = NULL,#"log10",
                      normalize = FALSE,#"columnMean",
                      normalizationFactors = NULL,
                      threshold = NULL,#0.1,
                      thresholdMethod = NULL#"rowMax"
                      ){
    
    if(!is.null(mx)){
    
    if(!is.null(raiseZeros)){
        #add switch for dirrerent options later
        mx[which(mx==0, arr.ind=T)]  <- raiseZeros
    }
    
    if(!is.null(log)){
        mx <- log10(mx)
    }
    
    if(!is.null(normalize) && normalize){
      if(length(normalizationFactors)){
        if(length(normalizationFactors) != ncol(mx)){stop("Length of normalizationFactors does not match number of intensity columns!")}
        
        mx <- t(t(mx) * normalizationFactors)
        
      }else{
        #calculate correction factor for each column
        mxmeans <- colMeans(mx)
        mxmeans <- mxmeans/mean(mxmeans)
        #apply it
        for (i in 1:ncol(mx)){
        mx[,i] <- mx[,i]/mxmeans[i]}
      }
    }
    
    if(!is.null(threshold) & !is.null(thresholdMethod)){
        mx <- mx[which(rowMax(mx)>threshold),]
    }
    return(mx)
    
    }
}

#' @param intensityCols selected columns (with intensity values)
#' @param logNormalized if TRUE, applies a log10 to intensity values after normalization
#' @rdname featureTableNormalize
#' @param object a data.frame
#' @export
setMethod("FTNormalize", "data.frame",
          function(object, intensityCols, logNormalized = FALSE){
              
              #normalize data and save it in matrix
              mx <- as.matrix(object[,intensityCols])
              mx <- featureTableNormalize(mx,
                                          raiseZeros =  min(mx[which(!mx==0, arr.ind=T)]))
              # 
              mx <- featureTableNormalize(mx, normalize = "colMeans")
              if(!is.null(logNormalized) && logNormalized){
                  mx <- featureTableNormalize(mx, log =  "log10")
              }
              
              #make copy of normalized intensities in active table df
              mx <- as.data.frame(mx)
              colnames(mx) <- paste0(colnames(mx),"__norm")
              object <- updateDF (mx,object)
              return(object)
          })


#' featureCalcs
#' 
#' 
#' Calculate simple parameters from feature table data.frame. 
#' Currently ONLY calculates the mass defect in ppm.
#' 
#' @param df a feature table data.frame with a column "mz"
#' @param massdef if true, calculate the mass defect for each entry in df from their mz values
#' 
#' @return \code{df} with an additional column, \code{massdefppm}
#' 
featureCalcs <- function(df,
                         massdef = T# calculate mass defect for each feature
){
  if(massdef){
    df$massdefppm <- ((df$mz-floor(df$mz))/df$mz)*1e6
  
    return(data.frame(massdefppm = df$massdefppm))
  }

    }




#' foldChange
#' 
#' calculate fold changes between grouped columns of a matrix
#' 
#' @param mx a matrix of positive numeric (intensity) values
#' @param groups named list of intensity columns listed by group (as supplied 
#' by $anagroupnames or $anagroupnames_norm of MseekFT objects)
#' @param ctrl character() naming the control group
#' @param calc currently has to be "mean", compare rowMeans of one group vs. 
#' rowMeans of other groups (and optionally rowMeans of controls only)
#' @param topgroup if TRUE, return group with highest intensity for each feature
#' @param maxFold if TRUE, make a column with maximum fold change between 
#' any two groups for each feature
#' @param foldMaxK if not NULL, make column with fold change of highest 
#' group value over foldMaxK largest group value.
#' @param foldmode if "complex", gives ratios between all groups (not recommended 
#' nor documented)
#' 
#' @importFrom Biobase rowMax rowMin rowQ
#' 
#' @return data.frame with columns representing fold change information for 
#' data in \code{mx}, see \code{Details}
#' 
#' @details 
#' \code{GX} in the following table means that this column is generated for each 
#' group in \code{groups}, with the group name as prefix.
#' Columns in the returned data.frame:
#' \itemize{
#' \item \code{maxint} maximum intensity value across all samples
#' \item \code{topgroup} the group that has the highes mean intensity
#' \item \code{maxfold} maximum fold change between any two group mean intensities
#' \item \code{maxfoldoverK} (where K is an integer) fold change of Max 
#' intensity over kth largest intensity across all samples
#' \item \code{GX__minInt} minimum intensity value within a group
#' \item \code{GX__meanInt} mean intensity value within a group
#' \item \code{GX__foldOverRest} fold change of mean of intensity values in 
#' this group over mean of intensities in all other groups
#' \item \code{GX__minFold} fold change of MINIMUM intensity value in 
#' this group over MAXIMUM intensity value across all other samples outside
#'  of this group
#' \item \code{GX__minFoldMean} fold change of MEAN intensity value in 
#' this group over MAXIMUM intensity value across all other samples outside
#'  of this group
#' \item \code{GX__foldOverCtrl} fold change of mean of intensity values in 
#' this group over mean of intensities in control group
#' \item \code{GX__minFoldOverCtrl} fold change of MINIMUM intensity value in 
#' this group over MAXIMUM intensity value in control group
#' \item \code{best_minFold} Highest minFold value found across all groups
#' \item \code{best_minFoldMean} Highest minFoldMean value found across all groups
#' \item \code{best_minFoldCtrl} Highest minFoldOverCtrl value found across all groups
#' }
#'    
foldChange <- function(mx,
                       groups, #
                       ctrl = NULL, #control group
                       calc = "mean", #use  rowMeans (vs. rowMean), rowMedians (vs. row), or rowMax to calculate upingroup and maxFold
                       topgroup = T, #return group with highest intensity for each feature
                       maxFold = T, #make a column with maximum fold change between any two groups for each feature
                       foldMaxK = 2, #make column with fold change of Max over kth largest
                       foldmode = "simple" #or "complex" (which gives ratios between all groups)
                       ){
    
  #needed to fix a problem where 0/0 results in NaN, which causes problems with rowQ functions
  removeNaNs <- function(mat, replacement = 1){
    mat[is.na(mat)] <- replacement
    return(mat)
  }
  
  if(length(ctrl) > 1){simpleError("Cannot use multiple control groups")}
  
  out <- data.frame(pholder=integer(nrow(mx)))   
  out$maxint <- if(ncol(mx)==1){mx}else{rowMax(mx)}
  
  if(length(groups) >1){
  if(calc == "mean"){
    
    #make rowMeans for each group
    rme <- function(cols,mx){return(if(length(cols)==1){mx[,cols]}else{rowMeans(mx[,cols])})}
    rmeans <- sapply(groups,rme, mx)
    rmax <- function(cols,mx){return(if(length(cols)==1){mx[,cols]}else{rowMax(mx[,cols])})}
    rmaxes <- sapply(groups,rmax, mx)
    rmin <- function(cols,mx){return(if(length(cols)==1){mx[,cols]}else{rowMin(mx[,cols])})}
    rmins <- sapply(groups,rmin, mx)
    
    out$topgroup <- colnames(rmeans)[apply(rmeans,1,which.max)]
    out$maxfold <- rowMax(rmeans)/rowMin(rmeans)
    
    if(!is.null(foldMaxK)){
      k <- if(foldMaxK >= ncol(rmeans)){1}else{ncol(rmeans) - foldMaxK + 1}
      out[[paste0("maxfoldover",foldMaxK)]] <- rowMax(rmeans)/rowQ(rmeans,k)
    }
    
    #make columns for each group, fold over all the other groups, and fold over ctrl if ctrl is defined
    
    for(i in colnames(rmeans)){
      
      #remove everything after double underscore (new default notation)
      #remove the XIC tag if it has only one underscore (old notation)
      #remove trailing underscores if any
      
      barename <- gsub("_$","",gsub("_XIC","",gsub("__(.*)","",i)))
      
      out[[paste0(barename,"__minInt")]] <- rmins[,i]
      out[[paste0(barename,"__meanInt")]] <- rmeans[,i]
      if(!is.null(foldmode) && foldmode=="complex"){
        out[[paste0(barename,"__foldOver_")]] <- rmeans[,i]/rmeans[,which(colnames(rmeans)!=i)]
        out[[paste0(barename,"__minFoldOver_")]] <- rmins[,i]/rmaxes[,which(colnames(rmaxes)!=i)]
      }else{
        out[[paste0(barename,"__foldOverRest")]] <- if(length(colnames(rmeans)) >2){
          unname(removeNaNs(rmeans[,i]/rowMeans(rmeans[,which(colnames(rmeans)!=i)])))
        }else{
          unname(rmeans[,i]/rmeans[,which(colnames(rmeans)!=i)])
        }
        out[[paste0(barename,"__minFold")]] <- if(length(colnames(rmeans)) >2){
          
          rowMin(removeNaNs(rmins[,i]/rmaxes[,which(colnames(rmaxes)!=i)]))  
          
          
        }else{
          rmins[,i]/rmaxes[,which(colnames(rmaxes)!=i)]
        }
        out[[paste0(barename,"__minFoldMean")]] <- if(length(colnames(rmeans)) >2){
          rowMin(removeNaNs(rmeans[,i]/rmaxes[,which(colnames(rmeans)!=i)]))  
        }else{
          rmeans[,i]/rmaxes[,which(colnames(rmeans)!=i)]
        }
      }
      
      if(!is.null(ctrl)){
        out[[paste0(barename,"__foldOverCtrl")]] <- rmeans[,i]/rmeans[,ctrl]
        out[[paste0(barename,"__minFoldOverCtrl")]] <- rmins[,i]/rmaxes[,ctrl]
      }
    }
  }
  
  
  if(is.null(foldmode) || !foldmode=="complex"){
    barenames <- gsub("_$","",gsub("_XIC","",gsub("__(.*)","",colnames(rmeans))))
    minFoldCols <- paste0(barenames,"__minFold")
    minFoldMeansCols <- paste0(barenames,"__minFoldMean")
    minFoldCtrlCols <- paste0(barenames,"__minFoldOverCtrl")
    
    out$best_minFold <- rowMax(as.matrix(removeNaNs(out[,minFoldCols])))
    out$best_minFoldMean <- rowMax(as.matrix(removeNaNs(out[,minFoldMeansCols])))
    if(!is.null(ctrl)){
      out$best_minFoldCtrl <- rowMax(as.matrix(removeNaNs(out[,minFoldCtrlCols])))

    }
    
  }
  }
  return(out[,which(colnames(out)!="pholder")])
        
       
    }
    


#' multittest
#' 
#' 
#' Calculate per-row t-test between grouped columns of a data.frame.
#' 
#' 
#' @param df a data.frame with numeric (intensity) values
#' @param groups named list of intensity columns listed by group (as supplied by
#'  \code{$anagroupnames} or \code{$anagroupnames_norm} of \code{MseekFT} 
#'  objects)
#' @param ttest if TRUE, ttest will be calculated
#' @param adjmethod method to adjust p values (passed on to stats::p.adjust)
#' @param controlGroup name of the control group in \code{groups}. If NULL, all
#' groups will be compared against all samples outside the group.
#'  
#' @importFrom stats p.adjust t.test
#' 
#' @return a data.frame with same number of rows as df, with coluns as described in \code{details}
#'  
#' @details columns in the export data.frame. All columns are generated for 
#' each group defined in \code{groups},  where GX is replaced by the group name:
#' \itemize{
#' \item\code{GX__CV} Coefficient of variation (relative standard deviation (sd/mean)) within the group
#' \item\code{GX__pval} p value between this group and all samples in all other 
#' groups, as calculated by \code{\link[stats]{t.test}()}
#' \item\code{GX__pval_adj} p values, adjusted by the selected \code{adjmethod}
#'  using \code{\link[stats]{p.adjust}()}
#' }
#'  
#' @export
multittest <- function (df,
                    groups,
                    ttest=TRUE,
                    adjmethod='bonferroni',
                    controlGroup = NULL){
   # withProgress(message = 'Please wait!', detail = "calculating pvalues", value = 0.03,{

    out = data.frame(pholder= numeric(nrow(df)))
    #calculate parameters for each group
    for (n in c(1:length(groups))){
        
        i <- groups[[n]]
        
        #sdev <- apply(df[,i],1,sd)/apply(df[,i],1,mean)
        m <- as.matrix(df[,i])
        rowVars <- rowSums((m - rowMeans(m, na.rm=FALSE))^2, na.rm=FALSE) / (ncol(m) - 1) #https://stackoverflow.com/questions/55327096/r-tidyverse-calculating-standard-deviation-across-rows/61891777#61891777
        cv <- sqrt(rowVars) / rowMeans(m, na.rm=FALSE)
        
        cv[which(is.na(cv))] <-0 #if all data point in a line are 0, sd returns 0

    
        if(ttest && (!length(controlGroup) || !names(groups)[n] %in% controlGroup)){
          if(length(groups) >1){
            if(!length(controlGroup)){
            noni <- unlist(groups[-n])
            }else{
            noni <- unlist(groups[which(names(groups) %in% controlGroup)])
            if(!length(noni)){stop("The specified Control Group does not exist")}
            }
          
            pval <- apply(df[,c(i,noni)],1,function(x, i , noni){
              tryCatch({
                return(
                t.test(as.numeric(x[i]), as.numeric(x[noni]))[["p.value"]]
                )
              },
              error = function(e){
                print(e)
                return(NA)})
              
              
            }, i = i, noni = noni)
            
            padj <- p.adjust(pval, method = adjmethod)
        
        

        
        out[[paste0(names(groups)[n],"__pval")]] <- pval
        out[[paste0(names(groups)[n],"__pval_adj")]] <- padj
        }else{
         simpleError("Did not calculate p-value because only a single sample group is defined") 
        }
        }
        
      out[[paste0(names(groups)[n],"__CV")]] <- cv

    }

    return(out[,which(colnames(out) !="pholder")])
}


#' MosCluster
#' 
#' Cluster a matrix
#'  
#' @param method name of a function, defaults to "clara" (from the cluster package)
#' @param ... additional arguments to the clustering method
#' 
#' @return a data.frame with one column, called \code{cluster__METHODNAME}, 
#' where METHODNAME is the applied \code{method}
#'
#' @importFrom cluster clara
#' 
#' @export
MosCluster <- function(method = "clara",
                       ...){
  #requireNamespace("cluster")
  res <- do.call(paste0(method), list(...))
  
  res <- data.frame(col1 = res$clustering)
  colnames(res) <- paste0("cluster__",method)
  
  return(res)
}    

#' MseekAnova
#' 
#' Calculate per-row one-way ANOVA between grouped columns of a data.frame. 
#' NOTE: Equal variance is not assumed (uses stats::oneway.test)
#' Returns NaN in cases where one group has all equal values (no variance), 
#' 
#' @param df a data.frame with numeric (intensity) values
#' @param groups named list of intensity columns listed by group 
#' (as supplied by $anagroupnames or $anagroupnames_norm of MseekFT objects)
#' @param adjmethod method to adjust p values (passed on to stats::p.adjust)
#' 
#' @return a data.frame with one column, called \code{ANOVA__pvalue}
#' 
#' @importFrom stats oneway.test
#'  
MseekAnova <- function(df, groups, adjmethod = "bonferroni"){
  
  ints <- df[,unlist(groups)]  
  
  assigngroups <- lapply(groups, match, colnames(ints))
  
  grp <- character(ncol(ints))
  
  for(i in seq(length(assigngroups))){
    
    grp[assigngroups[[i]]] <- names(groups)[i]
    
  }
  
  grp <- as.factor(grp)
  
  pvals <- apply(ints,1,function(x,grp){
    oneway.test(x ~ grp, var.equal = FALSE)$p.value
    
  }, grp)
  
  return(data.frame(ANOVA_pvalue = pvals,
                    ANOVA_pvalue_adj = p.adjust(pvals, method = adjmethod)))
  
}

#' mzMatch
#' 
#' Find m/z matches for each item in a Feature Table in a database.
#'
#' @param df data.frame that contains a mz column
#' @param db data.frame that contains the compound database and at least 
#' columns mz and id. Can also be a character vector of .csv file paths 
#' (will be combined into a single list for the search, with duplicate 
#' lines removed)
#' @param ppm ppm mz tolerance
#' @param mzdiff maximum mz difference. NOTE: either ppm OR mzdiff 
#' condition has to be met
#'
#' @return a data.frame with the same number of rows as \code{df} and columns
#'  as described in \code{details}
#'
#' @details columns in the resulting data.frame contain hits for each entry 
#' (row) in \code{df} :
#' \itemize{
#' \item\code{mzMatches} identity of the seach hits in \code{db}, taken from 
#' the \code{id} column of each \code{db}
#' \item\code{mzMatchError} ppm error, based on difference between db$mz and df$mz for each search hit.
#'}
#' All other columns defined in any \code{db} file are added as well with 
#' the prefix \code{mzMatch_}. If there are multiple hits across the \code{db}
#'  files, they will be separated by "|" within each column (including 
#'  \code{mzMatches} and \code{mzMatchError})
#'
#' @importFrom data.table rbindlist
#' 
#' @examples 
#' testdb <- read.csv(system.file("extdata", "examples", "example_projectfolder",
#'  "mini_example_features.csv", package = "Metaboseek"))
#' testdb
#' 
#' matches <- mzMatch(df = testdb, db = c(system.file("db", "smid-db_pos.csv", package = "Metaboseek")))
#' 
#' updateDF(matches,testdb)
#'
#' @export
mzMatch <- function(df, db, ppm = 5, mzdiff = 0.001){
  
  if(!is.data.frame(db) && !is.null(db) && all(file.exists(db))){
    
    db <- as.data.frame(rbindlist(lapply(db, data.table::fread, sep = ","), fill = T))
    
  }
  
  db <- db[!duplicated.data.frame(db),]
  
  db <- db[db$mz > min(df$mz) & db$mz < max(df$mz),]
  
  resdf <- data.frame(mzMatches = character(nrow(df)),
                      mzMatchError = character(nrow(df)),
                      stringsAsFactors = F)
  
  for(n in colnames(db)[!colnames(db) %in% c("id", "mz")]){
    resdf[[paste0("mzMatch_",n)]] <- character(nrow(df))
  }
  
  if(nrow(db) ==0){ return(resdf)}
  
  selmx <- abs(outer(df$mz, db$mz, "-"))
  
  sel <- which(selmx < mzdiff | selmx/df$mz < ppm*1e-6, arr.ind = T)
  
  for(n in colnames(db)[!colnames(db) %in% c("id", "mz")]){
    
    resdf[[paste0("mzMatch_",n)]][unique(sel[,1])] <- sapply(unique(sel[,1]), function(i, n, db, sel){
      return(paste(db[[n]][sel[,2][sel[,1]==i]], collapse = "|"))
    },n, db, sel)
    
  }
  
  
  resdf[["mzMatches"]][unique(sel[,1])] <- sapply(unique(sel[,1]), function(i, n, db, sel){
    return(paste(db[[n]][sel[,2][sel[,1]==i]], collapse = "|"))
  },n = "id", db, sel)
  
  resdf[["mzMatchError"]][unique(sel[,1])] <- sapply(unique(sel[,1]), function(i, n, db, sel){
    return(paste(round(df$mz[i] - db$mz[sel[,2][sel[,1]==i]], 5), collapse = "|"  ))
  },n = "mz", db, sel)
  
  
  
  return(resdf)  
  
}


#' .calculateM
#'
#' Calculates M value as detailed by Vandesompele et al. (2002) 
#' 
#' 
#' @author Aiden Kolodziej, Maximilian Helf
#' @param x a matrix which is expected to contain imputed, log2-transformed intensity values only
#' @param na.rm remove NA values for sd calculation?
#' @param ... Arguments passed to \code{bplapply()}
#' 
#' @references 
#' \enumerate{
#' \item Vandesompele J. et al (2002) Accurate normalization of real-time 
#' quantitative RT-PCR data by geometric averaging of multiple internal control genes.
#'  Genome Biol. 3(7):research0034.1, doi: \href{https://dx.doi.org/10.1186%2Fgb-2002-3-7-research0034}{10.1186/gb-2002-3-7-research0034}
#' }
#'
.calculateM <- function(x, na.rm = FALSE, ...){
  
  if(isRunning()){
    try({
      setProgress(value = 0, message = 'Calculating M...')
    })
  }
  
  withCallingHandlers({
  
 
    res <- bplapply(seq_len(nrow(x)), function(i, m = x, rm.na = na.rm){ #assignment changes to avoid recursive default argument reference error in bplapply
    
    m <- t(t(m) - m[i,])
    rowVars <- rowSums((m - rowMeans(m, na.rm=rm.na))^2, na.rm=rm.na) / (ncol(m) - 1) #https://stackoverflow.com/questions/55327096/r-tidyverse-calculating-standard-deviation-across-rows/61891777#61891777
    
    if(!i%%1000){message(1000)}
    
    sum(sqrt(rowVars))/ (nrow(m)-1)
    
  }, ...)
  },
  message = function(m){if(isRunning()){incProgress(amount = sum(as.numeric(unlist(strsplit(as.character(m$message), split = "\n"))))/nrow(x))} })
  
  return(unlist(res))
}
mjhelf/METABOseek documentation built on April 27, 2022, 5:13 p.m.