metaboprep: Metabolomics data preparation and processing pipeline

Documented in perform.metabolite.qc

#' perform metabolomics quality control
#'
#' This function is a wrapper function that performs the key quality controls steps on a metabolomics data set
#'
#' @param wdata the metabolite data matrix. samples in row, metabolites in columns
#' @param fmis defaulted at 0.2, this defines the feature missingness cutoff
#' @param smis defaulted at 0.2, this defines the sample missingness cutoff
#' @param tpa_out_SD defaulted at 5, this defines the number of standard deviation from the mean in which samples will be excluded for total peak area. Pass NA to this paramater to exclude this step.
#' @param outlier_udist defaulted at 5, this defines the number of interquartile range units from the median to define a value as an outlier
#' @param outlier_treatment defaulted to "leave_be". Choices are "leave_be", "winsorize", or "turn_NA", which defines how to treat outlier values prior to estimating principal components. "leave_be" will do nothing to ourlier values. "turn_NA" will turn outliers into NA and thus be median imputed for the purpose of the PCA. "winsorize" will turn NA values into the "winsorize_quantile" of all remaining (non-outlying) values at a feature. 
#' @param winsorize_quantile the quantile (0-1) to winzorise outlier values to, if "outlier_treatment" parameter set to "winsorize". Defaulted to 1, or the maximum value of all remaining (non-outlying) values at a feature.
#' @param tree_cut_height The height at which to cut the feature|metabolite dendrogram to identify "independent" features. tree_cut_height is 1-absolute(Spearman's Rho) for intra-cluster correlations.
#' @param PC_out_SD defaulted at '5', this defines the number of standard deviation from the mean in which samples will be excluded for principle components. NA is NOT an excepted paramater.
#' @param feature_colnames_2_exclude names of columns to be excluded from all analysis, such as for Xenobiotics. Pass NA to this parameter to exclude this step.
#' @param derived_colnames_2_exclude names of columns to exclude from all sample QC steps, including independent feature identification, which is used to generate the sample PCA.
#' 
#' @keywords metabolomics quality control
#' 
#' @return a list object of:
#' (1) "wdata" qc'd data matrix,
#' (2) "featuresumstats" a list with a (1:"table") data frame of feature summary statistics and a (2:"tree") hclust object 
#' (3) "pca" a list with a (1:"pcs") data frame of the top 10 PCs and a binary for outliers on the top 2 PCs at 3,4, and 5 SD from the mean, a (2:"varexp") vector of the variance explained for each PC, an estimate of the number of 'significant' PCs as determined by (3:"accelerationfactor") an acceleration factor and (4:"nsig_parrallel") a parrallel analysis, and finally (5:"prob_pca") the top 10 PCs derived by a probabilistic PC analysis.
#' (4) "exclusion_data" a matrix of exclusion summary statistics
#'
#' @export
#'
#' @examples
#' ## define a covariance matrix
#' cmat = matrix(1, 4, 4 )
#' cmat[1,] = c(1, 0.8, 0.6, 0.2)
#' cmat[2,] = c(0.8, 1, 0.7, 0.5)
#' cmat[3,] = c(0.6, 0.7, 1, 0.6)
#' cmat[4,] = c(0.2, 0.5, 0.6,1)
#' ## simulate some correlated data (multivariable random normal)
#' set.seed(1110)
#' d1 = MASS::mvrnorm(n = 250, mu = c(5, 45, 25, 15), Sigma = cmat )
#' set.seed(1010)
#' d2 = MASS::mvrnorm(n = 250, mu = c(5, 45, 25, 15), Sigma = cmat )
#' ## simulate some random data
#' d3 = sapply(1:20, function(x){ rnorm(250, 40, 5) })
#' ## define the data set
#' ex_data = cbind(d1,d2,d3)
#' rownames(ex_data) = paste0("ind", 1:nrow(ex_data))
#' colnames(ex_data) = paste0("var", 1:ncol(ex_data))
#' ## add in some missingness
#' ex_data[sample(1:length(ex_data), 450)] = NA
#' ## add in some technical error to two samples
#' m = apply(ex_data, 2, function(x){ mean(x, na.rm = TRUE) })
#' ex_data[c(1,10), ] = ex_data[1, ] + (m*0.00001) 
#' 
#' ## run the quality control
#' example_qc = perform.metabolite.qc(ex_data)
#' 
#' ## the filtered data is found in 
#' dim(example_qc$wdata)
#' example_qc$wdata[1:5, 1:5]
#' ## a data frame of summary statistics
#' head( example_qc$featuresumstats$table )
#' ## a hclust dendrogram can be plotted
#' plot( example_qc$featuresumstats$tree , hang = -1)
#' abline(h = 0.5, col = "red", lwd = 1.5)
#' ## (median imputed) PCs for all samples can be plotted
#' pcol = c("blue","red")[ as.factor( example_qc$pca$pcs[, "PC1_5_SD_outlier"] )]
#' plot(example_qc$pca$pcs[,"PC1"], example_qc$pca$pcs[,"PC2"], 
#'      pch = 21, cex = 1.5, bg = pcol, 
#'      xlab = paste0( "PC1: Var Exp = " , round(example_qc$pca$varexp[1], d = 4)*100, "%" ) , 
#'      ylab = paste0( "PC2: Var Exp = " , round(example_qc$pca$varexp[2], d = 4)*100, "%" ) )
#' ## A Scree plot can be generated by
#' plot( x = 1:length(example_qc$pca$varexp), 
#'       y = example_qc$pca$varexp, 
#'       type = "b", pch = 21, cex = 2, bg = "blue",
#'       xlab = "PC", ylab = "Variance Explained", main = "Scree Plot")
#' abline(v = example_qc$pca$accelerationfactor, col = "red", lwd = 2)
#' abline(v = example_qc$pca$nsig_parrallel, col = "green", lwd = 2)
#' ## Probablistic PCs for all samples can be plotted
#' plot(example_qc$pca$prob_pca[,1], example_qc$pca$prob_pca[,2], 
#'      pch = 21, cex = 1.5, bg = pcol, 
#'      xlab = "PC1", 
#'      ylab = "PC2" )
#' ## A summary of the exclusion statistics
#' example_qc$exclusion_data
#'
perform.metabolite.qc = function(wdata, 
                                 fmis = 0.2, 
                                 smis = 0.2, 
                                 tpa_out_SD = 5,
                                 outlier_udist = 5, 
                                 outlier_treatment = "leave_be", 
                                 winsorize_quantile = 1,         
                                 tree_cut_height = 0.5, 
                                 PC_out_SD = 5,
                                 feature_colnames_2_exclude = NA,
                                 derived_colnames_2_exclude = NA
                                 ){
  ###########################
  ## parameter check
  ###########################
  if( !outlier_treatment %in% c("winsorize","turn_NA","leave_be") ){
    stop( "The parameter 'outlier_treatment' must be one of 'winsorize', 'turn_NA', or 'leave_be'." )
  }
  
  ###########################
  ## remove and retain exclusions
  ###########################
  if( is.na( feature_colnames_2_exclude[1] ) == FALSE ){
    if( length(feature_colnames_2_exclude) > 0){
      cat( paste0("\t\t- QCstep: extracting exclusion features from analysis.\n") )
      w = which( colnames(wdata)  %in% feature_colnames_2_exclude )
      exdata = wdata[ , w ]
      wdata = wdata[ , -w ]
      
    }
  }
  
  ###########################
  ## Record of exclusions made
  ###########################
  exclusion_data = matrix(NA, nrow = 6, ncol = 1, dimnames = list( c("Extreme_sample_missingness","Extreme_feature_missingness","User_defined_sample_missingness","User_defined_feature_missingness","User_defined_sample_totalpeakarea","User_defined_sample_PCA_outliers"),c("count") ) )

  ###########################
  ## 1) estimate inital sample missingness
  ###########################
  cat( paste0("\t\t- QCstep: estimate initial sample missingness.\n") )
  if( !is.na(derived_colnames_2_exclude[1]) ){
    samplemis = sample.missingness(wdata, excludethesefeatures = derived_colnames_2_exclude)
    } else {
      samplemis = sample.missingness(wdata, excludethesefeatures = NA)
    }
  
  ###########################
  ## 2) exclude terrible samples (smis > 0.8)
  ###########################
  cat( paste0("\t\t- QCstep: exclude those sample with missingness >= 80%.\n") )
  if( ncol(samplemis) == 2){
    r = which(samplemis[,2] >= 0.8)
    } else {
      samplemis$sample_missingness_w_exclusions = samplemis$sample_missingness
      r = which(samplemis[,1] >= 0.8)    
    }
  exclusion_data[1,1] = length(r)
  ###
  if( length(r) > 0) { 
    cat( paste0("\t\t\t* ", length(r), " sample(s) excluded for extreme missingness.\n") )
    wdata = wdata[ -r, ]; samplemis = samplemis[-r,] 
  } else {
    cat( paste0("\t\t\t* 0 samples excluded for extreme missingness.\n") )
  }
  
  ###########################
  ## 3) estimate inital feature missingness
  ###########################
  cat( paste0("\t\t- QCstep: estimate initial feature missingness.\n") )
  if( ncol(samplemis) == 2){
    SM = samplemis[,2]
    } else {
      SM = samplemis[,1]
    }
  featuremis = feature.missingness(wdata = wdata)
  
  ###########################
  ## 4) exclude terrible features (fmis > 0.8)
  ###########################
  cat( paste0("\t\t- QCstep: exclude those features with missingness >= 80%.\n") )
  r = which(featuremis >= 0.8)
  exclusion_data[2,1] = length(r)
  ##
  if( length(r) > 0) {  
    cat( paste0("\t\t\t* ", length(r), " feature(s) excluded for extreme missingness.\n") )
    wdata = wdata[ , -r ]  
  } else {
    cat( paste0("\t\t\t* 0 features excluded for extreme missingness.\n") )
  }
  
  ###########################
  ## 5) re-estimate sample missingness
  ###########################
  cat( paste0("\t\t- QCstep: RE-estimate sample missingness.\n") )
  if( !is.na(derived_colnames_2_exclude[1]) ){
    samplemis = sample.missingness(wdata, excludethesefeatures = derived_colnames_2_exclude)
    } else {
      samplemis = sample.missingness(wdata, excludethesefeatures = NA)
    }
  
  ###########################
  ## 6) exclude samples defined by user ( smis >= 0.2 (default) )
  ###########################
  cat( paste0("\t\t- QCstep: exclude those sample with missingness >= ", smis*100,"%.\n") )
  if( ncol(samplemis) == 2){
    r = which(samplemis[,2] >= smis)
    } else {
      samplemis$sample_missingness_w_exclusions = samplemis$sample_missingness
      r = which(samplemis[,1] >= smis)    
    }
  exclusion_data[3,1] = length(r)
  ##
  if( length(r) > 0) { 
    cat( paste0("\t\t\t* ", length(r), " sample(s) excluded for user defined missingness.\n") )
    wdata = wdata[ -r, ]
    samplemis = samplemis[-r,] 
  } else {
    cat( paste0("\t\t\t* 0 samples excluded for user defined missingness.\n") )
  }
  
  ###########################
  ## 7) re-estimate feature missingness
  ###########################
  cat( paste0("\t\t- QCstep: RE-estimate feature missingness.\n") )
  if( ncol(samplemis) == 2){
    SM = samplemis[,2]
    } else {
      SM = samplemis[,1]
    }
  featuremis = feature.missingness(wdata = wdata)
  
  ###########################
  ## exclude features based on user defined feature missingness ( fmis > 0.2 (default) )
  ###########################
  cat( paste0("\t\t- QCstep: exclude those features with missingness >= ", fmis*100,"%.\n") )
  r = which(featuremis[,1] >= fmis)
  exclusion_data[4,1] = length(r)
  ##
  if( length(r) > 0) {  
    cat( paste0("\t\t\t* ", length(r), " feature(s) excluded for user defined missingness.\n") )
    wdata = wdata[ , -r ]  
  } else {
    cat( paste0("\t\t\t* 0 features excluded for user defined missingness.\n") )
  }
  
  
  ###########################
  ### 9) total peak area
  ###########################
  cat( paste0("\t\t- QCstep: estimate total abundance.\n") )
  if( !is.na(derived_colnames_2_exclude[1]) ){
    tpa = total.peak.area(wdata, feature_names_2_exclude = derived_colnames_2_exclude)
    } else {
      tpa = total.peak.area(wdata, feature_names_2_exclude = NA)    
    }
  
  ###########################
  ## if you want to filter on TPA
  ###########################
  if( is.na(tpa_out_SD) == FALSE){
    cat( paste0("\t\t- QCstep: exclude those features with TPA >= ", tpa_out_SD,"SD from the mean.\n") )
    s = sd( tpa[,1] ); m = mean( tpa[,1] )
    upper_sdout = m + (tpa_out_SD * s)
    lower_sdout = m - (tpa_out_SD * s)
    w = which(tpa[,1] >= upper_sdout | tpa[,1] <= lower_sdout )

    exclusion_data[5,1] = length(w)

    if( length(w) > 0 ){
      cat( paste0("\t\t\t* ", length(w), " sample(s) excluded for user defined TPA SD from the mean.\n") )
      wdata = wdata[ -w, ]
    } else {
    cat( paste0("\t\t\t* 0 samples excluded for user defined TPA SD from the mean.\n") )
    }
  } else {
    cat( paste0("\t\t\tYou have chosen NOT to apply a QC-filter on individuals total peak area.\n") )
    cat( paste0("\t\t\ttotal_peak_area_SD in the parameter file was set to NA.\n") )
  }
  
  ############################
  ## define PCA Data
  ############################
  pcadata = wdata

  ###########################
  ### 10) identify outliers
  ###########################
  if(outlier_treatment != "leave_be"){
    cat( paste0("\t\t- PCA preparation: identifying extreme | outlier values at each feature.\n")  )
    Omat = outlier.matrix(data = pcadata, nsd = outlier_udist, meansd = FALSE)  
    ## add row names to Omat
    rownames(Omat) =  rownames(pcadata)
    ## outlier count
    outier_sum = sum(Omat, na.rm = TRUE)
    nm = nrow(Omat) * ncol(Omat)
    cat( paste0("\t\t\t- A total of ", outier_sum," outlier values (", signif( (outier_sum/nm) * 100 , digits = 5) ,"% of all values) were identified in the data set.\n")   )
    
    ###########################
    ## 11) What to do with outliers?
    ###########################
    if(outlier_treatment == "turn_NA"){
      pcadata[Omat == 1] = NA
      cat(paste0("\t\t- All identified outliers were turned into NA.\n") )
    } else {
      if(outlier_treatment == "winsorize"){
        for(i in 1:ncol(pcadata)){
          ## identify any outliers
          w = which(Omat[,i] == 1)
          if(length(w)>0){
            ## estimate the 'Q' quantile value from all non-outlier samples
            quantile_value = quantile(pcadata[-w,i], probs = c(winsorize_quantile), na.rm = TRUE)
            ## set outliers to the quantile value
            pcadata[w,i] = quantile_value
          }
        }
        cat(paste0("\t\t- Outliers were winsorized to the ", winsorize_quantile * 100 ," quantile of remaining (non outlying) values.\n") )
      }
    }
  }

  ###########################
  ### 12) re-identify feature independence and PC outliers
  ###########################
  cat( paste0("\t\t- QCstep: re-identify independent features through correlation analysis and dendrogram clustering.\n") )
  cat( paste0("\t\t\t- using currently QCd data.\n") )
  
  ###########################
  ## re-estimate independent features using the qc-data to this point
  ###########################
  if( !is.na(derived_colnames_2_exclude[1]) ){
    featuresumstats = feature.sum.stats( wdata = pcadata, sammis = samplemis, tree_cut_height = tree_cut_height, outlier_udist = outlier_udist, feature_names_2_exclude = derived_colnames_2_exclude)
    } else {
      featuresumstats = feature.sum.stats( wdata = pcadata, sammis = samplemis, tree_cut_height = tree_cut_height, outlier_udist = outlier_udist,  feature_names_2_exclude = NA)
    }
  
  ###########################
  ## extract independent feature list
  ###########################
  w = which(featuresumstats$table$independent_features_binary == 1)
  ind_feature_names = rownames(featuresumstats$table)[w]
  cat( paste0("\t\t\t* ", length(ind_feature_names), " independent features identified.\n") )
  
  ###########################
  ## identify PC outliers
  ###########################
  cat( paste0("\t\t- QCstep: Perform Principle Componenet Analysis of currently QC'd data.\n") )
  PCs_outliers = pc.and.outliers(metabolitedata =  pcadata, 
                                 indfeature_names = ind_feature_names )
  
  ###########################
  ## extract PCs 1-2 or 1-number of Acceleration factor PCs 
  ###########################
  af = as.numeric( PCs_outliers[[3]] )
  if(af<2){
    pcs = PCs_outliers[[1]][, 1:2]  
  } else {
    pcs = PCs_outliers[[1]][, 1:af]  
  }
  
  ###########################
  ## perform exclusion on top PCs to ID outliers
  ###########################
  cat( paste0("\t\t- QCstep: Identify PC 1-",af," outliers >= +/-", PC_out_SD , "SD of the mean.\n") )
  if( is.na(PC_out_SD) == FALSE){
    outliers = outlier.matrix(pcs, nsd = PC_out_SD, meansd = TRUE)
    outliers = apply(outliers, 1, sum)
    w = which(outliers>0)

    exclusion_data[6,1] = length(w)

    if(length(w)>0){
      cat( paste0("\t\t\t* ", length(w), " samples excluded as PC outliers.\n") )
      wdata = wdata[-w, ]
    } else {
      cat( paste0("\t\t\t* 0 samples excluded as PC outliers.\n") )
      }
  } else {
    cat( paste0("\t\t\tYou have chosen NOT to apply a QC-filter on individuals based on their PC eigenvectors.\n") )
    cat( paste0("\t\t\tPC_outlier_udist in the parameter file was set to NA.\n") )
  }
  
  ###########################
  ## 13) put the exclusion features back
  ###########################
  if( exists("exdata") ){
    cat( paste0("\t\t- QCstep: placing the initially extracted exclusion features back into the data frame.\n") )
    ## match sample ids
    m = match(rownames(wdata), rownames(exdata))
    wdata = cbind(wdata, exdata[m, ])
  }
  
  ##
  return( list( wdata = wdata, featuresumstats = featuresumstats, pca = PCs_outliers, exclusion_data = exclusion_data) )
}